1
考慮以下將整數轉換為浮點數的 C 程式:
#include <stdint.h>
#define I2F_MAX_BITS 15
#define I2F_MAX_INPUT ((1 << I2F_MAX_BITS) - 1)
#define I2F_SHIFT (24 - I2F_MAX_BITS)
uint32_t u2f(uint32_t input) {
if ((input & I2F_MAX_INPUT) == 0) return 0;
uint32_t exponent = 126 + I2F_MAX_BITS;
uint32_t fraction = (input & I2F_MAX_INPUT) << I2F_SHIFT;
for (int i = 0; i < I2F_MAX_BITS; i++) {
if (fraction & 0x800000) break;
fraction = fraction << 1;
exponent = exponent - 1;
}
return exponent << 23 | (fraction & 0x7fffff);
}
注意上述有效的 input
範圍僅在 0 到 215 - 1 (請思考為何如此),以下用 bitwse 改寫得到更快的實作:
#include <stdint.h>
static inline uint32_t ror32(uint32_t word, unsigned int shift) {
return (word >> shift) | (word << (32 - shift));
}
/**
* __fls - find last (most-significant) set bit in a long word
* @word: the word to search
*/
#ifdef CONFIG_64BIT
#define BITS_PER_LONG 64
#else
#define BITS_PER_LONG 32
#endif
static inline unsigned long __fls(unsigned long word) {
int num = BITS_PER_LONG - 1;
#if BITS_PER_LONG == 64
if (!(word & (~0ul << 32))) {
num -= 32;
word <<= 32;
}
#endif
if (!(word & (~0ul << (BITS_PER_LONG - 16)))) {
num -= 16;
word <<= 16;
}
if (!(word & (~0ul << (BITS_PER_LONG - 8)))) {
num -= 8;
word <<= 8;
}
if (!(word & (~0ul << (BITS_PER_LONG - 4)))) {
num -= 4;
word <<= 4;
}
if (!(word & (~0ul << (BITS_PER_LONG - 2)))) {
num -= 2;
word <<= 2;
}
if (!(word & (~0ul << (BITS_PER_LONG - 1))))
num -= 1;
return num;
}
#define I2F_FRAC_BITS 23
#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1)
uint32_t u2f(uint32_t x) {
if (!x)
return 0;
uint32_t msb = __fls(x);
uint32_t fraction = ror32(x, (msb - I2F_FRAC_BITS) & Q1) & I2F_MASK;
uint32_t exponent = (127 + msb) << I2F_FRAC_BITS;
return Q2;
}
補上以上程式碼。
作答區
Q1 = ?
(a)
0xff(b)
0x7f(c)
0x3f(d)
0x2f(e)
0x1f(f)
0x0f(g)
0x1Q2 = ?
(a)
fraction | exponent(b)
fraction + exponent(c)
fraction ^ exponent(d)
exponent << 23 | (fraction & 0x7f)(e)
exponent << 23 | (fraction & 0x3f)(f)
exponent << 23 | (fraction & 0x1f)延伸問題: 在 Linux 核心的 RAID 程式碼找到 i2f
的實作並且解說,需要涵蓋 x86_64 加速的 __fls
和相關效能分析
測驗 2
考慮以下 pthread_barrier 的替代實作:
#include <pthread.h>
#if 0 /* redefine pthread_barrier_* */
#define pthread_barrier_t barrier_t
#define pthread_barrier_init(B, A, N) (barrier_init((B), (N)), 0)
#define pthread_barrier_destroy(B) (barrier_destroy(B), 0)
#define pthread_barrier_wait barrier_wait
#endif
typedef struct barrier_t {
unsigned count;
unsigned total;
pthread_mutex_t m;
pthread_cond_t cv;
} barrier_t;
#define BARRIER_FLAG (1UL << 31)
void barrier_destroy(barrier_t *b) {
pthread_mutex_lock(&b->m);
while (b->total > BARRIER_FLAG) {
/* Wait until everyone exits the barrier */
pthread_cond_wait(&b->cv, &b->m);
}
pthread_mutex_unlock(&b->m);
pthread_cond_destroy(&b->cv);
pthread_mutex_destroy(&b->m);
}
void barrier_init(barrier_t *b, unsigned count) {
pthread_mutex_init(&b->m, NULL);
pthread_cond_init(&b->cv, NULL);
b->count = count;
b->total = BARRIER_FLAG;
}
int barrier_wait(barrier_t *b) {
pthread_mutex_lock(&b->m);
while (b->total > BARRIER_FLAG) {
/* Wait until everyone exits the barrier */
pthread_cond_wait(&b->cv, &b->m);
}
/* Are we the first to enter? */
if (b->total == BARRIER_FLAG)
b->total = 0;
B1;
if (b->total == b->count) {
b->total += BARRIER_FLAG - 1;
pthread_cond_broadcast(&b->cv);
pthread_mutex_unlock(&b->m);
return PTHREAD_BARRIER_SERIAL_THREAD;
} else {
while (b->total < BARRIER_FLAG) {
/* Wait until enough threads enter the barrier */
pthread_cond_wait(&b->cv, &b->m);
}
B2;
/* Get entering threads to wake up */
if (b->total == BARRIER_FLAG)
pthread_cond_broadcast(&b->cv);
pthread_mutex_unlock(&b->m);
return 0;
}
}
pthread_barrier_
系列函式其實只做而且只能做一件事:將先後到達的多個執行緒擋在同一個 barrier 前,直到所有執行緒到齊,然後撤下 barrier 同時放行。重要操作有:
init
函數: 負責指定要等待的執行緒個數;wait
函數: 由每個執行緒主動呼叫,它通知 barreir 說「我到起跑線前了」。在 wait
執行尾聲,barrier 會檢查是否所有人都到 barrier 前了,若是,barrier 就消失,所有執行緒繼續執行下一行程式碼;如果不是,則所有已到 wait
的執行緒持續等待,剩下沒執行到 wait
的執行緒可繼續執行;destroy
: 釋放 init
申請的資源;補完程式碼。
作答區
B1 = ?
(a)
b->total–(b)
b->total++B2 = ?
(a)
b->total–(b)
b->total++延伸問題: 比較上述 barrier 和內建 pthread_barrier_
的實作效能差異,並且用 atomics 和 futex 改寫