2018q3 第 17 週測驗題 (下)

測驗 `1`

考慮以下將整數轉換為浮點數的 C 程式:

#include <stdint.h>
#define I2F_MAX_BITS 15
#define I2F_MAX_INPUT ((1 << I2F_MAX_BITS) - 1)
#define I2F_SHIFT (24 - I2F_MAX_BITS)
uint32_t u2f(uint32_t input) {
    if ((input & I2F_MAX_INPUT) == 0) return 0;
    uint32_t exponent = 126 + I2F_MAX_BITS;
    uint32_t fraction = (input & I2F_MAX_INPUT) << I2F_SHIFT;
    for (int i = 0; i < I2F_MAX_BITS; i++) {
        if (fraction & 0x800000) break;
        fraction = fraction << 1;
        exponent = exponent - 1;           
    }
    return exponent << 23 | (fraction & 0x7fffff);
}

注意上述有效的 input 範圍僅在 0 到 2¹⁵ - 1 (請思考為何如此)，以下用 bitwse 改寫得到更快的實作：

#include <stdint.h>
static inline uint32_t ror32(uint32_t word, unsigned int shift) {
    return (word >> shift) | (word << (32 - shift));
}
/**
 * __fls - find last (most-significant) set bit in a long word
 * @word: the word to search
 */
#ifdef CONFIG_64BIT
#define BITS_PER_LONG 64
#else
#define BITS_PER_LONG 32
#endif
static inline unsigned long __fls(unsigned long word) {
    int num = BITS_PER_LONG - 1;
#if BITS_PER_LONG == 64
    if (!(word & (~0ul << 32))) {
        num -= 32;
        word <<= 32;
    }
#endif
    if (!(word & (~0ul << (BITS_PER_LONG - 16)))) {
        num -= 16;
        word <<= 16;
    }
    if (!(word & (~0ul << (BITS_PER_LONG - 8)))) {
        num -= 8;
        word <<= 8;
    }
    if (!(word & (~0ul << (BITS_PER_LONG - 4)))) {
        num -= 4;
        word <<= 4;
    }
    if (!(word & (~0ul << (BITS_PER_LONG - 2)))) {
        num -= 2;
        word <<= 2;
    }
    if (!(word & (~0ul << (BITS_PER_LONG - 1))))
        num -= 1;
    return num;                           
}

#define I2F_FRAC_BITS 23
#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1)
uint32_t u2f(uint32_t x) {
    if (!x)
        return 0;
    uint32_t msb = __fls(x);
    uint32_t fraction = ror32(x, (msb - I2F_FRAC_BITS) & Q1) & I2F_MASK;
    uint32_t exponent = (127 + msb) << I2F_FRAC_BITS;
    return Q2;
}

補上以上程式碼。

作答區

Q1 = ?

(a) 0xff
(b) 0x7f
(c) 0x3f
(d) 0x2f
(e) 0x1f
(f) 0x0f
(g) 0x1

Q2 = ?

(a) fraction | exponent
(b) fraction + exponent
(c) fraction ^ exponent
(d) exponent << 23 | (fraction & 0x7f)
(e) exponent << 23 | (fraction & 0x3f)
(f) exponent << 23 | (fraction & 0x1f)

延伸問題: 在 Linux 核心的 RAID 程式碼找到 i2f 的實作並且解說，需要涵蓋 x86_64 加速的 __fls 和相關效能分析

測驗 2

考慮以下 pthread_barrier 的替代實作:

#include <pthread.h>

#if 0 /* redefine pthread_barrier_* */
#define pthread_barrier_t barrier_t
#define pthread_barrier_init(B, A, N) (barrier_init((B), (N)), 0)
#define pthread_barrier_destroy(B) (barrier_destroy(B), 0)
#define pthread_barrier_wait barrier_wait
#endif

typedef struct barrier_t {
    unsigned count;
    unsigned total;
    pthread_mutex_t m;
    pthread_cond_t cv;
} barrier_t;

#define BARRIER_FLAG (1UL << 31)
void barrier_destroy(barrier_t *b) {       
    pthread_mutex_lock(&b->m);

    while (b->total > BARRIER_FLAG) {
        /* Wait until everyone exits the barrier */
        pthread_cond_wait(&b->cv, &b->m);
    }

    pthread_mutex_unlock(&b->m);

    pthread_cond_destroy(&b->cv);
    pthread_mutex_destroy(&b->m);
}

void barrier_init(barrier_t *b, unsigned count) {
    pthread_mutex_init(&b->m, NULL);
    pthread_cond_init(&b->cv, NULL);
    b->count = count;
    b->total = BARRIER_FLAG;
}

int barrier_wait(barrier_t *b) {
    pthread_mutex_lock(&b->m);

    while (b->total > BARRIER_FLAG) {
        /* Wait until everyone exits the barrier */
        pthread_cond_wait(&b->cv, &b->m);
    }

    /* Are we the first to enter? */
    if (b->total == BARRIER_FLAG)
        b->total = 0;

    B1;

    if (b->total == b->count) {
        b->total += BARRIER_FLAG - 1;
        pthread_cond_broadcast(&b->cv);

        pthread_mutex_unlock(&b->m);

        return PTHREAD_BARRIER_SERIAL_THREAD;
    } else {
        while (b->total < BARRIER_FLAG) {
            /* Wait until enough threads enter the barrier */
            pthread_cond_wait(&b->cv, &b->m);
        }

        B2;

        /* Get entering threads to wake up */
        if (b->total == BARRIER_FLAG)
            pthread_cond_broadcast(&b->cv);

        pthread_mutex_unlock(&b->m);

        return 0;
    }
}

pthread_barrier_ 系列函式其實只做而且只能做一件事：將先後到達的多個執行緒擋在同一個 barrier 前，直到所有執行緒到齊，然後撤下 barrier 同時放行。重要操作有:

init 函數: 負責指定要等待的執行緒個數;
wait 函數: 由每個執行緒主動呼叫，它通知 barreir 說「我到起跑線前了」。在 wait 執行尾聲，barrier 會檢查是否所有人都到 barrier 前了，若是，barrier 就消失，所有執行緒繼續執行下一行程式碼；如果不是，則所有已到 wait 的執行緒持續等待，剩下沒執行到 wait 的執行緒可繼續執行;
destroy: 釋放 init 申請的資源;

補完程式碼。

作答區

B1 = ?

(a) b->total–
(b) b->total++

B2 = ?

(a) b->total–
(b) b->total++

延伸問題: 比較上述 barrier 和內建 pthread_barrier_ 的實作效能差異，並且用 atomics 和 futex 改寫

2018q3 第 17 週測驗題 (下)

測驗 1

測驗 `1`