Try   HackMD

2020q1 Homework3 (quiz4)

contributed by < hankchang805 >

tags : linux2020

quiz4 題目

測驗 1

考慮到以下 bitcpy 實作,允許開發者對指定的記憶體區域,逐 bit 進行資料複製:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

void bitcpy(void *_dest,      /* Address of the buffer to write to */
            size_t _write,    /* Bit offset to start writing to */
            const void *_src, /* Address of the buffer to read from */
            size_t _read,     /* Bit offset to start reading from */
            size_t _count)
{
    uint8_t data, original, mask;
    size_t bitsize;
    size_t read_lhs = _read & 7;
    size_t read_rhs = 8 - read_lhs;
    const uint8_t *source = _src + (_read / 8);
    size_t write_lhs = _write & 7; //KK1
    size_t write_rhs = 8 - write_lhs;
    uint8_t *dest = _dest + (_write / 8);

    static const uint8_t read_mask[] = {
        0x00, /*	== 0	00000000b	*/
        0x80, /*	== 1	10000000b	*/
        0xC0, /*	== 2	11000000b	*/
        0xE0, /*	== 3	11100000b	*/
        0xF0, /*	== 4	11110000b	*/
        0xF8, /*	== 5	11111000b	*/
        0xFC, /*	== 6	11111100b	*/
        0xFE, /*	== 7	11111110b	*/
        0xFF  /*	== 8	11111111b	*/
    };

    static const uint8_t write_mask[] = {
        0xFF, /*	== 0	11111111b	*/
        0x7F, /*	== 1	01111111b	*/
        0x3F, /*	== 2	00111111b	*/
        0x1F, /*	== 3	00011111b	*/
        0x0F, /*	== 4	00001111b	*/
        0x07, /*	== 5	00000111b	*/
        0x03, /*	== 6	00000011b	*/
        0x01, /*	== 7	00000001b	*/
        0x00  /*	== 8	00000000b	*/
    };

    while (_count > 0) {
        data = *source++;
        bitsize = (_count > 8) ? 8 : _count;
        if (read_lhs > 0) {
            data <<= read_lhs;
            if (bitsize > read_rhs)
                data |= (*source >> read_rhs);
        }

        if (bitsize < 8)
            data &= read_mask[bitsize];

        original = *dest;
        if (write_lhs > 0) {
            mask = read_mask[write_lhs];
            if (bitsize > write_rhs) {
                *dest++ = (original & mask) | (data >> write_lhs);
                original = *dest & write_mask[bitsize - write_rhs];
                *dest = original | (data << write_rhs);
            } else {
                if ((bitsize - write_lhs) > 0)
                    mask = mask | write_mask[8 - (bitsize - write_lhs)];
                *dest++ = (original & mask) | (data >> write_lhs);
            }
        } else {
            if (bitsize < 8)
                data = data | (original & write_mask[bitsize]);
            *dest++ = data;
        }

        _count -= bitsize; //KK2
    }
}

測試的程式碼:

static uint8_t output[8], input[8];

static inline void dump_8bits(uint8_t _data)
{
    for (int i = 0; i < 8; ++i)
        printf("%d", (_data & (0x80 >> i)) ? 1 : 0);
}
static inline void dump_binary(uint8_t *_buffer, size_t _length)
{
    for (int i = 0; i < _length; ++i)
        dump_8bits(*_buffer++);
}

int main(int _argc, char **_argv)
{
    memset(&input[0], 0xFF, sizeof(input));

    for (int i = 1; i <= 32; ++i) {
        for (int j = 0; j < 16; ++j) {
            for (int k = 0; k < 16; ++k) {
                memset(&output[0], 0x00, sizeof(output));
                printf("%2d:%2d:%2d ", i, k, j);
                bitcpy(&output[0], k, &input[0], j, i);
                dump_binary(&output[0], 8);
                printf("\n");
            }
        }
    }

    return 0;
}

延伸問題:

  1. 解釋上述程式運作的原理,改寫為更精簡且等效的程式碼;
  2. 考慮到 alignment 和 word size,改寫為執行效率更高的程式碼;
  3. 在 Linux 核心原始程式碼找出逐 bit 進行資料複製的程式碼,並解說相對應的情境;

提示: 觀察 linux/bitops.h 裡頭巨集在原始程式碼的運用狀況

解釋程式運作原理

void bitcpy(void *_dest,      /* Address of the buffer to write to */
            size_t _write,    /* Bit offset to start writing to */
            const void *_src, /* Address of the buffer to read from */
            size_t _read,     /* Bit offset to start reading from */
            size_t _count){
    uint8_t data, original, mask;
    size_t bitsize;
    size_t read_lhs = _read & 7;
    size_t read_rhs = 8 - read_lhs;
    const uint8_t *source = _src + (_read / 8);
    size_t write_lhs = _write & 7; //KK1
    size_t write_rhs = 8 - write_lhs;
    uint8_t *dest = _dest + (_write / 8);
    .....
    .....
}

在這段程式碼中可以看見 read_lhsread_rhs ,因為 _src 傳進來是 uint8_t 所以我們就以 8 bit 為一個單位來操作,&7 其實就是在 mod 8 ,算出來的 read_lhs 就是指要複製的那個 bit 左邊有幾個 bit , read_rhs 則反之;那 write 的部分也是如此

static const uint8_t read_mask[] = {
        0x00, /*	== 0	00000000b	*/
        0x80, /*	== 1	10000000b	*/
        0xC0, /*	== 2	11000000b	*/
        0xE0, /*	== 3	11100000b	*/
        0xF0, /*	== 4	11110000b	*/
        0xF8, /*	== 5	11111000b	*/
        0xFC, /*	== 6	11111100b	*/
        0xFE, /*	== 7	11111110b	*/
        0xFF  /*	== 8	11111111b	*/
    };

    static const uint8_t write_mask[] = {
        0xFF, /*	== 0	11111111b	*/
        0x7F, /*	== 1	01111111b	*/
        0x3F, /*	== 2	00111111b	*/
        0x1F, /*	== 3	00011111b	*/
        0x0F, /*	== 4	00001111b	*/
        0x07, /*	== 5	00000111b	*/
        0x03, /*	== 6	00000011b	*/
        0x01, /*	== 7	00000001b	*/
        0x00  /*	== 8	00000000b	*/
    };

從這邊可以看到 readwrite 的遮罩,舉例來說 read_mask[3] 代表要複製的位元長度為 3 bit ,所以就對前三個位元做 &1 運算至於其他部分就做 &0 ,如此一來便可以只保留前三位元的資料

    while (_count > 0) {
        data = *source++;
        bitsize = (_count > 8) ? 8 : _count;
        if (read_lhs > 0) {
            data <<= read_lhs;
            if (bitsize > read_rhs)
                data |= (*source >> read_rhs);
        }

        if (bitsize < 8)
            data &= read_mask[bitsize];

        original = *dest;
        if (write_lhs > 0) {
            mask = read_mask[write_lhs];
            if (bitsize > write_rhs) {
                *dest++ = (original & mask) | (data >> write_lhs);
                original = *dest & write_mask[bitsize - write_rhs];
                *dest = original | (data << write_rhs);
            } else {
                if ((bitsize - write_lhs) > 0)
                    mask = mask | write_mask[8 - (bitsize - write_lhs)];
                *dest++ = (original & mask) | (data >> write_lhs);
            }
        } else {
            if (bitsize < 8)
                data = data | (original & write_mask[bitsize]);
            *dest++ = data;
        }

        _count -= bitsize; //KK2
    }
}

這段程式碼在進行逐 bit 的資料複製, _count 代表的是總共要複製的資料長度 ( bit ) ,以 8 bit 為一段進行處理;考慮到我們所複製的位元資料需要位移所以必須利用上面算出來的 read_lhsread_rhs 做位元的調整再透過遮罩處理以得到要複製進去的資料;在 write 的部分也是如此。

改寫為等效且精簡的程式碼

目前程式的 if - else 敘述太多,減少 branch 指令是我目前想到的改進
TODO : 把程式碼補上來

測驗 2

vector 的操作:

int main()
{
    v(float, 3, vec1);
    v(int, 2, vec2, 13, 42);

    printf("pos(vec2,0)=%d, pos(vec2,1)=%d\n", vec_pos(vec2, 0),
           vec_pos(vec2, 1));
    vec_push_back(vec2, 88);
    vec_reserve(vec2, 100);
    printf("capacity(vec1)=%zu, capacity(vec2)=%zu\n", vec_capacity(vec1),
           vec_capacity(vec2));
    printf("pos(vec2,2)=%d\n", vec_pos(vec2, 2));

#define display(v)                               \
    do {                                         \
        for (size_t i = 0; i < vec_size(v); i++) \
            printf("%.2f ", vec_pos(v, i));      \
        puts(v.on_heap ? "heap" : "stack");      \
    } while (0)

    display(vec1);
    vec_push_back(vec1, 0.0);
    display(vec1);
    vec_push_back(vec1, 1.1);
    display(vec1);
    vec_push_back(vec1, 2.2);
    display(vec1);
    vec_push_back(vec1, 3.3);
    display(vec1);
    vec_push_back(vec1, 4.4);
    display(vec1);
    vec_push_back(vec1, 5.5);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);
    vec_pop_back(vec1);
    display(vec1);

    return 0;
}

程式碼本體如下:

#include <stddef.h>
#include <stdio.h>                                                                                                                                            
#include <stdlib.h>
#include <string.h>

/* vector with small buffer optimization */

#define STRUCT_BODY(type)                                                  \
    struct {                                                               \
        size_t size : 54, on_heap : 1, capacity : 6, flag1 : 1, flag2 : 1, \
            flag3 : 1;                                                     \
        type *ptr;                                                         \
    }

#define NEXT_POWER_OF_2(s) \
    (s&(s-1)) == 0         \ // VV1
        ? s                \
        : (size_t) 1 << (64 - __builtin_clzl(s)) /* next power of 2 */

#define v(t, s, name, ...)                                              \
    union {                                                             \
        STRUCT_BODY(t);                                                 \
        struct {                                                        \
            size_t filler;                                              \
            t buf[NEXT_POWER_OF_2(s)];                                  \
        };                                                              \
    } name __attribute__((cleanup(vec_free))) = {.buf = {__VA_ARGS__}}; \
    name.size = sizeof((__typeof__(name.buf[0])[]){0, __VA_ARGS__}) /   \
                    sizeof(name.buf[0]) - 1;                            \
    name.capacity = sizeof(name.buf) / sizeof(name.buf[0])

#define vec_size(v) v.size
#define vec_capacity(v) \
    (v.on_heap ? (size_t) 1 << v.capacity : sizeof(v.buf) / sizeof(v.buf[0]))

#define vec_data(v) (v.on_heap ? v.ptr : v.buf) /* always contiguous buffer */

#define vec_elemsize(v) sizeof(v.buf[0])
#define vec_pos(v, n) vec_data(v)[n] /* lvalue */

#define vec_reserve(v, n) __vec_reserve(&v, n, vec_elemsize(v), vec_capacity(v))
#define vec_push_back(v, e)                                            \
    __vec_push_back(&v, &(__typeof__(v.buf[0])[]){e}, vec_elemsize(v), \
                    vec_capacity(v))

#define vec_pop_back(v) (void) (v.size -= 1) // VV2

/* This function attribute specifies function parameters that are not supposed
 * to be null pointers. This enables the compiler to generate a warning on
 * encountering such a parameter.
 */
#define NON_NULL __attribute__((nonnull))

static NON_NULL void vec_free(void *p)
{
    STRUCT_BODY(void) *s = p;
    if (s->on_heap)
        free(s->ptr);
}

static inline int ilog2(size_t n)
{
    return 64 - __builtin_clzl(n) - ((n & (n - 1)) == 0);
}

static NON_NULL void __vec_reserve(void *vec,
                                   size_t n,
                                   size_t elemsize,
                                   size_t capacity)
{
    union {
        STRUCT_BODY(void);
        struct {
            size_t filler;
            char buf[];
        };
    } *v = vec;

    if (n > capacity) {
        if (v->on_heap) {
            v->ptr = realloc(v->ptr,
                             elemsize * (size_t) 1 << (v->capacity = ilog2(n)));
        } else {
            void *tmp =
                malloc(elemsize * (size_t) 1 << (v->capacity = ilog2(n)));
            memcpy(tmp, v->buf, elemsize * v->size);
            v->ptr = tmp;
            v->on_heap = 1;
        }
    }
}

static NON_NULL void __vec_push_back(void *restrict vec,
                                     void *restrict e,
                                     size_t elemsize,
                                     size_t capacity)
{
    union {
        STRUCT_BODY(char);
        struct {
            size_t filler;
            char buf[];
        };
    } *v = vec;

    if (v->on_heap) {
        if (v->size == capacity)
            v->ptr = realloc(v->ptr, elemsize * (size_t) 1 << ++v->capacity);
        memcpy(&v->ptr[v->size++ * elemsize], e, elemsize);
    } else {
        if (v->size == capacity) {
            void *tmp =
                malloc(elemsize * (size_t) 1 << (v->capacity = capacity + 1));
            memcpy(tmp, v->buf, elemsize * v->size);
            v->ptr = tmp;
            v->on_heap = 1;
            memcpy(&v->ptr[v->size++ * elemsize], e, elemsize);
        } else
            memcpy(&v->buf[v->size++ * elemsize], e, elemsize);
    }
}

延伸問題:

  1. 解釋上述程式運作的原理,比照 quiz2 進行效率分析;
  2. 指出上述實作的限制並提出改進方案;
  3. 參照 folly::fbvector 文件和對應原始程式碼,以上述程式為基礎,實作 vector 經典操作

解釋程式碼

#define NEXT_POWER_OF_2(s) \
    (s&(s-1)) == 0         \ // VV1
        ? s                \
        : (size_t) 1 << (64 - __builtin_clzl(s)) /* next power of 2 */

看這個 Macro 的名字可以得知在求大於數字 s 的下一個

2n 是什麼,這邊 VV1 的解答為 (s&(s-1)) == 0 ,判斷式成立的時候代表 s 是
2n
數,如果不成立的時候就得透過 __builtin_clzl(s) 把最高位元前面有幾個 0 找出來,這代表我們找出最高的 1 是在第幾個位元,再透過向左移動去求大於 s 的下一個
2n
是多少

#define vec_pop_back(v) (void) (v.size -= 1) // VV2

這邊是做 vector 很常使用的操作 vec_pop_back ,這邊把 v.size 扣掉 1,會這樣做的目的是為了配合 void display(v) 這個 Macro

#define display(v)                               \
    do {                                         \
        for (size_t i = 0; i < vec_size(v); i++) \
            printf("%.2f ", vec_pos(v, i));      \
        puts(v.on_heap ? "heap" : "stack");      \
    } while (0)

可以看見裡面定義是 vector size 有多長就印出多長,如此一來就相當於把 vector 的最後一個元素 pop 掉