# 2020q1 Homework3 (quiz4) contributed by < `hankchang805` > ###### tags : `linux2020` > [quiz4 題目](https://hackmd.io/@sysprog/linux2020-quiz4) ## 測驗 `1` 考慮到以下 `bitcpy` 實作,允許開發者對指定的記憶體區域,逐 bit 進行資料複製: ```cpp #include <stdint.h> #include <stdio.h> #include <string.h> void bitcpy(void *_dest, /* Address of the buffer to write to */ size_t _write, /* Bit offset to start writing to */ const void *_src, /* Address of the buffer to read from */ size_t _read, /* Bit offset to start reading from */ size_t _count) { uint8_t data, original, mask; size_t bitsize; size_t read_lhs = _read & 7; size_t read_rhs = 8 - read_lhs; const uint8_t *source = _src + (_read / 8); size_t write_lhs = _write & 7; //KK1 size_t write_rhs = 8 - write_lhs; uint8_t *dest = _dest + (_write / 8); static const uint8_t read_mask[] = { 0x00, /* == 0 00000000b */ 0x80, /* == 1 10000000b */ 0xC0, /* == 2 11000000b */ 0xE0, /* == 3 11100000b */ 0xF0, /* == 4 11110000b */ 0xF8, /* == 5 11111000b */ 0xFC, /* == 6 11111100b */ 0xFE, /* == 7 11111110b */ 0xFF /* == 8 11111111b */ }; static const uint8_t write_mask[] = { 0xFF, /* == 0 11111111b */ 0x7F, /* == 1 01111111b */ 0x3F, /* == 2 00111111b */ 0x1F, /* == 3 00011111b */ 0x0F, /* == 4 00001111b */ 0x07, /* == 5 00000111b */ 0x03, /* == 6 00000011b */ 0x01, /* == 7 00000001b */ 0x00 /* == 8 00000000b */ }; while (_count > 0) { data = *source++; bitsize = (_count > 8) ? 8 : _count; if (read_lhs > 0) { data <<= read_lhs; if (bitsize > read_rhs) data |= (*source >> read_rhs); } if (bitsize < 8) data &= read_mask[bitsize]; original = *dest; if (write_lhs > 0) { mask = read_mask[write_lhs]; if (bitsize > write_rhs) { *dest++ = (original & mask) | (data >> write_lhs); original = *dest & write_mask[bitsize - write_rhs]; *dest = original | (data << write_rhs); } else { if ((bitsize - write_lhs) > 0) mask = mask | write_mask[8 - (bitsize - write_lhs)]; *dest++ = (original & mask) | (data >> write_lhs); } } else { if (bitsize < 8) data = data | (original & write_mask[bitsize]); *dest++ = data; } _count -= bitsize; //KK2 } } ``` 測試的程式碼: ```cpp static uint8_t output[8], input[8]; static inline void dump_8bits(uint8_t _data) { for (int i = 0; i < 8; ++i) printf("%d", (_data & (0x80 >> i)) ? 1 : 0); } static inline void dump_binary(uint8_t *_buffer, size_t _length) { for (int i = 0; i < _length; ++i) dump_8bits(*_buffer++); } int main(int _argc, char **_argv) { memset(&input[0], 0xFF, sizeof(input)); for (int i = 1; i <= 32; ++i) { for (int j = 0; j < 16; ++j) { for (int k = 0; k < 16; ++k) { memset(&output[0], 0x00, sizeof(output)); printf("%2d:%2d:%2d ", i, k, j); bitcpy(&output[0], k, &input[0], j, i); dump_binary(&output[0], 8); printf("\n"); } } } return 0; } ``` :::success 延伸問題: 1. 解釋上述程式運作的原理,改寫為更精簡且等效的程式碼; 2. 考慮到 alignment 和 word size,改寫為執行效率更高的程式碼; 3. 在 Linux 核心原始程式碼找出逐 bit 進行資料複製的程式碼,並解說相對應的情境; 提示: 觀察 linux/bitops.h 裡頭巨集在原始程式碼的運用狀況 ::: ### 解釋程式運作原理 ```cpp void bitcpy(void *_dest, /* Address of the buffer to write to */ size_t _write, /* Bit offset to start writing to */ const void *_src, /* Address of the buffer to read from */ size_t _read, /* Bit offset to start reading from */ size_t _count){ uint8_t data, original, mask; size_t bitsize; size_t read_lhs = _read & 7; size_t read_rhs = 8 - read_lhs; const uint8_t *source = _src + (_read / 8); size_t write_lhs = _write & 7; //KK1 size_t write_rhs = 8 - write_lhs; uint8_t *dest = _dest + (_write / 8); ..... ..... } ``` 在這段程式碼中可以看見 `read_lhs` 、 `read_rhs` ,因為 `_src` 傳進來是 `uint8_t` 所以我們就以 8 bit 為一個單位來操作,`&7` 其實就是在 mod 8 ,算出來的 `read_lhs` 就是指要複製的那個 bit 左邊有幾個 bit , `read_rhs` 則反之;那 `write` 的部分也是如此 ```cpp static const uint8_t read_mask[] = { 0x00, /* == 0 00000000b */ 0x80, /* == 1 10000000b */ 0xC0, /* == 2 11000000b */ 0xE0, /* == 3 11100000b */ 0xF0, /* == 4 11110000b */ 0xF8, /* == 5 11111000b */ 0xFC, /* == 6 11111100b */ 0xFE, /* == 7 11111110b */ 0xFF /* == 8 11111111b */ }; static const uint8_t write_mask[] = { 0xFF, /* == 0 11111111b */ 0x7F, /* == 1 01111111b */ 0x3F, /* == 2 00111111b */ 0x1F, /* == 3 00011111b */ 0x0F, /* == 4 00001111b */ 0x07, /* == 5 00000111b */ 0x03, /* == 6 00000011b */ 0x01, /* == 7 00000001b */ 0x00 /* == 8 00000000b */ }; ``` 從這邊可以看到 `read` 、 `write` 的遮罩,舉例來說 `read_mask[3]` 代表要複製的位元長度為 3 bit ,所以就對前三個位元做 `&1` 運算至於其他部分就做 `&0` ,如此一來便可以只保留前三位元的資料 ```cpp while (_count > 0) { data = *source++; bitsize = (_count > 8) ? 8 : _count; if (read_lhs > 0) { data <<= read_lhs; if (bitsize > read_rhs) data |= (*source >> read_rhs); } if (bitsize < 8) data &= read_mask[bitsize]; original = *dest; if (write_lhs > 0) { mask = read_mask[write_lhs]; if (bitsize > write_rhs) { *dest++ = (original & mask) | (data >> write_lhs); original = *dest & write_mask[bitsize - write_rhs]; *dest = original | (data << write_rhs); } else { if ((bitsize - write_lhs) > 0) mask = mask | write_mask[8 - (bitsize - write_lhs)]; *dest++ = (original & mask) | (data >> write_lhs); } } else { if (bitsize < 8) data = data | (original & write_mask[bitsize]); *dest++ = data; } _count -= bitsize; //KK2 } } ``` 這段程式碼在進行逐 bit 的資料複製, `_count` 代表的是總共要複製的資料長度 ( bit ) ,以 8 bit 為一段進行處理;考慮到我們所複製的位元資料需要位移所以必須利用上面算出來的 `read_lhs` 、 `read_rhs` 做位元的調整再透過遮罩處理以得到要複製進去的資料;在 `write` 的部分也是如此。 ### 改寫為等效且精簡的程式碼 目前程式的 `if - else` 敘述太多,減少 branch 指令是我目前想到的改進 TODO : 把程式碼補上來 ## 測驗 2 vector 的操作: ```cpp int main() { v(float, 3, vec1); v(int, 2, vec2, 13, 42); printf("pos(vec2,0)=%d, pos(vec2,1)=%d\n", vec_pos(vec2, 0), vec_pos(vec2, 1)); vec_push_back(vec2, 88); vec_reserve(vec2, 100); printf("capacity(vec1)=%zu, capacity(vec2)=%zu\n", vec_capacity(vec1), vec_capacity(vec2)); printf("pos(vec2,2)=%d\n", vec_pos(vec2, 2)); #define display(v) \ do { \ for (size_t i = 0; i < vec_size(v); i++) \ printf("%.2f ", vec_pos(v, i)); \ puts(v.on_heap ? "heap" : "stack"); \ } while (0) display(vec1); vec_push_back(vec1, 0.0); display(vec1); vec_push_back(vec1, 1.1); display(vec1); vec_push_back(vec1, 2.2); display(vec1); vec_push_back(vec1, 3.3); display(vec1); vec_push_back(vec1, 4.4); display(vec1); vec_push_back(vec1, 5.5); display(vec1); vec_pop_back(vec1); display(vec1); vec_pop_back(vec1); display(vec1); vec_pop_back(vec1); display(vec1); vec_pop_back(vec1); display(vec1); vec_pop_back(vec1); display(vec1); vec_pop_back(vec1); display(vec1); return 0; } ``` 程式碼本體如下: ```cpp #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> /* vector with small buffer optimization */ #define STRUCT_BODY(type) \ struct { \ size_t size : 54, on_heap : 1, capacity : 6, flag1 : 1, flag2 : 1, \ flag3 : 1; \ type *ptr; \ } #define NEXT_POWER_OF_2(s) \ (s&(s-1)) == 0 \ // VV1 ? s \ : (size_t) 1 << (64 - __builtin_clzl(s)) /* next power of 2 */ #define v(t, s, name, ...) \ union { \ STRUCT_BODY(t); \ struct { \ size_t filler; \ t buf[NEXT_POWER_OF_2(s)]; \ }; \ } name __attribute__((cleanup(vec_free))) = {.buf = {__VA_ARGS__}}; \ name.size = sizeof((__typeof__(name.buf[0])[]){0, __VA_ARGS__}) / \ sizeof(name.buf[0]) - 1; \ name.capacity = sizeof(name.buf) / sizeof(name.buf[0]) #define vec_size(v) v.size #define vec_capacity(v) \ (v.on_heap ? (size_t) 1 << v.capacity : sizeof(v.buf) / sizeof(v.buf[0])) #define vec_data(v) (v.on_heap ? v.ptr : v.buf) /* always contiguous buffer */ #define vec_elemsize(v) sizeof(v.buf[0]) #define vec_pos(v, n) vec_data(v)[n] /* lvalue */ #define vec_reserve(v, n) __vec_reserve(&v, n, vec_elemsize(v), vec_capacity(v)) #define vec_push_back(v, e) \ __vec_push_back(&v, &(__typeof__(v.buf[0])[]){e}, vec_elemsize(v), \ vec_capacity(v)) #define vec_pop_back(v) (void) (v.size -= 1) // VV2 /* This function attribute specifies function parameters that are not supposed * to be null pointers. This enables the compiler to generate a warning on * encountering such a parameter. */ #define NON_NULL __attribute__((nonnull)) static NON_NULL void vec_free(void *p) { STRUCT_BODY(void) *s = p; if (s->on_heap) free(s->ptr); } static inline int ilog2(size_t n) { return 64 - __builtin_clzl(n) - ((n & (n - 1)) == 0); } static NON_NULL void __vec_reserve(void *vec, size_t n, size_t elemsize, size_t capacity) { union { STRUCT_BODY(void); struct { size_t filler; char buf[]; }; } *v = vec; if (n > capacity) { if (v->on_heap) { v->ptr = realloc(v->ptr, elemsize * (size_t) 1 << (v->capacity = ilog2(n))); } else { void *tmp = malloc(elemsize * (size_t) 1 << (v->capacity = ilog2(n))); memcpy(tmp, v->buf, elemsize * v->size); v->ptr = tmp; v->on_heap = 1; } } } static NON_NULL void __vec_push_back(void *restrict vec, void *restrict e, size_t elemsize, size_t capacity) { union { STRUCT_BODY(char); struct { size_t filler; char buf[]; }; } *v = vec; if (v->on_heap) { if (v->size == capacity) v->ptr = realloc(v->ptr, elemsize * (size_t) 1 << ++v->capacity); memcpy(&v->ptr[v->size++ * elemsize], e, elemsize); } else { if (v->size == capacity) { void *tmp = malloc(elemsize * (size_t) 1 << (v->capacity = capacity + 1)); memcpy(tmp, v->buf, elemsize * v->size); v->ptr = tmp; v->on_heap = 1; memcpy(&v->ptr[v->size++ * elemsize], e, elemsize); } else memcpy(&v->buf[v->size++ * elemsize], e, elemsize); } } ``` :::success 延伸問題: 1. 解釋上述程式運作的原理,比照 quiz2 進行效率分析; 2. 指出上述實作的限制並提出改進方案; 3. 參照 folly::fbvector 文件和對應原始程式碼,以上述程式為基礎,實作 vector 經典操作 ::: ### 解釋程式碼 ```cpp #define NEXT_POWER_OF_2(s) \ (s&(s-1)) == 0 \ // VV1 ? s \ : (size_t) 1 << (64 - __builtin_clzl(s)) /* next power of 2 */ ``` 看這個 Macro 的名字可以得知在求大於數字 s 的下一個 $2^n$ 是什麼,這邊 `VV1` 的解答為 `(s&(s-1)) == 0` ,判斷式成立的時候代表 s 是 $2^n$ 數,如果不成立的時候就得透過 `__builtin_clzl(s)` 把最高位元前面有幾個 0 找出來,這代表我們找出最高的 1 是在第幾個位元,再透過向左移動去求大於 s 的下一個 $2^n$ 是多少 ```cpp #define vec_pop_back(v) (void) (v.size -= 1) // VV2 ``` 這邊是做 vector 很常使用的操作 `vec_pop_back` ,這邊把 `v.size` 扣掉 1,會這樣做的目的是為了配合 `void display(v)` 這個 Macro ```cpp #define display(v) \ do { \ for (size_t i = 0; i < vec_size(v); i++) \ printf("%.2f ", vec_pos(v, i)); \ puts(v.on_heap ? "heap" : "stack"); \ } while (0) ``` 可以看見裡面定義是 vector size 有多長就印出多長,如此一來就相當於把 vector 的最後一個元素 pop 掉