2017q3 Homework1 (clz) ===== contributed by <`loolofen`> ## 開發環境 ```` $ lscpu Architecture: i686 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 4 On-line CPU(s) list: 0-3 Thread(s) per core: 1 Core(s) per socket: 4 Socket(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 60 Model name: Intel(R) Core(TM) i5-4460 CPU @ 3.20GHz Stepping: 3 CPU MHz: 3200.625 CPU max MHz: 3400.0000 CPU min MHz: 800.0000 BogoMIPS: 6385.15 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 6144K ```` ## 分析 ### Iteration version ```clike= static inline __attribute((always_inline)) unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return (n - x); } ``` 直接代入數值觀測: 若有一數值 x = 0x0000abcd,依序執行 * x=0x0000abcd,y=0x00000000,c=8,n=32 * x=0x000000ab,y=0x000000ab,c=4,n=24 * x=0x0000000a,y=0x000000ab,c=2,n=20 * x=0x00000002,y=0x00000002,c=1,n=18 * x=0x00000001,y=0x00000001,c=0,n=17 * return (17 - 1) 此效果相當於先把數值分為兩半,若左半部有大於 0 之數字則往左半部計算,否則往右半部進行計算,持續此計算直到 $x = 1 或 0$。並回傳 leading zero 的值 ### Binary Search version ```clike= static inline __attribute((always_inline)) unsigned clz(uint32_t x) { if (x == 0) return 32; int n = 0; if (x <= 0x0000FFFF) { n += 16; x <<= 16; } if (x <= 0x00FFFFFF) { n += 8; x <<= 8; } if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; } if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; } return n; } ``` 原理和 Iteration version 一樣 ### Byte Shift version ```clike= static inline __attribute((always_inline)) unsigned clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } return n; } ``` 原理和 Iteration version 一樣 ### Recursive version ```clike static const int mask[] = { 0, 8, 12,14 }; static const int magic[] = { 2, 1, 0, 0 }; unsigned clz2(uint32_t x,int c) { if (!x && !c) return 32; uint32_t upper = (x >> (16 >> c)); uint32_t lower = (x & (0xFFFF >> mask[c])); if (c == 3) return upper ? magic[upper] : 2 + magic[lower]; return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1); } ``` 原理和 Iteration version 一樣 ### Harley’s Algorithm version ```clike static inline __attribute((always_inline)) unsigned clz(uint32_t x) { #ifdef CTZ static uint8_t const Table[] = { 0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF, 16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF, 0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF, 0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF, 14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF, 18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13, 26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25, 0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF, }; #else static uint8_t const Table[] = { 32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0, 0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0, 17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18, 5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0 }; #endif /* Propagate leftmost 1-bit to the right */ x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); /* x = x * 0x6EB14F9 */ x = (x << 3) - x; /* Multiply by 7. */ x = (x << 8) - x; /* Multiply by 255. */ x = (x << 8) - x; /* Again. */ x = (x << 8) - x; /* Again. */ return Table[(x >> 26)]; } ``` 把 leading zero 後的位元全部補為 1 再查表得到 leading zero 的數量 ## 程式執行 由下圖可以看出 byte-shift 跟 binary search 是最快速的,接著是 iteration,byharley,最後是 recursive ![](https://i.imgur.com/1owk5hk.png)