# 2017q3 Homework(clz) contributed by < `kevin55216` > ## 主要測試平台 ``` OS: ubuntu 16.04 LTS AMD64 RAM 5.7GB CPU i5-2410M 2.30GHz 2C4T Disk 52.8GB ``` ## 次要測試平台 ``` OS: ubuntu 16.04 LTS AMD64 RAM: 31.4GB CPU: i7-4790k 4.0GHz 4C8T Disk: 212.3GB ``` ## 比較在各個值域的表現 ![](https://i.imgur.com/5js90MU.png) clz output => 32 - clzoutput 這邊我做了0x00000001、0x00000002、0x000000004到0x80000000各100000次的統計。 會選擇這樣的統計方法是考慮到 $0\ to\ 2^{32}-1$ 需要許多的執行時間且在低值域執行數量過少 ![](https://i.imgur.com/uZwlC4I.png) iteration in 67100000 to 67116000 ![](https://i.imgur.com/v5AQCTI.png) byte in 67100000 to 67116000 ![](https://i.imgur.com/L8wroTZ.png) binary in 67100000 to 67116000 ![](https://i.imgur.com/R63QbHU.png) recursive in 67100000 to 67116000 ![](https://i.imgur.com/Tr9JnwO.png) harley in 67100000 to 67116000 由上面五張圖可以看出recursive穩定度最差且時間表現也最差。 ![](https://i.imgur.com/5QhLABj.png) 上圖 clz 結果為 32 到 18 ![](https://i.imgur.com/yHd7mZi.png) 上圖 clz 結果為 6 到 5 ![](https://i.imgur.com/NjRnRnX.png) 上圖 clz 結果為 1 ``` Performance counter stats for './recursive 67100000 67116384': 1,5165 cache-misses # 1.549 % of all cache refs 97,8706 cache-references 4,7521,1287 instructions # 0.37 insn per cycle 12,6822,3770 cycles 0.301447319 seconds time elapsed Performance counter stats for './harley 67100000 67116384': 1,9492 cache-misses # 2.115 % of all cache refs 92,1710 cache-references 2,3851,1222 instructions # 0.17 insn per cycle 13,6648,7269 cycles 0.327269713 seconds time elapsed Performance counter stats for './iteration 67100000 67116384': 1,0984 cache-misses # 1.141 % of all cache refs 96,2360 cache-references 3,0070,0970 instructions # 0.24 insn per cycle 12,5552,9385 cycles 0.298588728 seconds time elapsed Performance counter stats for './binary 67100000 67116384': 1,2518 cache-misses # 1.450 % of all cache refs 86,3076 cache-references 2,1702,6129 instructions # 0.18 insn per cycle 12,0022,9473 cycles 0.284078212 seconds time elapsed Performance counter stats for './byte 67100000 67116384': 9805 cache-misses # 1.053 % of all cache refs 93,0713 cache-references 2,3360,0979 instructions # 0.19 insn per cycle 12,0301,5914 cycles 0.286465763 seconds time elapsed ``` 發現執行時間為$recursive>iteration=harley>binary=byte$ ## recursive ```clike= static const int mask[] = { 0, 8, 12,14 }; static const int magic[] = { 2, 1, 0, 0 }; unsigned clz2(uint32_t x,int c) { if (!x && !c) return 32; uint32_t upper = (x >> (16 >> c)); uint32_t lower = (x & (0xFFFF >> mask[c])); if (c == 3) return upper ? magic[upper] : 2 + magic[lower]; return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1); } ``` 以6為例 ``` clz(6, 0) x = 6 => 00000000000000000000000000000110 upper = x >> 16 = 0 lower = x & (0xFFFF >> 0) = 6 return 0 ? clz2(0, 1) : (16 >> (0)) + clz2(6, 1) =return 16 + clz2(6, 1) ---------------------------------------------------------------------------- clz(6, 1) upper = x >> (16 >> 1) = x >> 8 = 0 lower = x & (0xFFFF >> 8) = 6 return 0 ? clz2(0, 2) : (16 >> (1)) + clz2(6, 2) =return 8 + clz2(6, 2) ---------------------------------------------------------------------------- clz(6, 2) upper=x >> 4 = 0 lower=x & (0xFFFF >> 12) = 6 return 0 ? clz2(0, 3) : (16 >> (2)) + clz2(6, 3) = return 4 + clz2(6, 3) ---------------------------------------------------------------------------- clz2(6, 3) upper = x >> 2 = 1 lower = x & (0xFFFF >> 14) = 2 if(c == 3)return 1 ? magic[1] : 2+magic[2] = return 1 ---------------------------------------------------------------------------- output => 1 + 4 + 8 + 16 = 29 ``` 以0x7FFFFFFF為例 ``` clz(0x7FFFFFFF, 0) x = 0x7FFFFFFF => 01111111111111111111111111111111 upper = x >> 16 = 0x7FFF lower=x&(0xFFFF>>0)=0xFFFF return 0x7FFF ? clz2(0x7FFF, 1) : (16 >> (0)) + clz2(0xFFFF, 1) =return clz2(0x7FFF, 1) ---------------------------------------------------------------------------- clz(0x7FFF,1) upper = x >> (16 >> 1) = x >> 8 = 0x7F return clz2(0x7F, 2) ---------------------------------------------------------------------------- clz(0x7F, 2) upper = x >> 4 = 7 return clz2(0x7, 3) ---------------------------------------------------------------------------- clz2(7, 3) upper = x >> 2 = 1 lower = x&(0xFFFF >> 14) = 3 if(c == 3)return 1?magic[1]:2+magic[3]; = return 1 ---------------------------------------------------------------------------- output => 1 ``` 結論:recursive 一定要作到c=3才會得出解。 ## byte ```clike= unsigned clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } n = n - (x >> 31); return n; } ``` 以6為例 ``` clz(6) 6 = 110 n = 1 if(6 >> 16 == 0) n = 16 + 1 = 17 x = 1100000000000000000 if(1100000000000000000 >> 24 == 0) n = 8+17=25 x = 110000000000000000000000000 if(110000000000000000000000000 >> 28 == 0) n = 4+25=29 x = 1100000000000000000000000000000 if(1100000000000000000000000000000 >> 30 == 0) => if(1 == 0) n = 29 - 0 return 29 ``` 0x7FFFFFFF ``` clz(0x7FFFFFFF) n=1 if ((x >> 16) == 0) if ((x >> 24) == 0) if ((x >> 28) == 0) if ((x >> 30) == 0) 1 = 1 - (x >> 31) = 1 - 0 = 1 return 1; ``` ## harley ```clike= static inline __attribute((always_inline)) unsigned clz(uint32_t x) { #ifdef CTZ static uint8_t const Table[] = { 0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF, 16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF, 0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF, 0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF, 14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF, 18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13, 26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25, 0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF, }; #else static uint8_t const Table[] = { 32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0, 0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0, 17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18, 5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0 }; #endif /* Propagate leftmost 1-bit to the right */ x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); /* x = x * 0x6EB14F9 */ x = (x << 3) - x; /* Multiply by 7. */ x = (x << 8) - x; /* Multiply by 255. */ x = (x << 8) - x; /* Again. */ x = (x << 8) - x; /* Again. */ return Table[(x >> 26)]; } ``` 以0x40000000 ``` 0x40000000= 01000000000000000000000000000000 -------------------------------------------------------------------- step 1 : 01100000000000000000000000000000 (x | (x >> 1)) 01111000000000000000000000000000 (x | (x >> 2)) 01111111100000000000000000000000 (x | (x >> 4)) 01111111111111111000000000000000 (x | (x >> 8)) 01111111111111111111111111111111 (x | (x >> 16)) step 2 : -------------------------------------------------------------------- 11111111111111111111111111111000 (x << 3) - 01111111111111111111111111111111 (- x) = 01111111111111111111111111111001 -------------------------------------------------------------------- 11111111111111111111100100000000 (x << 8) - 01111111111111111111111111111001 (- x) = 01111111111111111111100100000111 -------------------------------------------------------------------- 11111111111110010000011100000000 (x << 8) - 01111111111111111111100100000111 (- x) = 01111111111111111111111111111001 -------------------------------------------------------------------- 11111111111111111111100100000000 (x << 8) - 01111111111111111111111111111001 (- x) = 01111111111111111111100100000111 -------------------------------------------------------------------- x >> 26 = 011111 = 31 table[31] = 1 ``` $for\ all\ x\ [0:OxFFFFFFFF]\ after\ step\ 1,\ x\ will\ only\ be\ 2^i-1\ where\ i\ [0:32]\ so\\ if\ we\ can\ hash\ 2^i-1\ in\ to\ a\ table\ we\ can\ get\ clz$ ## 試圖改良(失敗) ```clike= static const int mask[] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; unsigned clz(uint32_t x) { if (!x) return 32; int n = 1; if (!(x >> 16)) { n += 16; x <<= 16; } if (!(x >> 24)) { n += 8; x <<= 8; } n += mask[x >> 28]; return n; } ``` ![](https://imgur.com/yXrvlAs.png) 看起來load的速度是沒辦法加速 ## 結論 recursive 需要做三次recursive,harley 計算完後需要去做查表,byte 做完計算直接得出結 我們可以看到雙b效能最好、recursive 效能最差。 執行時間為$recursive>iteration=harley>binary=byte$ binary 和 byte 演算法 甚至 recursive 也已經作到 $Olg(n)$ 了,我認為要加速除非把值做hash selection來達到$O(1)$的時間複雜度,harley 雖然也有製作hash table 但是前面再做 補 1 的動作就用$Olg(n)$了