2016q3 Homework1 (clz) === contributed by <`RayPan`> ### 開發環境 * Ubuntu 16.04 LTS * CPU ``` Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 4 On-line CPU(s) list: 0-3 Thread(s) per core: 2 Core(s) per socket: 2 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 60 Model name: Intel(R) Core(TM) i5-4200H CPU @ 2.80GHz Stepping: 3 CPU MHz: 2869.015 CPU max MHz: 3400.0000 CPU min MHz: 800.0000 BogoMIPS: 5587.49 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 3072K NUMA node0 CPU(s): 0-3 ``` --- ### 開發目標 比較 32-bit 數值對以下實做的效能差異: * recursive version * iteration version * binary search technique * byte-shift version * Harley's algorithm --- ### 案例分析 * #### Recursive ```c=7 uint8_t clz_recursive(uint32_t x, int shift_len) { if(shift_len == 0) return 0; /* shift upper half down, rest is filled up with 0s */ uint16_t upper = (x >> shift_len); // mask upper half away uint16_t lower = (x & (0xFFFF >> (16 - shift_len))); return upper ? clz_recursive(upper, shift_len >> 1) : shift_len + clz_recursive(lower, shift_len >> 1); } ``` * #### Iteration ```c=18 uint8_t clz_iteration(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return (n - x); } ``` * #### Binary Serach ```c=33 uint8_t clz_binary_search(uint32_t x) { int n = 0; if (x == 0) return 32; if (x <= 0x0000FFFF){ n += 16; x <<= 16; } if (x <= 0x00FFFFFF){ n += 8; x <<= 8; } if (x <= 0x0FFFFFFF){ n += 4; x <<= 4; } if (x <= 0x3FFFFFFF){ n += 2; x <<= 2; } if (x <= 0x7FFFFFFF){ n += 1; x <<= 1; } return n; } ``` * #### Byte Shift ```c=60 uint8_t clz_byte_shift(uint32_t x) { int n = 1; if (x == 0) return 32; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } n = n - (x >> 31); return n; } ``` * #### Harley ```c=84 int clz_harley(uint32_t x) { static uint8_t const Table[] = { 32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u }; /* Propagate leftmost 1-bit to the right */ x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); /* x = x * 0x6EB14F9 */ x = (x << 3) - x; /* Multiply by 7. */ x = (x << 8) - x; /* Multiply by 255. */ x = (x << 8) - x; /* Again. */ x = (x << 8) - x; /* Again. */ return Table[x >> 26]; } ``` --- ### 效能分析 ![](https://i.imgur.com/bM6Pton.png) --- 加工中 --- 參考共筆: [TempoJiJi](https://hackmd.io/s/ryiig7YT) 作業詳細資訊: [A04:clz](https://hackmd.io/s/B1LHefQ6) ###### tags `RayPan` `A04 Clz`