2017q3 Homework1 (clz)

--- tags: sysprog2017 --- # 2017q3 Homework1 (clz) contributed by <`HexRabbit`,`Yuessiah`[^1]> ## 簡介 clz 即為 count leading zero 的簡寫，檢查在固定位元長度下最高位的 1 左側共有多少個 0，在網路上查了查，作法雖多但幾乎都是二分搜尋與 hashtable 查表的作法。 ## 案例探討 ### Linear search #### bit shift ```C int clz(uint32_t x) { if(x == 0) return 32; int count = 0; while(x) { if (x & 0x80000000) return count; x <<= 1; count++; } } ``` 最直觀的想法，不斷左移檢查 MSB 直到遇到第一個 1，但時間複雜度卻是所有作法中最糟的 O(n)。 ### Binary search #### iterative ```C int clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return (n - x); } ``` 用 y 來檢查當前二分搜索的左半邊是否存在 1，若存在則更新 x 為左半邊的值並且扣掉右邊的 bit 數，最後退出 while 迴圈時有兩種情況，若一開始輸入之 x 為零則 return n，否則會需要扣掉一開始在最高位的 1 變成return (n-1)，所以才會寫成 return (n-x)。由於反向扣除的寫法較不直觀，並且最後 n-x 的作法不容易看出目的何在，我重寫了一個正向計算的作法。 ```C int clz(uint32_t x) { int n = 0, c = 16; do { uint32_t y = x >> c; if (!y) n += c; else x = y; c >>= 1; } while (c); return n; } ``` #### recursive ```C int clz(uint32_t x, int n, int c) { if(c == 0) return n; if(y = (x >> c)) return clz(y, n, c>>1) else return clz(x, n+c, c>>1); } ``` 其實就是上面 iterative 的遞迴版本，預設 n = 0, c = 16。 #### bit mask ```C int clz(uint32_t x) { if (x == 0) return 32; int n = 0; if (x <= 0x0000FFFF) { n += 16; x <<= 16; } if (x <= 0x00FFFFFF) { n += 8; x <<= 8; } if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; } if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; } if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; } return n; } ``` 與 iterative 的作法相似，不過是透過比較大小來確定最高位的 1 在哪半邊，若落在右邊則左移並重複檢查。 #### byte shift ```C int clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } n = n - (x >> 31); return n; } ``` 與上述幾個作法相似，不再贅述。 ### Hash table #### De Bruijn sequence ```C const int tab32[32] = { 0 , 1 , 28, 2 , 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4 , 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6 , 11, 5 , 10, 9 }; int clz(uint32_t value) { value |= value >> 1; value |= value >> 2; value |= value >> 4; value |= value >> 8; value |= value >> 16; return tab32[((uint32_t)((value - (value >> 1))*0x077CB531U)) >> 27]; } ``` 這個作法相當神奇，首先透過算法製造出一任意連續5bit不重複的32bit神奇數字(De Brujin sequence)，共32種變化(0~31)，再來將其與輸入之value的最高位相乘(即為左移操作)，並將結果右移27位得到一個5bit的數字，其後便可透過任意連續5bit不重複的特性查表反推出原始最高位1的位置。 #### Harley's algorithm ```C uint8_t clz(uint32_t x) { static prog_uint8_t const Table[] = { 0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF, 16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF, 0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF, 0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF, 14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF, 18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13, 26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25, 0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF, }; /* Propagate leftmost 1-bit to the right */ x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); /* x = x * 0x6EB14F9 */ x = (x << 3) - x; /* Multiply by 7. */ x = (x << 8) - x; /* Multiply by 255. */ x = (x << 8) - x; /* Again. */ x = (x << 8) - x; /* Again. */ return pgm_read_byte(&Table[x >> 26]); } ``` ## 效能差異 ![](https://i.imgur.com/dvTbe89.png) 完全未做優化的情況下，效能由優至劣排序依序是： $$ binary = byte > harley = iteration > recursive $$ ### 嘗試使用O2最佳化選項 ```C ;>> cpuid; main.c:14 asm volatile ("CPUID\n\t" rdtsc mov edi, edx mov r8d, eax rdtscp; main.c:24 asm volatile("RDTSCP\n\t" mov esi, edx mov r11d, eax cpuid ;<< shl rsi, 0x20; main.c:37 end = (((uint64_t) high2 << 32) | low2); shl rdi, 0x20; main.c:36 start = (((uint64_t) high1 << 32) | low1); mov r11d, r11d; main.c:37 end = (((uint64_t) high2 << 32) | low2); mov r8d, r8d; main.c:36 start = (((uint64_t) high1 << 32) | low1); or rsi, r11; main.c:37 end = (((uint64_t) high2 << 32) | low2); or rdi, r8; main.c:36 start = (((uint64_t) high1 << 32) | low1); sub rsi, rdi; main.c:38 return end - start; ; '+' add r10, rsi sub r9d, 1; main.c:-11 jne 0x740;[gj] ``` 以harley為例，發現代碼被大量精簡 ( 而且clz的程式碼被簡化掉了 )，也沒有測量到正確的時間。似乎是不可行，只好自己硬幹。 ### 以組語改寫harley原始C code 首先觀察到在harley的組語中，存在大量暫存器與記憶體互動的代碼，並且沒有複雜的條件分支 ```jsx cpuid; main.c:14 asm volatile ("CPUID\n\t" rdtsc mov edi, edx mov esi, eax mov rax, qword [local_28h]; clz.h:-22 mov dword [rax], edi mov rax, qword [local_20h] mov dword [rax], esi mov eax, dword [local_8ch] mov dword [local_6ch], eax; clz.h:-20 mov eax, dword [local_6ch] shr eax, 1 or dword [local_6ch], eax mov eax, dword [local_6ch] shr eax, 2 or dword [local_6ch], eax mov eax, dword [local_6ch] shr eax, 4 or dword [local_6ch], eax mov eax, dword [local_6ch] shr eax, 8 or dword [local_6ch], eax mov eax, dword [local_6ch] shr eax, 0x10 or dword [local_6ch], eax mov eax, dword [local_6ch] shl eax, 3 sub eax, dword [local_6ch] mov dword [local_6ch], eax mov eax, dword [local_6ch] shl eax, 8 sub eax, dword [local_6ch] mov dword [local_6ch], eax mov eax, dword [local_6ch] shl eax, 8 sub eax, dword [local_6ch] mov dword [local_6ch], eax mov eax, dword [local_6ch] shl eax, 8 sub eax, dword [local_6ch] mov dword [local_6ch], eax lea rax, [local_94h] mov qword [local_38h], rax lea rax, [local_90h] mov qword [local_30h], rax rdtscp mov edi, edx mov esi, eax cpuid; main.c:-220 ``` 嘗試以inline asm改寫 ```C __asm__ __volatile__( ".intel_syntax noprefix\n\t" // "mov rax, 0\n\t" // "mov rdi, 0\n\t" // "syscall\n\t" "mov rax, rdi\n\t" "shr rdi, 1\n\t" "or rax, rdi\n\t" "mov rdi, rax\n\t" "shr rdi, 2\n\t" "or rax, rdi\n\t" "mov rdi, rax\n\t" "shr rdi, 4\n\t" "or rax, rdi\n\t" "mov rdi, rax\n\t" "shr rdi, 8\n\t" "or rax, rdi\n\t" "mov rdi, rax\n\t" "shr rdi, 16\n\t" "or rax, rdi\n\t" "mov rdi, rax\n\t" "shl rax, 3\n\t" "sub rax, rdi\n\t" "mov rdi, rax\n\t" "shl rax, 8\n\t" "sub rax, rdi\n\t" "mov rdi, rax\n\t" "shl rax, 8\n\t" "sub rax, rdi\n\t" "mov rdi, rax\n\t" "shl rax, 8\n\t" "sub rax, rdi\n\t" "shr rax, 26\n\t" ".att_syntax\n\t" ); register int i asm("rax"); return Table[i]; ``` 實驗結果 ![](https://i.imgur.com/kzaTQKE.png) 大致獲得約 35% 效能提昇(平均 82cycle -> 53cycle) [^1]: 本共筆是與`Yuessiah`[共同研究](https://hackmd.io/IwVgbAxgZgJgDATgLQgOwCN1ICwGYpxIICGcxR6q2CUApgEzEz1xA===?view)撰寫而成