2017q3 Homework1 (clz)

contributed by <HexRabbit,Yuessiah^[1]>

簡介

clz 即為 count leading zero 的簡寫，檢查在固定位元長度下最高位的 1 左側共有多少個 0，在網路上查了查，作法雖多但幾乎都是二分搜尋與 hashtable 查表的作法。

案例探討

Linear search

bit shift

int clz(uint32_t x) {
    if(x == 0) return 32;
    int count = 0;
    while(x) {
        if (x & 0x80000000) return count;
        x <<= 1;
        count++;
    }
}

最直觀的想法，不斷左移檢查 MSB 直到遇到第一個 1，但時間複雜度卻是所有作法中最糟的 O(n)。

Binary search

iterative

int clz(uint32_t x) {
    int n = 32, c = 16;
    do {
        uint32_t y = x >> c;
        if (y) { n -= c; x = y; }
        c >>= 1;
    } while (c);
    return (n - x);
}

用 y 來檢查當前二分搜索的左半邊是否存在 1，若存在則更新 x 為左半邊的值並且扣掉右邊的 bit 數，最後退出 while 迴圈時有兩種情況，若一開始輸入之 x 為零則 return n，否則會需要扣掉一開始在最高位的 1 變成return (n-1)，所以才會寫成 return (n-x)。
由於反向扣除的寫法較不直觀，並且最後 n-x 的作法不容易看出目的何在，我重寫了一個正向計算的作法。

int clz(uint32_t x) {
    int n = 0, c = 16;
    do {
        uint32_t y = x >> c;
        if (!y) n += c;
        else x = y;
        c >>= 1;
    } while (c);
    return n;
}

recursive

int clz(uint32_t x, int n, int c) {
    if(c == 0) return n;
    if(y = (x >> c)) return clz(y, n, c>>1)
    else return clz(x, n+c, c>>1);
}

其實就是上面 iterative 的遞迴版本，預設 n = 0, c = 16。

bit mask

int clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 0;
    if (x <= 0x0000FFFF) { n += 16; x <<= 16; }
    if (x <= 0x00FFFFFF) { n += 8; x <<= 8; }
    if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; }
    if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; }
    if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; }
    return n;
}

與 iterative 的作法相似，不過是透過比較大小來確定最高位的 1 在哪半邊，若落在右邊則左移並重複檢查。

byte shift

int clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 1;
    if ((x >> 16) == 0) { n += 16; x <<= 16; }
    if ((x >> 24) == 0) { n += 8; x <<= 8; }
    if ((x >> 28) == 0) { n += 4; x <<= 4; }
    if ((x >> 30) == 0) { n += 2; x <<= 2; }
    n = n - (x >> 31);
    return n;
}

與上述幾個作法相似，不再贅述。

Hash table

De Bruijn sequence

const int tab32[32] = {
    0 , 1 , 28, 2 , 29, 14, 24, 3, 
    30, 22, 20, 15, 25, 17, 4 , 8,
    31, 27, 13, 23, 21, 19, 16, 7,
    26, 12, 18, 6 , 11, 5 , 10, 9
};

int clz(uint32_t value) {
    value |= value >> 1;
    value |= value >> 2;
    value |= value >> 4;
    value |= value >> 8;
    value |= value >> 16;

    return tab32[((uint32_t)((value - (value >> 1))*0x077CB531U)) >> 27];
}

這個作法相當神奇，首先透過算法製造出一任意連續5bit不重複的32bit神奇數字(De Brujin sequence)，共32種變化(0~31)，再來將其與輸入之value的最高位相乘(即為左移操作)，並將結果右移27位得到一個5bit的數字，其後便可透過任意連續5bit不重複的特性查表反推出原始最高位1的位置。

Harley's algorithm

uint8_t clz(uint32_t x)
{
    static prog_uint8_t const Table[] = {
      0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
      16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
      0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
      0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
      14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
      18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
      26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
      0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
    };

    /* Propagate leftmost 1-bit to the right */
    x = x | (x >> 1);
    x = x | (x >> 2);
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >> 16);
 
    /* x = x * 0x6EB14F9 */
    x = (x << 3) - x;   /* Multiply by 7. */
    x = (x << 8) - x;   /* Multiply by 255. */
    x = (x << 8) - x;   /* Again. */
    x = (x << 8) - x;   /* Again. */

    return pgm_read_byte(&Table[x >> 26]);
}

效能差異

完全未做優化的情況下，效能由優至劣排序依序是：

b i n a r y = b y t e > h a r l e y = i t e r a t i o n > r e c u r s i v e

嘗試使用O2最佳化選項

;>>
cpuid; main.c:14     asm volatile ("CPUID\n\t"                           
rdtsc                                                                    
mov edi, edx                                                             
mov r8d, eax                                                             
rdtscp; main.c:24     asm volatile("RDTSCP\n\t"                          
mov esi, edx                                                             
mov r11d, eax                                                            
cpuid
;<<
shl rsi, 0x20; main.c:37     end = (((uint64_t) high2 << 32) | low2);    
shl rdi, 0x20; main.c:36     start = (((uint64_t) high1 << 32) | low1);  
mov r11d, r11d; main.c:37     end = (((uint64_t) high2 << 32) | low2);   
mov r8d, r8d; main.c:36     start = (((uint64_t) high1 << 32) | low1);   
or rsi, r11; main.c:37     end = (((uint64_t) high2 << 32) | low2);      
or rdi, r8; main.c:36     start = (((uint64_t) high1 << 32) | low1);     
sub rsi, rdi; main.c:38     return end - start;                          
   ; '+'                                                                 
add r10, rsi                                                             
sub r9d, 1; main.c:-11                                                   
jne 0x740;[gj]

以harley為例，發現代碼被大量精簡 ( 而且clz的程式碼被簡化掉了 )，也沒有測量到正確的時間。
似乎是不可行，只好自己硬幹。

以組語改寫harley原始C code

首先觀察到在harley的組語中，存在大量暫存器與記憶體互動的代碼，
並且沒有複雜的條件分支

cpuid; main.c:14     asm volatile ("CPUID\n\t"  
rdtsc                                           
mov edi, edx                                    
mov esi, eax                                    
mov rax, qword [local_28h]; clz.h:-22           
mov dword [rax], edi                            
mov rax, qword [local_20h]                      
mov dword [rax], esi                            
mov eax, dword [local_8ch]                      
mov dword [local_6ch], eax; clz.h:-20           
mov eax, dword [local_6ch]                      
shr eax, 1                                      
or dword [local_6ch], eax                       
mov eax, dword [local_6ch]                      
shr eax, 2                                      
or dword [local_6ch], eax                       
mov eax, dword [local_6ch]                      
shr eax, 4                                      
or dword [local_6ch], eax                       
mov eax, dword [local_6ch]                      
shr eax, 8                                      
or dword [local_6ch], eax                       
mov eax, dword [local_6ch]                      
shr eax, 0x10                                   
or dword [local_6ch], eax                       
mov eax, dword [local_6ch]                      
shl eax, 3                                      
sub eax, dword [local_6ch]                      
mov dword [local_6ch], eax                      
mov eax, dword [local_6ch]                      
shl eax, 8                                      
sub eax, dword [local_6ch]                      
mov dword [local_6ch], eax                      
mov eax, dword [local_6ch]                      
shl eax, 8                                      
sub eax, dword [local_6ch]                      
mov dword [local_6ch], eax                      
mov eax, dword [local_6ch]                      
shl eax, 8                
sub eax, dword [local_6ch]
mov dword [local_6ch], eax
lea rax, [local_94h]      
mov qword [local_38h], rax
lea rax, [local_90h]      
mov qword [local_30h], rax
rdtscp                    
mov edi, edx              
mov esi, eax              
cpuid; main.c:-220

嘗試以inline asm改寫

  __asm__ __volatile__(
      ".intel_syntax noprefix\n\t"
//    "mov rax, 0\n\t"
//    "mov rdi, 0\n\t"
//    "syscall\n\t"
      "mov rax, rdi\n\t"
      "shr rdi, 1\n\t"
      "or rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shr rdi, 2\n\t"
      "or rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shr rdi, 4\n\t"
      "or rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shr rdi, 8\n\t"
      "or rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shr rdi, 16\n\t"
      "or rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shl rax, 3\n\t"
      "sub rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shl rax, 8\n\t"
      "sub rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shl rax, 8\n\t"
      "sub rax, rdi\n\t"
      "mov rdi, rax\n\t"
      "shl rax, 8\n\t"
      "sub rax, rdi\n\t"
      "shr rax, 26\n\t"
      ".att_syntax\n\t"
      );
  register int i asm("rax");
  return Table[i];

實驗結果

大致獲得約 35% 效能提昇(平均 82cycle -> 53cycle)

本共筆是與Yuessiah共同研究撰寫而成 ↩︎