contributed by <HexRabbit
,Yuessiah
[1]>
clz 即為 count leading zero 的簡寫,檢查在固定位元長度下最高位的 1 左側共有多少個 0,在網路上查了查,作法雖多但幾乎都是二分搜尋與 hashtable 查表的作法。
int clz(uint32_t x) {
if(x == 0) return 32;
int count = 0;
while(x) {
if (x & 0x80000000) return count;
x <<= 1;
count++;
}
}
最直觀的想法,不斷左移檢查 MSB 直到遇到第一個 1,但時間複雜度卻是所有作法中最糟的 O(n)。
int clz(uint32_t x) {
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) { n -= c; x = y; }
c >>= 1;
} while (c);
return (n - x);
}
用 y 來檢查當前二分搜索的左半邊是否存在 1,若存在則更新 x 為左半邊的值並且扣掉右邊的 bit 數,最後退出 while 迴圈時有兩種情況,若一開始輸入之 x 為零則 return n,否則會需要扣掉一開始在最高位的 1 變成return (n-1),所以才會寫成 return (n-x)。
由於反向扣除的寫法較不直觀,並且最後 n-x 的作法不容易看出目的何在,我重寫了一個正向計算的作法。
int clz(uint32_t x) {
int n = 0, c = 16;
do {
uint32_t y = x >> c;
if (!y) n += c;
else x = y;
c >>= 1;
} while (c);
return n;
}
int clz(uint32_t x, int n, int c) {
if(c == 0) return n;
if(y = (x >> c)) return clz(y, n, c>>1)
else return clz(x, n+c, c>>1);
}
其實就是上面 iterative 的遞迴版本,預設 n = 0, c = 16。
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 0;
if (x <= 0x0000FFFF) { n += 16; x <<= 16; }
if (x <= 0x00FFFFFF) { n += 8; x <<= 8; }
if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; }
if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; }
if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; }
return n;
}
與 iterative 的作法相似,不過是透過比較大小來確定最高位的 1 在哪半邊,若落在右邊則左移並重複檢查。
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) { n += 16; x <<= 16; }
if ((x >> 24) == 0) { n += 8; x <<= 8; }
if ((x >> 28) == 0) { n += 4; x <<= 4; }
if ((x >> 30) == 0) { n += 2; x <<= 2; }
n = n - (x >> 31);
return n;
}
與上述幾個作法相似,不再贅述。
const int tab32[32] = {
0 , 1 , 28, 2 , 29, 14, 24, 3,
30, 22, 20, 15, 25, 17, 4 , 8,
31, 27, 13, 23, 21, 19, 16, 7,
26, 12, 18, 6 , 11, 5 , 10, 9
};
int clz(uint32_t value) {
value |= value >> 1;
value |= value >> 2;
value |= value >> 4;
value |= value >> 8;
value |= value >> 16;
return tab32[((uint32_t)((value - (value >> 1))*0x077CB531U)) >> 27];
}
這個作法相當神奇,首先透過算法製造出一任意連續5bit不重複的32bit神奇數字(De Brujin sequence),共32種變化(0~31),再來將其與輸入之value的最高位相乘(即為左移操作),並將結果右移27位得到一個5bit的數字,其後便可透過任意連續5bit不重複的特性查表反推出原始最高位1的位置。
uint8_t clz(uint32_t x)
{
static prog_uint8_t const Table[] = {
0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
};
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return pgm_read_byte(&Table[x >> 26]);
}
完全未做優化的情況下,效能由優至劣排序依序是:
;>>
cpuid; main.c:14 asm volatile ("CPUID\n\t"
rdtsc
mov edi, edx
mov r8d, eax
rdtscp; main.c:24 asm volatile("RDTSCP\n\t"
mov esi, edx
mov r11d, eax
cpuid
;<<
shl rsi, 0x20; main.c:37 end = (((uint64_t) high2 << 32) | low2);
shl rdi, 0x20; main.c:36 start = (((uint64_t) high1 << 32) | low1);
mov r11d, r11d; main.c:37 end = (((uint64_t) high2 << 32) | low2);
mov r8d, r8d; main.c:36 start = (((uint64_t) high1 << 32) | low1);
or rsi, r11; main.c:37 end = (((uint64_t) high2 << 32) | low2);
or rdi, r8; main.c:36 start = (((uint64_t) high1 << 32) | low1);
sub rsi, rdi; main.c:38 return end - start;
; '+'
add r10, rsi
sub r9d, 1; main.c:-11
jne 0x740;[gj]
以harley為例,發現代碼被大量精簡 ( 而且clz的程式碼被簡化掉了 ),也沒有測量到正確的時間。
似乎是不可行,只好自己硬幹。
首先觀察到在harley的組語中,存在大量暫存器與記憶體互動的代碼,
並且沒有複雜的條件分支
cpuid; main.c:14 asm volatile ("CPUID\n\t"
rdtsc
mov edi, edx
mov esi, eax
mov rax, qword [local_28h]; clz.h:-22
mov dword [rax], edi
mov rax, qword [local_20h]
mov dword [rax], esi
mov eax, dword [local_8ch]
mov dword [local_6ch], eax; clz.h:-20
mov eax, dword [local_6ch]
shr eax, 1
or dword [local_6ch], eax
mov eax, dword [local_6ch]
shr eax, 2
or dword [local_6ch], eax
mov eax, dword [local_6ch]
shr eax, 4
or dword [local_6ch], eax
mov eax, dword [local_6ch]
shr eax, 8
or dword [local_6ch], eax
mov eax, dword [local_6ch]
shr eax, 0x10
or dword [local_6ch], eax
mov eax, dword [local_6ch]
shl eax, 3
sub eax, dword [local_6ch]
mov dword [local_6ch], eax
mov eax, dword [local_6ch]
shl eax, 8
sub eax, dword [local_6ch]
mov dword [local_6ch], eax
mov eax, dword [local_6ch]
shl eax, 8
sub eax, dword [local_6ch]
mov dword [local_6ch], eax
mov eax, dword [local_6ch]
shl eax, 8
sub eax, dword [local_6ch]
mov dword [local_6ch], eax
lea rax, [local_94h]
mov qword [local_38h], rax
lea rax, [local_90h]
mov qword [local_30h], rax
rdtscp
mov edi, edx
mov esi, eax
cpuid; main.c:-220
嘗試以inline asm改寫
__asm__ __volatile__(
".intel_syntax noprefix\n\t"
// "mov rax, 0\n\t"
// "mov rdi, 0\n\t"
// "syscall\n\t"
"mov rax, rdi\n\t"
"shr rdi, 1\n\t"
"or rax, rdi\n\t"
"mov rdi, rax\n\t"
"shr rdi, 2\n\t"
"or rax, rdi\n\t"
"mov rdi, rax\n\t"
"shr rdi, 4\n\t"
"or rax, rdi\n\t"
"mov rdi, rax\n\t"
"shr rdi, 8\n\t"
"or rax, rdi\n\t"
"mov rdi, rax\n\t"
"shr rdi, 16\n\t"
"or rax, rdi\n\t"
"mov rdi, rax\n\t"
"shl rax, 3\n\t"
"sub rax, rdi\n\t"
"mov rdi, rax\n\t"
"shl rax, 8\n\t"
"sub rax, rdi\n\t"
"mov rdi, rax\n\t"
"shl rax, 8\n\t"
"sub rax, rdi\n\t"
"mov rdi, rax\n\t"
"shl rax, 8\n\t"
"sub rax, rdi\n\t"
"shr rax, 26\n\t"
".att_syntax\n\t"
);
register int i asm("rax");
return Table[i];
實驗結果
大致獲得約 35% 效能提昇(平均 82cycle -> 53cycle)