contributed by <claaaaassic
>
yangyang95
>-c
(cpu-list) 的 tag-O0
來抑制最佳化首先先設計實現,比較五種 clz 的效能
這邊直接取出 clz-test 裡面的程式來用
#include "clz2.h"
static const int mask[]={0,8,12,14};
static const int magic[]={2,1,0,0};
unsigned clz2(uint32_t x,int c)
{
if (!x && !c) return 32;
uint32_t upper = (x >> (16>>c));
uint32_t lower = (x & (0xFFFF>>mask[c]));
if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}
參考重新理解數值
int clz(uint32_t x) {
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) { n -= c; x = y; }
c >>= 1;
} while (c);
return (n - x);
}
這邊解釋一下怎麼運作的
n 表示有幾個 0,一開始假設全部都為 0 所以 n = 32,
之後從最低位元用 216 28 24 22 21 去逼近最高位數,如果逼近到正確就 (n - c) 然後把過濾過的位數扣掉
應該有更好的講法但是我講不出來 QQ
參考 重新理解數值
這跟 iteration version 原理一樣,只是把 loop 展開然後 n 用加的
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 0;
if (x <= 0x0000FFFF) { n += 16; x <<= 16; }
if (x <= 0x00FFFFFF) { n += 8; x <<= 8; }
if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; }
if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; }
if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; }
return n;
}
參考 重新理解數值
這也跟 binary search version 原理ㄩ一樣
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) { n += 16; x <<= 16; }
if ((x >> 24) == 0) { n += 8; x <<= 8; }
if ((x >> 28) == 0) { n += 4; x <<= 4; }
if ((x >> 30) == 0) { n += 2; x <<= 2; }
n = n - (x >> 31);
return n;
}
#include "clz.h"
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
// CTZ table
#ifdef CTZ
static uint8_t const Table[] = {
0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
};
// CLZ table
#else
static uint8_t const Table[] ={
32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0,
0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0,
17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18,
5, 0, 0,23, 0,27, 0, 6, 0,24, 7, 0, 8, 0, 0, 0
};
#endif
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return Table[(x >> 26)];
}
在 Makefile 裡面看到一個沒有看過的指令 taskset
,可以指定在特定的 cpu 核心上執行程式,這邊是指定他在 pid 介於 67100000 67116384 的 process 各執行一次 ( 這裡還沒弄很清楚 )
run: $(EXEC)
taskset -c 1 ./iteration 67100000 67116384
這邊我覺得 binary search technique 與 bite-shift version 效能會最好,剩下三個還不確定,結果是 recursive > harley > interation > binary = byte
yerrorlines
更容易呈現 jitter接著試用裡面的驗證方法 make run PROFILE=1
./iteration
executiom time : 98031747727 cycles
./binary
executiom time : 30276750449 cycles
./byte
executiom time : 30831086507 cycles
./recursive
executiom time : 161100579230 cycles
./harley
executiom time : 66601037786 cycles
使用 $ make run MP=1
__builtin_clzll
實作之 log2
#define LOG2(X) ((unsigned) \
(8 * sizeof (unsigned long long) - \
__builtin_clzll((X)) - 1))
根據 clone 下來的 clz-test 裡面有找到這段程式,他只能計算整數的 log2 ,而且在計算結果為負數時會因為回傳的型態為 unsigned 所以會錯誤