contributed by <jayfeng0225
>
reviewed by <kobeyu
>
閱讀 重新理解數值 裡頭關於 count leading zero (clz) 的描述,設計實驗,比較 32-bit 數值對以下實做的效能差異:
OS : Ubuntu 16.04 LTS
CPU : Intel® Core™ i3-2350M @ 2.3GHz
Cache :
int clz(uint32_t x) {
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) { n -= c; x = y; }
c >>= 1;
} while (c);
return (n - x);
}
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 0;
if (x <= 0x0000FFFF) { n += 16; x <<= 16; }
if (x <= 0x00FFFFFF) { n += 8; x <<= 8; }
if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; }
if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; }
if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; }
return n;
}
int clz(uint32_t x) {
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) { n += 16; x <<= 16; }
if ((x >> 24) == 0) { n += 8; x <<= 8; }
if ((x >> 28) == 0) { n += 4; x <<= 4; }
if ((x >> 30) == 0) { n += 2; x <<= 2; }
n = n - (x >> 31);
return n;
}
首先驗證基本的三個方法的正確性:
接著利用clock_gettime()來紀錄三種方法的執行時間,並且將其繪製成點狀分佈圖。
因為電腦設備較舊,要輸出所有整數的執行結果需要花費很多時間,因此目前僅僅列出執行至 200000的結果。
這部份有先參考作業區其他的同學的結果,所以思考比較快。
Recursive需要修改的問題為
沒有終止條件
shift bit與mask需要隨著遞迴改變
uint8_t clz(uint32_t x)
{
/* shift upper half down, rest is filled up with 0s */
uint16_t upper = (x >> 16);
// mask upper half away
uint16_t lower = (x & 0xFFFF);
return upper ? clz(upper) : 16 + clz(lower);
}
int clz_recursive(uint32_t x,uint32_t shift , uint32_t mask) {
if(x == 0) return 32;
if(shift == 1)
return !(x>>1);
/* shift upper half down, rest is filled up with 0s */
uint16_t upper = (x >> shift);
// mask upper half away
uint16_t lower = (x & mask);
uint32_t shift_temp = shift >> 1;
return upper ? clz_recursive(upper, shift_temp , mask >> shift_temp) :
shift + clz_recursive(lower, shift_temp, mask >> shift_temp );
}
驗證結果我使用原來的iteration版本將1~50000的答案輸出到檔案result,並且執行recursive版本輸出到檔案result_recu。最後在使用vimdiff檢查兩者是否有差異,而結果是輸出相同。
原來提供的harley algorithm的程式碼是ctz的結果(也就是最後面有幾個0),根據workfunction筆記內網站所提到的,要修改成CLZ的版本需要改變Table的值。
#define u 99
static char table[64] =
{
32,31, u,16, u,30, 3, u, 15, u, u, u,29,
10, 2, u, u, u,12,14,21, u,19, u, u,28,
u,25, u, 9, 1, u, 17, u, 4, u, u,
u,11, u, 13,22,20, u,26, u, u,18, 5, u,
u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u
};
其中u代表不會使用到的值,因此將其定義為99
uint8_t clz_harley(uint32_t x)
{
static char table[64] =
{
32,31, u,16, u,30, 3, u, 15, u, u, u,29,
10, 2, u, u, u,12,14,21, u,19, u, u,28,
u,25, u, 9, 1, u, 17, u, 4, u, u,
u,11, u, 13,22,20, u,26, u, u,18, 5, u,
u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u
};
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return Table[x >> 26];
}
驗證結果同樣輸出1~50000的結果,並且儲存在result_har檔案中再使用vimdiff做比較。結果是輸出相同,因此結果也是正確。
jayfeng0225