Try   HackMD

2016q3 Homework1 (clz)

Reviewed by HaoTse

  • 尚未實作 Harley's Algorithm
  • 沒有看到驗證正確性的結果輸出
  • 沒有對效能比較圖做分析
  • 缺少提出 clz 的應用
  • Initial commit 看不出具體做了哪些事

各版本分析

iterative

int clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return (n - x); }
int clz(uint32_t x) { if (x == 0) return 32; int n = 0; if (x <= 0x0000FFFF) { n += 16; x <<= 16; } if (x <= 0x00FFFFFF) { n += 8; x <<= 8; } if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; } if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; } if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; } return n; }

利用二分法的技巧,32-bit 需要

log232=5 次,64-bit 則是
log264=6
次。

byte shift

int clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } n = n - (x >> 31); return n; }

recursive

作業說明中的 recursive 並沒有指定終止條件,而且 shift offset 和 mask 不應該是固定的。

int clz(uint32_t x) { /* shift upper half down, rest is filled up with 0s */ uint16_t upper = (x >> 16); // mask upper half away uint16_t lower = (x & 0xFFFF); return upper ? clz(upper) : 16 + clz(lower); }

修正後變成

static uint32_t offset[] = { 16, 8, 4, 2, 1 }; static uint32_t mask[] = { 0xFFFF, 0x00FF, 0x000F, 0x0003, 0x0001 }; static int clz_helper(uint32_t x, int i) { /* shift upper half down, rest is filled up with 0s */ uint16_t upper = (x >> offset[i]); // mask upper half away uint16_t lower = (x & mask[i]); if (i == 4) return !upper; return upper ? clz_helper(upper, i + 1) : offset[i] + clz_helper(lower, i + 1); } int clz(uint32_t x) { if (!x) return 32; clz_helper(x, 0); }

Harley

#define u 99 int clz(uint32_t x) { static uint8_t const Table[] = { 32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u }; /* Propagate leftmost 1-bit to the right */ x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); /* x = x * 0x6EB14F9 */ x = (x << 3) - x; /* Multiply by 7. */ x = (x << 8) - x; /* Multiply by 255. */ x = (x << 8) - x; /* Again. */ x = (x << 8) - x; /* Again. */ return Table[x >> 26]; }

若要計算 CLZ,原本的表格應該修正成上面的表格才對。
Harley 的原理還要在研究一下

測試方式

驗證正確性

走訪全部的 32-bit 數值,為了加速執行時間,所以使用了 OpenMP。

if (clz(0) != 32) { printf("Error on 0\n"); } #pragma omp parallel for for (uint32_t i = 1; i <= 0xFFFFFFFE; ++i) { if (clz(i) != __builtin_clz(i)) { printf("Error on %u\n", i); } }

效能比較

參考資料