Try   HackMD

2017q3 Homework1 (clz)

contributed by <loolofen>

開發環境

$ lscpu

Architecture:          i686
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                4
On-line CPU(s) list:   0-3
Thread(s) per core:    1
Core(s) per socket:    4
Socket(s):             1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 60
Model name:            Intel(R) Core(TM) i5-4460  CPU @ 3.20GHz
Stepping:              3
CPU MHz:               3200.625
CPU max MHz:           3400.0000
CPU min MHz:           800.0000
BogoMIPS:              6385.15
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              6144K

分析

Iteration version

static inline __attribute((always_inline)) unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return (n - x); }

直接代入數值觀測:
若有一數值 x = 0x0000abcd,依序執行

  • x=0x0000abcd,y=0x00000000,c=8,n=32
  • x=0x000000ab,y=0x000000ab,c=4,n=24
  • x=0x0000000a,y=0x000000ab,c=2,n=20
  • x=0x00000002,y=0x00000002,c=1,n=18
  • x=0x00000001,y=0x00000001,c=0,n=17
  • return (17 - 1)

此效果相當於先把數值分為兩半,若左半部有大於 0 之數字則往左半部計算,否則往右半部進行計算,持續此計算直到

x=10。並回傳 leading zero 的值

Binary Search version

static inline __attribute((always_inline)) unsigned clz(uint32_t x) { if (x == 0) return 32; int n = 0; if (x <= 0x0000FFFF) { n += 16; x <<= 16; } if (x <= 0x00FFFFFF) { n += 8; x <<= 8; } if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; } if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; } return n; }

原理和 Iteration version 一樣

Byte Shift version

static inline __attribute((always_inline)) unsigned clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } return n; }

原理和 Iteration version 一樣

Recursive version

static const int mask[] =  { 0, 8, 12,14 };
static const int magic[] = { 2, 1,  0, 0 };

unsigned clz2(uint32_t x,int c)
{
    if (!x && !c) return 32;

    uint32_t upper = (x >> (16 >> c));
    uint32_t lower = (x & (0xFFFF >> mask[c]));
    if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
    return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}

原理和 Iteration version 一樣

Harley’s Algorithm version

static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
#ifdef CTZ
    static uint8_t const Table[] = {
        0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
        16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
        0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
        0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
        14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
        18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
        26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
        0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
    };

#else
    static uint8_t const Table[] = {
        32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0,
        0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0,
        17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18,
        5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0
    };
#endif

    /* Propagate leftmost 1-bit to the right */
    x = x | (x >> 1);
    x = x | (x >> 2);
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >> 16);

    /* x = x * 0x6EB14F9 */
    x = (x << 3) - x;   /* Multiply by 7. */
    x = (x << 8) - x;   /* Multiply by 255. */
    x = (x << 8) - x;   /* Again. */
    x = (x << 8) - x;   /* Again. */

    return Table[(x >> 26)];
}


把 leading zero 後的位元全部補為 1
再查表得到 leading zero 的數量

程式執行

由下圖可以看出 byte-shift 跟 binary search 是最快速的,接著是 iteration,byharley,最後是 recursive