2017q3 Homework1 (clz)
=====
contributed by <`loolofen`>
## 開發環境
````
$ lscpu
Architecture: i686
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 1
Core(s) per socket: 4
Socket(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 60
Model name: Intel(R) Core(TM) i5-4460 CPU @ 3.20GHz
Stepping: 3
CPU MHz: 3200.625
CPU max MHz: 3400.0000
CPU min MHz: 800.0000
BogoMIPS: 6385.15
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 6144K
````
## 分析
### Iteration version
```clike=
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return (n - x);
}
```
直接代入數值觀測:
若有一數值 x = 0x0000abcd,依序執行
* x=0x0000abcd,y=0x00000000,c=8,n=32
* x=0x000000ab,y=0x000000ab,c=4,n=24
* x=0x0000000a,y=0x000000ab,c=2,n=20
* x=0x00000002,y=0x00000002,c=1,n=18
* x=0x00000001,y=0x00000001,c=0,n=17
* return (17 - 1)
此效果相當於先把數值分為兩半,若左半部有大於 0 之數字則往左半部計算,否則往右半部進行計算,持續此計算直到 $x = 1 或 0$。並回傳 leading zero 的值
### Binary Search version
```clike=
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
if (x == 0) return 32;
int n = 0;
if (x <= 0x0000FFFF) {
n += 16;
x <<= 16;
}
if (x <= 0x00FFFFFF) {
n += 8;
x <<= 8;
}
if (x <= 0x0FFFFFFF) {
n += 4;
x <<= 4;
}
if (x <= 0x3FFFFFFF) {
n += 2;
x <<= 2;
}
return n;
}
```
原理和 Iteration version 一樣
### Byte Shift version
```clike=
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) {
n += 16;
x <<= 16;
}
if ((x >> 24) == 0) {
n += 8;
x <<= 8;
}
if ((x >> 28) == 0) {
n += 4;
x <<= 4;
}
if ((x >> 30) == 0) {
n += 2;
x <<= 2;
}
return n;
}
```
原理和 Iteration version 一樣
### Recursive version
```clike
static const int mask[] = { 0, 8, 12,14 };
static const int magic[] = { 2, 1, 0, 0 };
unsigned clz2(uint32_t x,int c)
{
if (!x && !c) return 32;
uint32_t upper = (x >> (16 >> c));
uint32_t lower = (x & (0xFFFF >> mask[c]));
if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}
```
原理和 Iteration version 一樣
### Harley’s Algorithm version
```clike
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
#ifdef CTZ
static uint8_t const Table[] = {
0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
};
#else
static uint8_t const Table[] = {
32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0,
0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0,
17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18,
5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0
};
#endif
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return Table[(x >> 26)];
}
```
把 leading zero 後的位元全部補為 1
再查表得到 leading zero 的數量
## 程式執行
由下圖可以看出 byte-shift 跟 binary search 是最快速的,接著是 iteration,byharley,最後是 recursive
![](https://i.imgur.com/1owk5hk.png)