# 2017q3 Homework(clz)
contributed by < `kevin55216` >
## 主要測試平台
```
OS: ubuntu 16.04 LTS AMD64
RAM 5.7GB
CPU i5-2410M 2.30GHz 2C4T
Disk 52.8GB
```
## 次要測試平台
```
OS: ubuntu 16.04 LTS AMD64
RAM: 31.4GB
CPU: i7-4790k 4.0GHz 4C8T
Disk: 212.3GB
```
## 比較在各個值域的表現

clz output => 32 - clzoutput
這邊我做了0x00000001、0x00000002、0x000000004到0x80000000各100000次的統計。
會選擇這樣的統計方法是考慮到 $0\ to\ 2^{32}-1$ 需要許多的執行時間且在低值域執行數量過少

iteration in 67100000 to 67116000

byte in 67100000 to 67116000

binary in 67100000 to 67116000

recursive in 67100000 to 67116000

harley in 67100000 to 67116000
由上面五張圖可以看出recursive穩定度最差且時間表現也最差。

上圖 clz 結果為 32 到 18

上圖 clz 結果為 6 到 5

上圖 clz 結果為 1
```
Performance counter stats for './recursive 67100000 67116384':
1,5165 cache-misses # 1.549 % of all cache refs
97,8706 cache-references
4,7521,1287 instructions # 0.37 insn per cycle
12,6822,3770 cycles
0.301447319 seconds time elapsed
Performance counter stats for './harley 67100000 67116384':
1,9492 cache-misses # 2.115 % of all cache refs
92,1710 cache-references
2,3851,1222 instructions # 0.17 insn per cycle
13,6648,7269 cycles
0.327269713 seconds time elapsed
Performance counter stats for './iteration 67100000 67116384':
1,0984 cache-misses # 1.141 % of all cache refs
96,2360 cache-references
3,0070,0970 instructions # 0.24 insn per cycle
12,5552,9385 cycles
0.298588728 seconds time elapsed
Performance counter stats for './binary 67100000 67116384':
1,2518 cache-misses # 1.450 % of all cache refs
86,3076 cache-references
2,1702,6129 instructions # 0.18 insn per cycle
12,0022,9473 cycles
0.284078212 seconds time elapsed
Performance counter stats for './byte 67100000 67116384':
9805 cache-misses # 1.053 % of all cache refs
93,0713 cache-references
2,3360,0979 instructions # 0.19 insn per cycle
12,0301,5914 cycles
0.286465763 seconds time elapsed
```
發現執行時間為$recursive>iteration=harley>binary=byte$
## recursive
```clike=
static const int mask[] = { 0, 8, 12,14 };
static const int magic[] = { 2, 1, 0, 0 };
unsigned clz2(uint32_t x,int c)
{
if (!x && !c) return 32;
uint32_t upper = (x >> (16 >> c));
uint32_t lower = (x & (0xFFFF >> mask[c]));
if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}
```
以6為例
```
clz(6, 0)
x = 6 => 00000000000000000000000000000110
upper = x >> 16 = 0
lower = x & (0xFFFF >> 0) = 6
return 0 ? clz2(0, 1) : (16 >> (0)) + clz2(6, 1)
=return 16 + clz2(6, 1)
----------------------------------------------------------------------------
clz(6, 1)
upper = x >> (16 >> 1) = x >> 8 = 0
lower = x & (0xFFFF >> 8) = 6
return 0 ? clz2(0, 2) : (16 >> (1)) + clz2(6, 2)
=return 8 + clz2(6, 2)
----------------------------------------------------------------------------
clz(6, 2)
upper=x >> 4 = 0
lower=x & (0xFFFF >> 12) = 6
return 0 ? clz2(0, 3) : (16 >> (2)) + clz2(6, 3)
= return 4 + clz2(6, 3)
----------------------------------------------------------------------------
clz2(6, 3)
upper = x >> 2 = 1
lower = x & (0xFFFF >> 14) = 2
if(c == 3)return 1 ? magic[1] : 2+magic[2]
= return 1
----------------------------------------------------------------------------
output => 1 + 4 + 8 + 16 = 29
```
以0x7FFFFFFF為例
```
clz(0x7FFFFFFF, 0)
x = 0x7FFFFFFF => 01111111111111111111111111111111
upper = x >> 16 = 0x7FFF
lower=x&(0xFFFF>>0)=0xFFFF
return 0x7FFF ? clz2(0x7FFF, 1) : (16 >> (0)) + clz2(0xFFFF, 1)
=return clz2(0x7FFF, 1)
----------------------------------------------------------------------------
clz(0x7FFF,1)
upper = x >> (16 >> 1) = x >> 8 = 0x7F
return clz2(0x7F, 2)
----------------------------------------------------------------------------
clz(0x7F, 2)
upper = x >> 4 = 7
return clz2(0x7, 3)
----------------------------------------------------------------------------
clz2(7, 3)
upper = x >> 2 = 1
lower = x&(0xFFFF >> 14) = 3
if(c == 3)return 1?magic[1]:2+magic[3];
= return 1
----------------------------------------------------------------------------
output => 1
```
結論:recursive 一定要作到c=3才會得出解。
## byte
```clike=
unsigned clz(uint32_t x)
{
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) {
n += 16;
x <<= 16;
}
if ((x >> 24) == 0) {
n += 8;
x <<= 8;
}
if ((x >> 28) == 0) {
n += 4;
x <<= 4;
}
if ((x >> 30) == 0) {
n += 2;
x <<= 2;
}
n = n - (x >> 31);
return n;
}
```
以6為例
```
clz(6)
6 = 110
n = 1
if(6 >> 16 == 0)
n = 16 + 1 = 17
x = 1100000000000000000
if(1100000000000000000 >> 24 == 0)
n = 8+17=25
x = 110000000000000000000000000
if(110000000000000000000000000 >> 28 == 0)
n = 4+25=29
x = 1100000000000000000000000000000
if(1100000000000000000000000000000 >> 30 == 0) => if(1 == 0)
n = 29 - 0
return 29
```
0x7FFFFFFF
```
clz(0x7FFFFFFF)
n=1
if ((x >> 16) == 0)
if ((x >> 24) == 0)
if ((x >> 28) == 0)
if ((x >> 30) == 0)
1 = 1 - (x >> 31) = 1 - 0 = 1
return 1;
```
## harley
```clike=
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
#ifdef CTZ
static uint8_t const Table[] = {
0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
};
#else
static uint8_t const Table[] = {
32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0,
0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0,
17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18,
5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0
};
#endif
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return Table[(x >> 26)];
}
```
以0x40000000
```
0x40000000=
01000000000000000000000000000000
--------------------------------------------------------------------
step 1 :
01100000000000000000000000000000 (x | (x >> 1))
01111000000000000000000000000000 (x | (x >> 2))
01111111100000000000000000000000 (x | (x >> 4))
01111111111111111000000000000000 (x | (x >> 8))
01111111111111111111111111111111 (x | (x >> 16))
step 2 :
--------------------------------------------------------------------
11111111111111111111111111111000 (x << 3)
- 01111111111111111111111111111111 (- x)
= 01111111111111111111111111111001
--------------------------------------------------------------------
11111111111111111111100100000000 (x << 8)
- 01111111111111111111111111111001 (- x)
= 01111111111111111111100100000111
--------------------------------------------------------------------
11111111111110010000011100000000 (x << 8)
- 01111111111111111111100100000111 (- x)
= 01111111111111111111111111111001
--------------------------------------------------------------------
11111111111111111111100100000000 (x << 8)
- 01111111111111111111111111111001 (- x)
= 01111111111111111111100100000111
--------------------------------------------------------------------
x >> 26 = 011111 = 31
table[31] = 1
```
$for\ all\ x\ [0:OxFFFFFFFF]\ after\ step\ 1,\ x\ will\ only\ be\ 2^i-1\ where\ i\ [0:32]\ so\\ if\ we\ can\ hash\ 2^i-1\ in\ to\ a\ table\ we\ can\ get\ clz$
## 試圖改良(失敗)
```clike=
static const int mask[] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
unsigned clz(uint32_t x)
{
if (!x) return 32;
int n = 1;
if (!(x >> 16)) {
n += 16;
x <<= 16;
}
if (!(x >> 24)) {
n += 8;
x <<= 8;
}
n += mask[x >> 28];
return n;
}
```

看起來load的速度是沒辦法加速
## 結論
recursive 需要做三次recursive,harley 計算完後需要去做查表,byte 做完計算直接得出結
我們可以看到雙b效能最好、recursive 效能最差。
執行時間為$recursive>iteration=harley>binary=byte$
binary 和 byte 演算法 甚至 recursive 也已經作到 $Olg(n)$ 了,我認為要加速除非把值做hash selection來達到$O(1)$的時間複雜度,harley 雖然也有製作hash table 但是前面再做
補 1 的動作就用$Olg(n)$了