contributed by < kevin55216
>
OS: ubuntu 16.04 LTS AMD64
RAM 5.7GB
CPU i5-2410M 2.30GHz 2C4T
Disk 52.8GB
OS: ubuntu 16.04 LTS AMD64
RAM: 31.4GB
CPU: i7-4790k 4.0GHz 4C8T
Disk: 212.3GB
Performance counter stats for './recursive 67100000 67116384':
1,5165 cache-misses # 1.549 % of all cache refs
97,8706 cache-references
4,7521,1287 instructions # 0.37 insn per cycle
12,6822,3770 cycles
0.301447319 seconds time elapsed
Performance counter stats for './harley 67100000 67116384':
1,9492 cache-misses # 2.115 % of all cache refs
92,1710 cache-references
2,3851,1222 instructions # 0.17 insn per cycle
13,6648,7269 cycles
0.327269713 seconds time elapsed
Performance counter stats for './iteration 67100000 67116384':
1,0984 cache-misses # 1.141 % of all cache refs
96,2360 cache-references
3,0070,0970 instructions # 0.24 insn per cycle
12,5552,9385 cycles
0.298588728 seconds time elapsed
Performance counter stats for './binary 67100000 67116384':
1,2518 cache-misses # 1.450 % of all cache refs
86,3076 cache-references
2,1702,6129 instructions # 0.18 insn per cycle
12,0022,9473 cycles
0.284078212 seconds time elapsed
Performance counter stats for './byte 67100000 67116384':
9805 cache-misses # 1.053 % of all cache refs
93,0713 cache-references
2,3360,0979 instructions # 0.19 insn per cycle
12,0301,5914 cycles
0.286465763 seconds time elapsed
發現執行時間為
static const int mask[] = { 0, 8, 12,14 };
static const int magic[] = { 2, 1, 0, 0 };
unsigned clz2(uint32_t x,int c)
{
if (!x && !c) return 32;
uint32_t upper = (x >> (16 >> c));
uint32_t lower = (x & (0xFFFF >> mask[c]));
if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}
以6為例
clz(6, 0)
x = 6 => 00000000000000000000000000000110
upper = x >> 16 = 0
lower = x & (0xFFFF >> 0) = 6
return 0 ? clz2(0, 1) : (16 >> (0)) + clz2(6, 1)
=return 16 + clz2(6, 1)
----------------------------------------------------------------------------
clz(6, 1)
upper = x >> (16 >> 1) = x >> 8 = 0
lower = x & (0xFFFF >> 8) = 6
return 0 ? clz2(0, 2) : (16 >> (1)) + clz2(6, 2)
=return 8 + clz2(6, 2)
----------------------------------------------------------------------------
clz(6, 2)
upper=x >> 4 = 0
lower=x & (0xFFFF >> 12) = 6
return 0 ? clz2(0, 3) : (16 >> (2)) + clz2(6, 3)
= return 4 + clz2(6, 3)
----------------------------------------------------------------------------
clz2(6, 3)
upper = x >> 2 = 1
lower = x & (0xFFFF >> 14) = 2
if(c == 3)return 1 ? magic[1] : 2+magic[2]
= return 1
----------------------------------------------------------------------------
output => 1 + 4 + 8 + 16 = 29
以0x7FFFFFFF為例
clz(0x7FFFFFFF, 0)
x = 0x7FFFFFFF => 01111111111111111111111111111111
upper = x >> 16 = 0x7FFF
lower=x&(0xFFFF>>0)=0xFFFF
return 0x7FFF ? clz2(0x7FFF, 1) : (16 >> (0)) + clz2(0xFFFF, 1)
=return clz2(0x7FFF, 1)
----------------------------------------------------------------------------
clz(0x7FFF,1)
upper = x >> (16 >> 1) = x >> 8 = 0x7F
return clz2(0x7F, 2)
----------------------------------------------------------------------------
clz(0x7F, 2)
upper = x >> 4 = 7
return clz2(0x7, 3)
----------------------------------------------------------------------------
clz2(7, 3)
upper = x >> 2 = 1
lower = x&(0xFFFF >> 14) = 3
if(c == 3)return 1?magic[1]:2+magic[3];
= return 1
----------------------------------------------------------------------------
output => 1
結論:recursive 一定要作到c=3才會得出解。
unsigned clz(uint32_t x)
{
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) {
n += 16;
x <<= 16;
}
if ((x >> 24) == 0) {
n += 8;
x <<= 8;
}
if ((x >> 28) == 0) {
n += 4;
x <<= 4;
}
if ((x >> 30) == 0) {
n += 2;
x <<= 2;
}
n = n - (x >> 31);
return n;
}
以6為例
clz(6)
6 = 110
n = 1
if(6 >> 16 == 0)
n = 16 + 1 = 17
x = 1100000000000000000
if(1100000000000000000 >> 24 == 0)
n = 8+17=25
x = 110000000000000000000000000
if(110000000000000000000000000 >> 28 == 0)
n = 4+25=29
x = 1100000000000000000000000000000
if(1100000000000000000000000000000 >> 30 == 0) => if(1 == 0)
n = 29 - 0
return 29
0x7FFFFFFF
clz(0x7FFFFFFF)
n=1
if ((x >> 16) == 0)
if ((x >> 24) == 0)
if ((x >> 28) == 0)
if ((x >> 30) == 0)
1 = 1 - (x >> 31) = 1 - 0 = 1
return 1;
static inline __attribute((always_inline))
unsigned clz(uint32_t x)
{
#ifdef CTZ
static uint8_t const Table[] = {
0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13,
26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25,
0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
};
#else
static uint8_t const Table[] = {
32,31, 0,16, 0,30, 3, 0,15, 0, 0, 0,29,10, 2, 0,
0, 0,12,14,21, 0,19, 0, 0,28, 0,25, 0, 9, 1, 0,
17, 0, 4, 0, 0, 0,11, 0,13,22,20, 0,26, 0, 0,18,
5, 0, 0,23, 0,27, 0, 6,0,24, 7, 0, 8, 0, 0, 0
};
#endif
/* Propagate leftmost 1-bit to the right */
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
/* x = x * 0x6EB14F9 */
x = (x << 3) - x; /* Multiply by 7. */
x = (x << 8) - x; /* Multiply by 255. */
x = (x << 8) - x; /* Again. */
x = (x << 8) - x; /* Again. */
return Table[(x >> 26)];
}
以0x40000000
0x40000000=
01000000000000000000000000000000
--------------------------------------------------------------------
step 1 :
01100000000000000000000000000000 (x | (x >> 1))
01111000000000000000000000000000 (x | (x >> 2))
01111111100000000000000000000000 (x | (x >> 4))
01111111111111111000000000000000 (x | (x >> 8))
01111111111111111111111111111111 (x | (x >> 16))
step 2 :
--------------------------------------------------------------------
11111111111111111111111111111000 (x << 3)
- 01111111111111111111111111111111 (- x)
= 01111111111111111111111111111001
--------------------------------------------------------------------
11111111111111111111100100000000 (x << 8)
- 01111111111111111111111111111001 (- x)
= 01111111111111111111100100000111
--------------------------------------------------------------------
11111111111110010000011100000000 (x << 8)
- 01111111111111111111100100000111 (- x)
= 01111111111111111111111111111001
--------------------------------------------------------------------
11111111111111111111100100000000 (x << 8)
- 01111111111111111111111111111001 (- x)
= 01111111111111111111100100000111
--------------------------------------------------------------------
x >> 26 = 011111 = 31
table[31] = 1
static const int mask[] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
unsigned clz(uint32_t x)
{
if (!x) return 32;
int n = 1;
if (!(x >> 16)) {
n += 16;
x <<= 16;
}
if (!(x >> 24)) {
n += 8;
x <<= 8;
}
n += mask[x >> 28];
return n;
}
看起來load的速度是沒辦法加速
recursive 需要做三次recursive,harley 計算完後需要去做查表,byte 做完計算直接得出結
我們可以看到雙b效能最好、recursive 效能最差。
執行時間為
binary 和 byte 演算法 甚至 recursive 也已經作到
補 1 的動作就用