tundergod
hw1
2016q3
contributed by <tundergod
>
int clz1()
{
n=0;
if (x == 0) return 32;
for (n = 0; ((x & 0x80000000) == 0); n++, x <<= 1);
return n;
}
int clz2()
{
n=0;
if (x == 0) return 32;
for (n = 0; ((x & 0xF0000000) == 0); n += 4, x <<= 4);
n += (int)clz_table_4bit[x >> (32-4)];
return n;
}
int clz3()
{
if (x == 0) return(32);
n = 0;
if (x & 0x0000FFFF) {n = n +16; x = x <<16;}
if (x & 0x00FFFFFF) {n = n + 8; x = x << 8;}
if (x & 0x0FFFFFFF) {n = n + 4; x = x << 4;}
if (x & 0x3FFFFFFF) {n = n + 2; x = x << 2;}
if (x & 0x7FFFFFFF) {n = n + 1;}
return n;
}
int clz4()
{
n=0;
if ((x & 0xFFFF0000) == 0) {n = 16; x <<= 16;} else {n = 0;}
if ((x & 0xFF000000) == 0) {n += 8; x <<= 8;}
if ((x & 0xF0000000) == 0) {n += 4; x <<= 4;}
n += (int)clz_table_4bit[x >> (32-4)];
return n;
}
int clz5()
{
n=0;
if ((x & 0xFFFF0000) == 0)
return (int)clz5_table_16bit[x] + 16;
else
return (int)clz5_table_16bit[x >> 16];
}
static uint8_t clz5_table_16bit[65536] =
{16,15,14,14,13,13,13,13,12,12,12,12,..........
..................................0,0,0,0,0,0};
for i in `seq 160000 100000 20000000`;
可以看到無論clz所求值爲多少,執行時間幾乎不變.
而意外的是最簡單的單位元位移算法竟然是最快的lim wen sheng
uint8_t clz(uint32_t x,int half)
{
if(x == 0) return 32;
//end recursive after fifth
if(half == 0) return 0;
/* shift upper half down, rest is filled up with 0s */
uint16_t upper = (x >> half);
// mask upper half away
uint16_t lower = (x & (0x0000FFFF >> (16 - half)));
//if upper have value move it to lower else ans+16 exe recursive for another half
return upper ? clz(upper,half>>1) : (half) + clz(lower,half>>1);
}
uint8_t clz(uint32_t x)
{
int n = 0;
if (x == 0) return 32;
//every iterative left shift 1 bit if val = 0, ans++
for (n = 0; ((x & 0x80000000) == 0); n++, x <<= 1);
return n;
}
uint8_t clz(uint32_t x)
{
int n = 0;
if (x == 0) return(32);
//32 -> 16
if (x & 0x0000FFFF) {n = n +16; x = x << 16;}
//16 -> 8
if (x & 0x00FFFFFF) {n = n + 8; x = x << 8;}
//8 -> 4
if (x & 0x0FFFFFFF) {n = n + 4; x = x << 4;}
//4 -> 2
if (x & 0x3FFFFFFF) {n = n + 2; x = x << 2;}
//2 -> 1
if (x & 0x7FFFFFFF) {n = n + 1;}
return n;
}
參考學長筆記看到的版本都長這樣但是不知道爲什麼這是byte-shiftlim wen sheng
unsigned clz(uint32_t x)
{
if (x == 0) return 32;
int n = 1;
if ((x >> 16) == 0) { n += 16; x <<= 16; }
if ((x >> 24) == 0) { n += 8; x <<= 8; }
if ((x >> 28) == 0) { n += 4; x <<= 4; }
if ((x >> 30) == 0) { n += 2; x <<= 2; }
n = n - (x >> 31);
return n;
}
在上述網站可以看到6種類似Harley's Algorithm的做法,原理還在研究中…lim wen sheng
#include <stdio.h>
#include <stdint.h>
uint8_t clz(uint32_t x)
{
static unsigned char table [48] = {
32, 6, 5, 0, 4, 12, 0, 20,
15, 3, 11, 0, 0, 18, 25, 31,
8, 14, 2, 0, 10, 0, 0, 0,
0, 0, 0, 21, 0, 0, 19, 26,
7, 0, 13, 0, 16, 1, 22, 27,
9, 0, 17, 23, 28, 24, 29, 30
};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x & ~(x >> 16);
x = x * 0x3EF5D037;
return table[x >> 26];
}
recursive version
iteration version
binary search technique
byte-shift version
Harley’s algorithm
Compare All: