2017q1 Homework4 (clz)

contributed by <xdennisx>

Count Leading Zero

recursive version
iteration version
binary search technique
byte-shift version
Harley’s algorithm

一開始 make 的時候發現沒有 install-git-hooks 所以就把之前的複製貼上

加了發現大大增加 commit 的難度= =

原本的 for loop 會移動到一次最多 32 bit，這是不合法的


for(uint32_t j=(1<<i);j<(1<<(i+1));j++)

所以我改了一下


for (uint32_t j = (1 << i); j < (1 << ((i + 1) == 32 ? 31 : (i+1))); j++)

calculate.c 的兩個 FILE 好像也是錯的


FILE *fptr = fopen("iteration.txt","r");
FILE *output = fopen("output.txt","w");

我也改了一下


FILE *fp = fopen("iteration.txt","r");
//FILE *output = fopen("output.txt","w");

main.c 裡面的 output file 沒有初始化


FILE *output;

改成


FILE *output = NULL;

終於給我 commit 了!!!

先把原始圖的輸出

分析：

recursive 演算法會有規率性的凸起
harley、binary、byte 較穩定
iteration 後面表現比前面好

recursive version














#include "clz2.h"

static const int mask[]={0,8,12,14};
static const int magic[]={2,1,0,0};

unsigned clz2(uint32_t x,int c)
{
    if (!x && !c) return 32;

    uint32_t upper = (x >> (16>>c)); 
    uint32_t lower = (x & (0xFFFF>>mask[c]));
    if (c == 3) return upper ? magic[upper] : 2 + magic[lower];
    return upper ? clz2(upper, c + 1) : (16 >> (c)) + clz2(lower, c + 1);
}

原理：

C 從 0~3 代入，一開始 upper 跟 lower 分別會把 32 bit 拆成 16+16 個 bit，之後就看 upper 是否為 0，不是 0 的話就只要把 upper 代入下一層 recursive，下一層就會再把 upper 有效的那些 bit 再拆成一半，到 C==3 的時候終止。如果 upper 等於 0，就只要把 lower 代入下一層 recursive，算出來的答案再加上當層 upper 的 bit 數就是答案。

iteration version











include "clz.h"

unsigned clz(uint32_t x) {
    int n = 32, c = 16; 
    do {
        uint32_t y = x >> c;
        if (y) { n -= c; x = y; }
        c >>= 1;
    } while (c);
    return (n - x); 
}

原理：

c 是切割的 bit 數，y 是類似 upper 的東西，如果 y 等於 1，就把要看的 bit 數縮小到 (n-c) 的範圍，再進入下一次 loop 的時候就只要看前面 y 個 bit 就好，最後再看首位是不是 0。

binary search technique











#include "clz.h"
unsigned clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 0;
    if (x <= 0x0000FFFF) { n += 16; x <<= 16; }
    if (x <= 0x00FFFFFF) { n += 8; x <<= 8; }
    if (x <= 0x0FFFFFFF) { n += 4; x <<= 4; }
    if (x <= 0x3FFFFFFF) { n += 2; x <<= 2; }
    if (x <= 0x7FFFFFFF) { n += 1; x <<= 1; }
    return n;
}

原理：

if 裡面的判斷式都是把目前在看的有效 bit 數砍半，例如 x <= 0x0000FFFF 就是判斷前16 bit 是不是 0，如果是都是 0，就把 n+16，接著繼續判斷後面 16 bit。如果不是 0，就把前 16 bit 的前面一半的 bit 判斷是否為 0，一直判斷到第一位 bit。

byte-shift version











#include "clz.h"
unsigned clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 1;
    if ((x >> 16) == 0) { n += 16; x <<= 16; }
    if ((x >> 24) == 0) { n += 8; x <<= 8; }
    if ((x >> 28) == 0) { n += 4; x <<= 4; }
    if ((x >> 30) == 0) { n += 2; x <<= 2; }
    n = n - (x >> 31);
    return n;
}

原理：

這也是一直把 32 bit 一直砍半的原理，if ((x >> 16) == 0) { n += 16; x <<= 16; } 意思就是如果前 16 bit 都是 0，那我答案就先加這 16 bit，之後就把這 16 個 0 踢掉不看，再來就看後面 16 bit 的一半，以此類推，最後 x >> 31 就是看第一位是不是 0 就是答案。

Harley’s algorithm






























#include "clz.h"

unsigned clz(uint32_t x)
{
    static uint8_t const Table[] = { 
      0xFF, 0, 0xFF, 15, 0xFF, 1, 28, 0xFF,
      16, 0xFF, 0xFF, 0xFF, 2, 21, 29, 0xFF,
      0xFF, 0xFF, 19, 17, 10, 0xFF, 12, 0xFF,
      0xFF, 3, 0xFF, 6, 0xFF, 22, 30, 0xFF,
      14, 0xFF, 27, 0xFF, 0xFF, 0xFF, 20, 0xFF,
      18, 9, 11, 0xFF, 5, 0xFF, 0xFF, 13, 
      26, 0xFF, 0xFF, 8, 0xFF, 4, 0xFF, 25, 
      0xFF, 7, 24, 0xFF, 23, 0xFF, 31, 0xFF,
    };  

    /* Propagate leftmost 1-bit to the right */
    x = x | (x >> 1); 
    x = x | (x >> 2); 
    x = x | (x >> 4); 
    x = x | (x >> 8); 
    x = x | (x >> 16);
 
    /* x = x * 0x6EB14F9 */
    x = (x << 3) - x;   /* Multiply by 7. */
    x = (x << 8) - x;   /* Multiply by 255. */
    x = (x << 8) - x;   /* Again. */
    x = (x << 8) - x;   /* Again. */

    return Table[x >> 26];
}

原理：

理解ing

_Generic

一個類似 C++ Template 的東西，C11 以 macro 為基礎的 type-generic functions，主要透過 _Generic 關鍵字。














include <stdio.h>
void funci(int x) { printf("func value = %d\n", x); }
void funcc(char c) { printf("func char = %c\n", c); }
void funcdef(double v) { printf("Def func's value = %lf\n", v); }
#define func(X) \
    _Generic((X), \
        int: funci, char: funcc, default: funcdef \
    )(X)
int main() {
        func(1);
        func('a');
        func(1.3);
        return 0;
}

輸出結果：

func value = 1
func value = 97
Def func's value = 1.300000

也可以套用在 printf 上




















#define printf_dec_format(x) _Generic((x), \
    char: "%c", \
    signed char: "%hhd", \
    unsigned char: "%hhu", \
    signed short: "%hd", \
    unsigned short: "%hu", \
    signed int: "%d", \
    unsigned int: "%u", \
    long int: "%ld", \
    unsigned long int: "%lu", \
    long long int: "%lld", \
    unsigned long long int: "%llu", \
    float: "%f", \
    double: "%f", \
    long double: "%Lf", \
    char *: "%s", \
    void *: "%p")
 
#define print(x) printf(printf_dec_format(x), x)
#define printnl(x) printf(printf_dec_format(x), x), printf("\n");

輸出結果：

    printnl('a');    // prints "97" (on an ASCII system)
    printnl((char)'a');  // prints "a"
    printnl(123);    // prints "123"
    printnl(1.234);      // prints "1.234000"

在這邊 'a' 是 int，如果要讓他已字元方式輸出需要強制 cast 他

所以我把 printf 改掉


printf("%d %lu cycles\n", i, timecall / 100);

改成



print(i);
print(timecall / 100);
printnl("cycles");

改成這樣的優點就是不會在 format 的時候打錯對應的符號，造成輸出誤差

Modularize the build system

運用 % 來簡化 Makefile 的內容，把產生執行檔的方式改寫一下

clz_%: %_method.o %.c
	$(CC) $(CFLAGS_common) $? -o $@

%_method.o: $(SRCS_common)
	$(CC) -c $(CFLAGS_common) $< -D$(shell echo $(subst _method.o,,$@)) -o $@

這樣以後就不用為了產生新的執行檔就要再多一條 rule

clz 的應用場合

cpython 在 fractions 計算浮點的時候
壓縮 data，把前面一串 0 變成一個不為 0 的數字
利用 clz(x − y) >> 5 去計算 32 bit x == y 是不是對的，是 0 的話就是錯的，是 1 的話就是對的(下面網站寫錯了)