Try   HackMD

2022q1 Homework2 (quiz2)

contributed by < Wallmountain >

測驗 1

原始程式碼, 用以取得兩個無號整數的平均

#include <stdint.h>
uint32_t average(uint32_t a, uint32_t b)
{
    return (a + b) / 2;
}

改善 overflow 問題的版本

#include <stdint.h>
uint32_t average(uint32_t low, uint32_t high)
{
    return low + (high - low) / 2;
}

接著回答以下等價的實作

#include <stdint.h>
uint32_t average(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + (EXP1);
}

若不看 EXP1 的函式輸出

a b average result
1 1 0
3 4 3
4 4 4
5 11 7

會發現 a, b 最低位元因右移被捨棄, 因此必須要去判斷是否同時為1,若是, 則回傳值加1

  • EXP1 = a & b & 1

接著為下一個等價的實作

uint32_t average(uint32_t a, uint32_t b)
{
    return (EXP2) + ((EXP3) >> 1);
}

可分成兩個部分

  • a, b 在同一位元同時為1,相加會進位, 但因為右移1, 故不進位 -> EXP2 = a & b
  • a, b 在同一位元只有一方為1,在同一位元留下1,接著右移1 -> EXP3 = a ^ b

比較對應的組合語言輸出

gcc -S -Og test1.c 得到, 只比較 main 中實作的差異, 故得到以下組合語言

uint32_t average(uint32_t a, uint32_t b)
{
    return (a + b) / 2;
}
endbr64
leal    (%rdi,%rsi), %eax
shrl	%eax
ret

參考 What does the endbr64 instruction actually do?, endbr64 stands for "End Branch 64 bit". The instruction is otherwise considered a NOP

leal 這個指令屬於 load effactive address 的指令, 雖然為 load 指令, 但很常被用來當成 add 指令使用,以上可視為將 ab 兩個值相加並存入 %eax

shrl 為右移1,可視為除以二

根據 ret 裡面所說, The ret instruction transfers control to the return address located on the stack, 相當於 return

uint32_t average(uint32_t low, uint32_t high)
{
    return low + (high - low) / 2;
}
endbr64 movl %esi, %eax subl %edi, %eax shrl %eax addl %edi, %eax ret

第2, 3兩行為 high - low 並將結果存入 %eax
shrl同樣為右移1,可視為除以二
第5行為最後加上 low

uint32_t average(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + (a & b & 1);
}
endbr64
movl	%edi, %eax
shrl	%eax
movl	%esi, %edx
shrl	%edx
addl	%edx, %eax
andl	%esi, %edi
andl	$1, %edi
addl	%edi, %eax
ret

movlshrl 的組合分別為 a >> 1b >> 1
接著, 將 a >> 1b >> 1 相加
連續的 andla & b & 1
最後將上面結果相加

uint32_t average(uint32_t a, uint32_t b)
{
    return (a & b) + ((a ^ b) >> 1);
}
endbr64
movl	%edi, %eax
andl	%esi, %eax
xorl	%esi, %edi
shrl	%edi
addl	%edi, %eax
ret

movlandl 的組合為 a & b
xorla ^ b
shrla ^ b 右移1
最後將上面結果相加

測驗 2

解釋程式碼

#include <stdint.h>
uint32_t max(uint32_t a, uint32_t b)
{
    return a ^ ((a ^ b) & -(a < b));
}

這邊先思考 max 會用到的回傳值,比較 a, b 的大小, 回傳較大的, 也就是只會回傳 a 或 b
a ^ a ^ b = b, 故思考後面的判斷式的用意

  • a < b 成立時, (a ^ b) & -(a < b) = a ^ b, 函式回傳 a ^ a ^ b = b
  • a < b 不成立, (a ^ b) & -(a < b) = 0, 函式回傳 a ^ 0 = a

針對 32 位元無號/有號整數,撰寫同樣 branchless 的實作

int32_t max(int32_t a, int32_t b)
{
    return a ^ ((a ^ b) & -(a < b));
}

無論 a, b 為有號還是無號, -(a < b) 的結果都一樣

  • if a < b, -(a < b) = 0xFFFFFFFF -> a ^ a ^ b = b
  • if a >= b, -(a < b) = 0x00000000 -> a ^ 0 = a

所以在實作上除了 type, 並無二致

測驗 3

原64 位元 GCD

#include <stdint.h>
uint64_t gcd64(uint64_t u, uint64_t v)
{
    if (!u || !v) return u | v;
    while (v) {                               
        uint64_t t = v;
        v = u % v;
        u = t;
    }
    return u;
}

改寫為以下等價實作

#include <stdint.h>
uint64_t gcd64(uint64_t u, uint64_t v)
{
    if (!u || !v) return u | v;
    int shift;
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }
    while (!(u & 1))
        u /= 2;
    do {
        while (!(v & 1))
            v /= 2;
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (v);
    return u << shift;
}

對照 GCD 演算法

  1. If both x and y are 0, gcd is zero
    gcd(0,0)=0
    .
  2. gcd(x,0)=x
    and
    gcd(0,y)=y
    because everything divides 0.
if (!u || !v) return u | v;
  1. If x and y are both even,
    gcd(x,y)=2gcd(x2,y2)
    because
    2
    is a common divisor. Multiplication with
    2
    can be done with bitwise shift operator.
for (shift = 0; !((u | v) & 1); shift++) {
    u /= 2, v /= 2;
}
  1. If x is even and y is odd,
    gcd(x,y)=gcd(x2,y)
    .
while (!(u & 1))
    u /= 2;
  1. On similar lines, if x is odd and y is even, then
    gcd(x,y)=gcd(x,y2)
    . It is because
    2
    is not a common divisor.
while (!(v & 1))
    v /= 2;

對照到演算法第2點, 且只有除數有可能為 0, 故離開 do while 迴圈的條件為 v

do {
    ...
} while (v);

類似 輾轉相除法, 大減小, 減完後大的當被除數

if (u < v) {
    v -= u;
} else {
    uint64_t t = u - v;
    u = v;
    v = t;
}

最後被除數要補償根據演算法第三點多除的

2

return u << shift;

在 x86_64 上透過 __builtin_ctz 改寫 GCD,分析對效能的提升

利用 __builtin_ctz 取代以下 while 迴圈

while (!(u & 1))
    u /= 2;

shift 則用測驗2所學實作 min 的功能, 避免使用 branch

shift = x ^ ((x ^ y) & -(x > y));
uint64_t gcd64(uint64_t u, uint64_t v)
{
    if (!u || !v) return u | v;
    int shift, x, y;
    x = __builtin_ctz(u);
    y = __builtin_ctz(v);
    shift = x ^ ((x ^ y) & -(x > y));
    u >>= x;
    v >>= y;
    do {
        v >>= __builtin_ctz(v);
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (v);
    return u << shift;
}

在 main 中使用 for 迴圈重複呼叫函式以測試效能改善

int main()
{
    srand(5);
    for(int i=0;i<10000000;i++) {
        gcd64(rand(), rand());
    }
    return 0;
}
  • 最後使用 perf 量測時間, 大約快1.83倍

Linux 核心中也內建 GCD (而且還不只一種實作),例如 lib/math/gcd.c,請解釋其實作手法和探討在核心內的應用場景。

  • __ffs 相當於 __builtin_ctz
  • r & -r 代表只剩 r 從最低位元開始的第一個1, 剩下為0, 也就是 1 >> __ffs(r), 而在 for 迴圈中的 r & -r 可改寫為 a << __ffs(r) ,可將兩個 if 合成一個 if, 讓程式碼更簡潔
  • 整體同樣運用類似 輾轉相除法的方式實作
unsigned long gcd(unsigned long a, unsigned long b)
{
    unsigned long r = a | b;

    if (!a || !b)
        return r;

    b >>= __ffs(b);
    if (b == 1)
        return r & -r;

    for (;;) {
        a >>= __ffs(a);
        if (a == 1)
            return r & -r;
        if (a == b)
            return a << __ffs(r);

        if (a < b)
            swap(a, b);
        a -= b;
    }
}

測驗4

在影像處理中,bit array (也稱 bitset) 廣泛使用,考慮以下程式碼

  • 實作中用到的 branch 可以避免
  • for (int i = 0; i < 64; i++) 避免逐個bit去搜尋
#include <stddef.h>
size_t naive(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    for (size_t k = 0; k < bitmapsize; ++k) {
        uint64_t bitset = bitmap[k];
        size_t p = k * 64;
        for (int i = 0; i < 64; i++) {
            if ((bitset >> i) & 0x1)
                out[pos++] = p + i;
        }
    }
    return pos;
}

用 GNU extension 的 __builtin_ctzll 以改寫的程式碼如下:

#include <stddef.h>
size_t improved(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    uint64_t bitset;
    for (size_t k = 0; k < bitmapsize; ++k) {
        bitset = bitmap[k];
        while (bitset != 0) {
            uint64_t t = bitset & -bitset;
            int r = __builtin_ctzll(bitset);
            out[pos++] = k * 64 + r;
            bitset ^= t;                           
        }
    }
    return pos;
}

改善逐個 bit 搜索, 和避免 branch

  • bitmask bitset & -bitset 用以找到 bitset 從最低位元的 bit 開始第一個為1的 bit, 接著使用 XOR 將其去掉
while (bitset != 0) {
    uint64_t t = bitset & -bitset;
    bitset ^= t;
}

得知從最從最低位元的 bit 開始第一個為1的 bit 距離幾個 0
跟原程式碼比較, 因為沒有逐個 bit 計算, 故使用 __builtin_ctzll 才能知道用 bitmask 得到的 bit 前面有幾個0

int r = __builtin_ctzll(bitset);
out[pos++] = k * 64 + r;

測驗5

以下是 LeetCode 166. Fraction to Recurring Decimal 的可能實作

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "list.h"
    
struct rem_node {
    int key;
    int index;
    struct list_head link;
};
    
static int find(struct list_head *heads, int size, int key)
{
    struct rem_node *node;
    int hash = key % size;
    list_for_each_entry (node, &heads[hash], link) {
        if (key == node->key)
            return node->index;
    }
    return -1;
}

char *fractionToDecimal(int numerator, int denominator)
{
    int size = 1024;
    char *result = malloc(size);
    char *p = result;

    if (denominator == 0) {
        result[0] = '\0';
        return result;
    }

    if (numerator == 0) {
        result[0] = '0';
        result[1] = '\0';
        return result;
    }

    /* using long long type make sure there has no integer > overflow */
    long long n = numerator;
    long long d = denominator;

    /* deal with negtive cases */
    if (n < 0)
        n = -n;
    if (d < 0)
        d = -d;

    bool sign = (float) numerator / denominator >= 0;
    if (!sign)
        *p++ = '-';

    long long remainder = n % d;
    long long division = n / d;

    sprintf(p, "%ld", division > 0 ? (long) division : (long) - division);
    if (remainder == 0)
        return result;

    p = result + strlen(result);
    *p++ = '.';

    /* Using a map to record all of reminders and their position.
     * if the reminder appeared before, which means the repeated loop begin,
     */
    char *decimal = malloc(size);
    memset(decimal, 0, size);
    char *q = decimal;

    size = 1333;
    struct list_head *heads = malloc(size * sizeof(*heads));
    for (int i = 0; i < size; i++)
        INIT_LIST_HEAD(&heads[i]);

    for (int i = 0; remainder; i++) {
        int pos = find(heads, size, remainder);
        if (pos >= 0) {
            while (pos-- > 0)
                *p++ = *decimal++;
            *p++ = '(';
            while (*decimal != '\0')
                *p++ = *decimal++;
            *p++ = ')';
            *p = '\0';
            return result;
        }
        struct rem_node *node = malloc(sizeof(*node));
        node->key = remainder;
        node->index = i;

        list_add(&node->link, &heads[remainder % size]);

        *q++ = (remainder * 10) / d + '0';
        remainder = (remainder * 10) % d;
    }

    strcpy(p, decimal);
    return result;
}
  • find 函式就是利用 key 去 hash table 中尋找是否 key 存在在 heash table
  • int pos = find(heads, size, remainder); 用來尋找是否前面有出現過相同餘數, 若找到則進入 branch
while (pos-- > 0)
    *p++ = *decimal++;
  • 用以補齊至非循環小數
list_add(&node->link, &heads[remainder % size]);
  • 將當前位數餘數存入 hash table

指出其中不足,並予以改進

  • 原程式碼呼叫 malloc 定義 map 卻沒有釋放, 實作 listallfree 函式釋放 map 記憶體
  • 以下水道式風格, 改寫程式碼
  • sprintf(p, "%ld", (long) division); 代替 sprintf(p, "%ld", division > 0 ? (long) division : (long) - division);, 前面有將被除數和除數都變為正值, 故這邊不應出現負值
  • 在原程式碼中, 多用了 decimal 字串來進行實作, 最後再將 decimal 寫入 result, 故思考將 decimal 去除, 直接對 result 做操作, 利用 z 變數紀錄當前在字串位置, 若發現為循環小數, 則從字串尾巴開始改寫至小數非循環的部分
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "list.h"
    
struct rem_node {
    int key;
    int index;
    struct list_head link;
};
    
static int find(struct list_head *heads, int size, int key)
{
    struct rem_node *node;
    int hash = key % size;
    list_for_each_entry (node, &heads[hash], link) {
        if (key == node->key)
            return node->index;
    }
    return -1;
}

void freeallnode(struct list_head *heads, size_t size)
{
    for (int i = 0; i < size; i++) {
        struct rem_node *entry, *safe;
        list_for_each_entry_safe(entry, safe, &heads[i], link) {
            free(entry);
        }
    }
    free(heads);
}

char *fractionToDecimal(int numerator, int denominator)
{
    int size = 1024, z = 0;
    char *result = malloc(size);
    char *p = result;

    if (!denominator) {
        result[0] = '\0';
        goto end;
    }

    if (!numerator) {
        result[0] = '0';
        result[1] = '\0';
        goto end;
    }

    /* using long long type make sure there has no integer > overflow */
    long long n = numerator;
    long long d = denominator;

    /* deal with negtive cases */
    if (n < 0)
        n = -n;
    if (d < 0)
        d = -d;

    if (numerator < 0 ^ denominator < 0) {
        *p++ = '-';
        z++;
    }

    long long remainder = n % d;
    long long division = n / d;

    sprintf(p, "%ld", (long) division);
    z++;
    if (!remainder)
        goto end;

    p = result + strlen(result);
    *p++ = '.';
    z++;

    /* Using a map to record all of reminders and their position.
     * if the reminder appeared before, which means the repeated loop begin,
     */
    char *q = p;

    size = 1333;
    struct list_head *heads = malloc(size * sizeof(*heads));
    for (int i = 0; i < size; i++)
        INIT_LIST_HEAD(&heads[i]);

    for (; remainder; z++) {
        int pos = find(heads, size, remainder);
        if (pos >= 0) {
            *(q + 2) = '\0';
            *(q + 1) = ')';
            while (z-- != pos) {
                *q = *(q - 1);
                q--;
            }
            *q = '(';
            goto end_free;
        }
        struct rem_node *node = malloc(sizeof(*node));
        node->key = remainder;
        node->index = z;

        list_add(&node->link, &heads[remainder % size]);

        *q++ = (remainder * 10) / d + '0';
        remainder = (remainder * 10) % d;
    }
end_free :
    freeallnode(heads, size);
end :
    return result;
}

測驗 6

__alignof__ 是 GNU extension,以下是其可能的實作方式:

/*
 * ALIGNOF - get the alignment of a type
 * @t: the type to test
 *
 * This returns a safe alignment for the given type.
 */
#define ALIGNOF(t) \
    ((char *)(&((struct { char c; t _h; } *)0)->_h) - (char *)0)

這邊用來計算取得 t 在 structure 中需要對齊幾個位元, 故為 (char *)(&((struct { char c; t _h; } *)0)->_h), 後面減去 (char *)0

測驗7

貌似簡單卻蘊含實作深度的 FizzBuzz 題目

  • 從 1 數到 n,如果是 3的倍數,印出 “Fizz”
  • 如果是 5 的倍數,印出 “Buzz”
  • 如果是 15 的倍數,印出 “FizzBuzz”
  • 如果都不是,就印出數字本身

直覺的實作程式碼

#include <stdio.h>
int main() {
    for (unsigned int i = 1; i < 100; i++) {
        if (i % 3 == 0) printf("Fizz");
        if (i % 5 == 0) printf("Buzz");
        if (i % 15 == 0) printf("FizzBuzz");
        if ((i % 3) && (i % 5)) printf("%u", i);
        printf("\n");
    }
    return 0;
}

利用 bitwise 和上述技巧實作的 FizzBuzz 程式碼

static inline bool is_divisible(uint32_t n, uint64_t M)
{
    return n * M <= M - 1;
}

static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1;
static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;

int main(int argc, char **argv)
{
    for (size_t i = 1; i <= 100; i++) {
        uint8_t div3 = is_divisible(i, div3);
        uint8_t div5 = is_divisible(i, div5);
        unsigned int length = (2 << div3) << div5;

        char fmt[9];
        strncpy(fmt, &"FizzBuzz%u"[(9 >> div5) >> (div3 << 2)], length);
        fmt[length] = '\0';

        printf(fmt, i);
        printf("\n");
    }
    return 0;
}

解釋上述程式運作原理

  • 從 1 數到 n,如果是 3的倍數,則印出長度為 4,右移 4, 9 >> 4 = 0
  • 如果是 5 的倍數,則印出長度為 4,右移 1, 9 >> 1 = 4
  • 如果是 15 的倍數,則印出長度為 8,右移 5, 9 >> 5 = 0
  • 如果都不是,就印出數字本身, 右移 0, 9 >> 0 = 9

用改變字串寫入長度和位置, 來得到輸出字串