2022q1 Homework2 (quiz2)

contributed by < eric88525 >

第 2 週測驗題

Q1

Q1-1

解釋上述程式碼運作的原理
EXP1 = a & b & 1
EXP2 = a & b
EXP3 = a ^ b

a 和 b 最低位元都是 0 的時候運算結果會正確
最低位元為 1 時 shift right 會遺失相關資訊，因此要補上最低位元的數值
取得最低位元方法為&1，唯有當 a 和 b 最低位元都是 1 時才需要+1，因此 exp1 = a & b & 1

uint32_t average(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + (a & b & 1); 
}

推導:
a + b = a ^ b + a & b << 1
(a+b)/2 = (a ^ b + a & b << 1) >> 1
         = a ^ b >> 1 + a & b

最終程式如下

uint32_t average(uint32_t a, uint32_t b)
{
    return (a & b) + ((a ^ b) >> 1);
}

Q1-2

比較上述實作在編譯器最佳化開啟的狀況，對應的組合語言輸出，並嘗試解讀

on X86-64 linux :
- eax is the return value register
- edi is the first function argument
- esi is the second function argument

第一種版本，編譯器選項-O1

average(unsigned int, unsigned int):
        // move argument a to eax register and shift right 
        mov     eax, edi
        shr     eax 
        // move argument b to eax register and shift right 
        mov     edx, esi
        shr     edx    
        // do  (a>>1) + (b>>1)
        add     eax, edx      
        // do a & b & 1 
        and     edi, esi
        and     edi, 1           
        // add result with (a>>1) + (b>>1)
        add     eax, edi
        ret

第二種版本，編譯器選項-O1

average(unsigned int, unsigned int):
        // move argument a to eax register
        mov     eax, edi
        // xor with b
        xor     eax, esi
        // eax(a) shift right
        shr     eax
        // a&b
        and     edi, esi
        //  (a & b) + ((a ^ b) >> 1)
        add     eax, edi
        ret

第二種方式指令數量變少，幾乎是都縮減一半

指令執行次數	mov	shr	add	and	xor
方法1	2	2	2	2	0
方法2	1	1	1	1	1

參考資料：

Q2

Q2-1

解釋上述程式碼運作的原理
EXP4 = a^b
EXP5 = a<b

如要取 max 要讓
- a>b 的情況 ((EXP4) & -(EXP5)) ＝ 0
- a<b 的情況 ((EXP4) & -(EXP5)) ＝ a ^ b
- 因此 exp4 在此直接設定為 a^b，並交由 -(EXP5) 來決定該為0或是a^b
-(EXP5) 應該要是全1或全0，才能決定 exp4 為0或維持原樣
- exp5 如果是0(false)，取負號依舊是0，在 a>b 情況下需要
- exp5 如果是1(true)，取負號變成-1，在 uint32_t 下為 0xFFFFFFFF，在 a<b 情況需要
- 根據前二點，要讓 a>b 情況下 exp5 為 0，可回推 exp5 為 a<b

#include <stdint.h>
uint32_t max(uint32_t a, uint32_t b)
{
    return a ^ ((a^b) & -(a<b) );
}

Q 2-2

針對 32 位元無號/有號整數，撰寫同樣 branchless 的實作

概念源自這篇文章
利用 ((x - y) & -(x < y)) 來讓 x-y 是否維維持或設為 0，因為 -(x>y) 有二種結果
- -1 = 0xffffffff
- -0 = 0x00000000

uint32_t mymax(uint32_t x, uint32_t y)
{
    return x - ((x - y) & ((x - y) & -(x < y)));
}

Q3

Q3-1

解釋上述程式運作原理
COND = v
RET = u << shift

輾轉相除法原理，概念如下

設要找出a和b的最大工因數，如果a比b大，a可寫成如下，q是除數r是餘數

$a = b * q + r$
問題能轉成

$g c d (a, b) = g c d (b, r)$
直到b或r其中一方為0就為解答

uint64_t gcd64(uint64_t u, uint64_t v)
{
    // u 和 v 都是 0 所以回傳 0
    if (!u || !v) return u | v;
    int shift;
    // 找出 u 和 v 最大能有多少 2^shift 的公因數
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }
    // u 和 v 不再有2是公因數，因此把2移除
    while (!(u & 1))
        u /= 2;
    do {
        // u 和 v 不再有2是公因數，因此把2移除
        while (!(v & 1))
            v /= 2;
        // 輾轉相除法
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (v);
    // 還原最前面有多少 2^shift 公因數
    return u << shift;
}

Q3-2

uint64_t gcd64(uint64_t u, uint64_t v)
{
    // u 和 v 都是 0 所以回傳 0
    if (!u || !v) return u | v;
    int shift;
    // 找出 u 和 v 最大能有多少 2^shift 的公因數
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }
    // u 和 v 不再有2是公因數，因此把2移除
    while (!(u & 1))
        u /= 2;
    do {
        // u 和 v 不再有2是公因數，因此把2移除
        while (!(v & 1))
            v /= 2;
        // 輾轉相除法
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (v);
    // 還原最前面有多少 2^shift 公因數
    return u << shift;
}

Q4

Q4-1

解釋上述程式運作原理，並舉出這樣的程式碼用在哪些真實案例中;
EXP6 = bitset & -bitset

uint64_t 採用 2 補數，如果讓某數與其2補做 and 運算，可以保留最低1位元

    0101(4)    0010(2)
&   1011(-4)   1110(-2)
-----------------------
    0001       0010

#include <stddef.h>
size_t improved(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    uint64_t bitset;
    for (size_t k = 0; k < bitmapsize; ++k) {
        bitset = bitmap[k];
        while (bitset != 0) {
            uint64_t t = EXP6;
            int r = __builtin_ctzll(bitset);
            out[pos++] = k * 64 + r;
            bitset ^= t;                           
        }
    }
    return pos;
}

Q5

Q5-1

Q6

Q6-1

M = _h
X = 0

先創立 struct { char c; t _h; } *的指標，初始化起點為 0
利用 -> 指定其 t 型態的開始位址
減去 0 就是其 alignment

圖例：以 ALIGNOF(int) 示範

/*
 * ALIGNOF - get the alignment of a type
 * @t: the type to test
 *
 * This returns a safe alignment for the given type.
 */
#define ALIGNOF(t) \
    ((char *)(&((struct { char c; t _h; } *)0)->_h) - (char *)0)

為了驗證，用 gdb 來觀察記憶體位置

struct my_type{
    char c_type;
    int int_type;
};
int main(){
    struct my_type *p = 0;
    return 0;
}

gdb 執行結果如下

(gdb) print p
$5 = (struct my_type *) 0x0
(gdb) print &p->int_type
$4 = (int *) 0x4

可以得知alignment為 0x4 - 0x0 = 0x4

Q7

Q7-1

KK1 = div3
KK2 = div5
KK3 = div3<<2

div3 和 div5 需輸出長度為4的字串，因此 KKK1 和 KKK2 應該填入 div3 div5 來透過 shift 增加 strncpy 的複製長度 length
- 2是基本值，因要在不被3或5整除的情況下輸出原有數字 %u
- 被3或5其中一個整除 shift 一次，都被整除 shift 二次

div3	div5	length
0	0	2
0	1	4
1	0	4
1	1	8

在只有被5整除情況下，應該從第4個字(B)開始輸出

9 = 1001b
1001b >>1 = 100b = 4

在只有被3整除情況下，應該從第0個字(F)開始輸出

9 = 1001b
1001b >> (1<<2) = 0

被3和5整除下，會如同上面從第0個字(F)開始輸出

static inline bool is_divisible(uint32_t n, uint64_t M)
{
    return n * M <= M - 1;
}

static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1;
static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;

int main(int argc, char **argv)
{
    for (size_t i = 1; i <= 100; i++) {
        // 檢查是否能被 3 5 整除
        uint8_t div3 = is_divisible(i, M3);
        uint8_t div5 = is_divisible(i, M5);
        // 設定要複製幾個字串
        unsigned int length = (2 << div3) << div5;
        
        char fmt[9];
        // 拷貝所需字串 透過 shift 決定起點
        strncpy(fmt, &"FizzBuzz%u"[(9 >> div5) >> (div3<<2)], length);
        fmt[length] = '\0';

        printf(fmt, i);
        printf("\n");
    }
    return 0;
}