2025q1 Homework4 (quiz3+4)

contributed by < liangchingyun >

2025q1 第 3 週測驗題

測驗 `1`

實作一個多精度整數的運算框架，也就是支援比 64-bit 更大的整數計算

延伸問題 1

解釋上方程式碼運作原理

資料結構定義

#include <inttypes.h>
#include <stdint.h>

/* mpi: Multi-Precision Integers */
typedef struct {
    uint32_t *data; // 存放多精度整數每一部分的陣列，每個元素為 31-bit               
    size_t capacity; // 表示 data 陣列目前有多少個 element (即有多少「31-bit 位元段」)
} mpi_t[1];

typedef size_t mp_bitcnt_t;

記憶體管理

// 初始化
void mpi_init(mpi_t rop)
{
    rop->capacity = 0;
    rop->data = NULL;
}

// 釋放記憶體
void mpi_clear(mpi_t rop)
{
    free(rop->data);
}

// 擴展 data 陣列的長度，並初始化新擴展的空間為 0
void mpi_enlarge(mpi_t rop, size_t capacity)
{
    if (capacity > rop->capacity) { // 只有在新的 capacity 大於目前的容量時，才需要實際擴充
        size_t min = rop->capacity;

        rop->capacity = capacity;

        rop->data = realloc(rop->data, capacity * 4); 
        // 新的空間大小是 capacity * 4 bytes
        // 原因是data 裡面每個元素是 uint32_t，其大小就是 4 bytes

        if (!rop->data && capacity != 0) {
            fprintf(stderr, "Out of memory (%zu words requested)\n", capacity);
            abort();
        }

        // 對新分配的區塊（從舊的大小 min 到現在的大小 capacity）進行初始化
        for (size_t n = min; n < capacity; ++n)
            rop->data[n] = 0;
    }
}

// 去除尾端為 0 的位元段，減少記憶體使用
void mpi_compact(mpi_t rop)
{
    size_t capacity = rop->capacity;

    // 如果目前容量是 0，直接結束，無需處理
    if (rop->capacity == 0)
        return;

    // 找出最右邊（高位）非零的那一個元素
    for (size_t i = rop->capacity - 1; i != (size_t) -1; --i) {
        if (rop->data[i])
            break;
        capacity--; // 若 rop->data[i] == 0，代表這一格可以被移除，因此縮減 capacity
    }

    // 保護性檢查
    assert(capacity != (size_t) -1);

    // 重新配置記憶體，只保留實際有用的部份
    rop->data = realloc(rop->data, capacity * 4);
    rop->capacity = capacity;

    if (!rop->data && capacity != 0) {
        fprintf(stderr, "Out of memory (%zu words requested)\n", capacity);
        abort();
    }
}

輔助工具

/* ceiling division without needing floating-point operations. */
static size_t ceil_div(size_t n, size_t d)
{
    return (n + d - 1) / d; // 計算 n ÷ d 的向上取整（ceiling）結果
}

#define INTMAX 0x7fffffff // 31-bit 的最大整數值

n	d	n / d	ceil_div(n, d)
10	3	3.33	4
5	2	2.5	3
8	4	2	2
0	3	0	0

基本設定與轉換

// 將一個 64-bit 的整數，切成多個 31-bit 段，裝入 data 陣列
void mpi_set_u64(mpi_t rop, uint64_t op)
{
    size_t capacity = ceil_div(64, 31); // 需要 3 個欄位

    mpi_enlarge(rop, capacity); // 確保 rop 有足夠空間

    // 拆解 uint64_t 成多個 31-bit 整數
    for (size_t n = 0; n < capacity; ++n) {
        rop->data[n] = op & INTMAX; // 取低 31-bit
        op >>= 31; // 將剩下的往右移，準備下次寫入
    }

    // 清除多餘資料
    for (size_t n = capacity; n < rop->capacity; ++n)
        rop->data[n] = 0;
}

rop: 多精度整數，作為輸出目標。
op: 一個 64-bit 的無號數，要被轉換並儲存。
例子:
- rop->data[0] = 最低 31-bit
- rop->data[1] = 中間 31-bit
- rop->data[2] = 最高 2-bit（因為 31x3 = 93 > 64）

乘法

static void mpi_mul_naive(mpi_t rop, const mpi_t op1, const mpi_t op2)
{
    size_t capacity = op1->capacity + op2->capacity;
    
    mpi_t tmp;
    mpi_init(tmp);

    mpi_enlarge(tmp, capacity);
        
    for (size_t n = 0; n < tmp->capacity; ++n)
        tmp->data[n] = 0;

    for (size_t n = 0; n < op1->capacity; ++n) { 
        for (size_t m = 0; m < op2->capacity; ++m) {
            uint64_t r = (uint64_t) op1->data[n] * op2->data[m]; // r 是中間乘積（會超過 31 bit，所以用 uint64_t）
            uint64_t c = 0;
            for (size_t k = n+m; c || r; ++k) { // 從 k = n + m 開始累加（就是十進位乘法中對齊的位置）
                if (k >= tmp->capacity)
                    mpi_enlarge(tmp, tmp->capacity + 1);
                tmp->data[k] += (r & INTMAX) + c; // 加上本位結果與前一位的進位
                r >>= 31; // 下一個 31 bit
                c = tmp->data[k] >> 31; // 如果 overflow，產生進位
                tmp->data[k] &= INTMAX; // 只保留 31 bit
            }
        }                                     
    }
        
    mpi_set(rop, tmp);
    
    mpi_compact(rop);
    
    mpi_clear(tmp);
}

例子: 假設 op1 = ABC，op2 = DE

       A      B      C
  ×         D      E
  ---------------------
       AE    BE    CE    ← 對齊加總
+    AD    BD    CD
---------------------

除法與餘數

初始化商 q = 0，餘數 r = 0
從最高位開始，比對每一位是否能被除數 d 減去
如果可以，商的那一位設為 1，餘數更新為減掉後的值

void mpi_fdiv_qr(mpi_t q, mpi_t r, const mpi_t n, const mpi_t d)
{
    // 建立副本避免破壞原始數據
    mpi_t n0, d0;
    mpi_init(n0);
    mpi_init(d0);
    mpi_set(n0, n);
    mpi_set(d0, d);                                                                            
    // 除以零檢查
    if (mpi_cmp_u32(d0, 0) == 0) {
        fprintf(stderr, "Division by zero\n");
        abort();
    }

    // 初始化商與餘數
    mpi_set_u32(q, 0);
    mpi_set_u32(r, 0);

    // 從最高位 bit 開始處理
    size_t start = mpi_sizeinbase(n0, 2) - 1;

    // Long Division 迴圈，每次處理一個 bit
    for (size_t i = start; i != (size_t) -1; --i) {
        mpi_mul_2exp(r, r, 1);
        if (mpi_testbit(n0, i) != 0)
            mpi_setbit(r, 0);
        if (mpi_cmp(r, d0) >= 0) {
            mpi_sub(r, r, d0);
            mpi_setbit(q, i);
        }
    }

    mpi_clear(n0);
    mpi_clear(d0);
}

最大公因數

遞迴呼叫 mpi_fdiv_qr，直到餘數為 0，得到 GCD
終止條件：op2 == 0，這時 op1 就是 GCD。

void mpi_gcd(mpi_t rop, const mpi_t op1, const mpi_t op2)
{
    if (mpi_cmp_u32(op2, 0) == 0) {
        mpi_set(rop, op1);
        return;
    }
        
    mpi_t q, r;
    mpi_init(q);
    mpi_init(r);

    mpi_fdiv_qr(q, r, op1, op2);

    mpi_gcd(rop, op2, r);                                     

    mpi_clear(q);
    mpi_clear(r);
}

測試程式碼

驗證 549755813889 ÷ 1234 = 445507142...661 是否正確。

printf("mpi_fdiv_qr\n");
{
    mpi_t n, d, q, r;
    mpi_init(n);
    mpi_init(d);
    mpi_init(q);
    mpi_init(r);

    mpi_set_str(n, "549755813889", 10);
    mpi_set_str(d, "1234", 10);
    mpi_fdiv_qr(q, r, n, d);
    assert(mpi_cmp_u32(q, 445507142) == 0);
    assert(mpi_cmp_u32(r, 661) == 0);

    mpi_clear(n);
    mpi_clear(d);
    mpi_clear(q);
    mpi_clear(r);
}

驗證 GCD(2310, 46189) = 11

printf("GCD test\n");
{
    mpi_t a, b, r;
    mpi_init(a);
    mpi_init(b);
    mpi_init(r);

    mpi_set_str(a, "2310", 10);
    mpi_set_str(b, "46189", 10);

    mpi_gcd(r, a, b);
    assert(mpi_cmp_u32(r, 11) == 0);

    mpi_clear(a);
    mpi_clear(b);
    mpi_clear(r);
}

延伸問題 2

這項實作依賴遞迴呼叫，請避免使用並降低記憶體開銷

延伸問題 3

改進最大公因數的運算效率

測驗 `2`

延伸問題 1

解釋上述程式碼運作原理

判斷一個整數是否為奇數

確認最低位元是否為 1

傳統寫法:

#include <stdint.h>
bool both_odd(uint32_t x, uint32_t y)
{
    return (x & 1) && (y & 1);
}

這樣需要兩次 & 運算與一次&& 邏輯運算。

SWAR 的核心思想是：把多個資料打包在同一個暫存器中，並一次性處理，這可以減少操作次數並提升效能。

bitmask 定義:

static uint64_t SWAR_ODD_MASK = (1L << 32) + 1; // 第 0 位及第 32 位為 1
bool both_odd_swar(uint64_t xy) {
    return (xy & SWAR_ODD_MASK) == SWAR_ODD_MASK;
}

測試程式:

static inline uint64_t bit_compound(uint32_t x, uint32_t y) // 同時檢查兩個位置的最低位是否都是 1
{
    return ((0L + x) << 32) | ((y + 0L) & (-1L >> 32));
}

int main()
{
    int x = 12345678;
    int y = 9012345;
    uint64_t xy = bit_compound(x, y);
    printf("%d, %d\n", both_odd(x, y), both_odd_swar(xy));
}

加速 `memchr` 搜尋

步驟:

對齊檢查：先用 byte-wise 判斷，直到指標對齊。
word 級搜尋：一次檢查 sizeof(long) bytes，判斷這個 word 內是否有目標字元。
當剩下不足一整個 word 時，再回到 byte-wise 搜尋。

#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Nonzero if X is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) (((X) - (0x01010101)) & ~(X) & (0x80808080))
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECT_NULL(X) \
    (((X) - (0x0101010101010101)) & ~(X) & (0x8080808080808080))
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

/* @return nonzero if (long)X contains the byte used to fill MASK. */
#define DETECT_CHAR(X, mask) DETECT_NULL(X^mask)

void *memchr_opt(const void *str, int c, size_t len)
{
    const unsigned char *src = (const unsigned char *) str;
    unsigned char d = c;

    while (UNALIGNED(src)) {
        if (!len--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(len)) {
        /* If we get this far, len is large and src is word-aligned. */

        /* The fast code reads the source one word at a time and only performs
         * a bytewise search on word-sized segments if they contain the search
         * character. This is detected by XORing the word-sized segment with a
         * word-sized block of the search character, and then checking for the
         * presence of NULL in the result.
         */
        unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        mask |= mask << 16;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask |= mask<<i;

        while (len >= LBLOCKSIZE) {
            if (DETECT_CHAR(*asrc, mask))
                break;
            asrc++;
            len -= LBLOCKSIZE;
        }

        /* If there are fewer than LBLOCKSIZE characters left, then we resort to
         * the bytewise loop.
         */
        src = (unsigned char *) asrc;
    }

    while (len--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}

延伸問題 2

比較 Linux 核心原本 (與平台無關) 的實作和 memchr_opt，設計實驗來觀察隨著字串長度和特定 pattern 變化的效能影響

延伸問題 3

研讀 2022 年學生報告，在 Linux 核心原始程式碼找出 x86_64 或 arm64 對應的最佳化實作，跟上述程式碼比較，並嘗試舉出其中的策略和分析
下一步：貢獻程式碼到 Linux 核心
Phoronix 報導

測驗 `3`

2025q1 第 4 週測驗題

測驗 `1`

延伸問題 1

在 Linux 核心原始程式碼中找到 CRC32 的應用案例至少 3 處，並探討其原始程式碼

延伸問題 2

設計實驗來分析 Fast CRC32 提出的手法 (要涵蓋 branchless 和查表的變形)，並與硬體的 CRC32 指令效能進行比較。

延伸問題 3

以 x86-64 或 Arm64 為探討標的，說明 Linux 核心如何使用硬體加速的 CRC32 指令

測驗 `2`

延伸問題 1

解釋上述程式碼運作原理，搭配「信號與系統」教材

延伸問題 2

探討定點數的使用和其精確度

延伸問題 3

改進 MIDI 一閃一閃亮晶晶的音效輸出

測驗 `3`

延伸問題 1

解釋上述程式碼運作原理

延伸問題 2

學習 jserv/ttt 程式碼，在不使用 (n)curses 一類現成的函式庫前提下，改進網路聊天室的文字呈現介面

延伸問題 3

在既有的程式碼之上，實作帳號登入 (login) 及基本管理功能，並該有線上與否的提示，留意 ping value 的處理機制

2025q1 Homework4 (quiz3+4)

2025q1 第 3 週測驗題

測驗 1

延伸問題 1

資料結構定義

記憶體管理

輔助工具

基本設定與轉換

乘法

除法與餘數

最大公因數

測試程式碼

延伸問題 2

延伸問題 3

測驗 2

延伸問題 1

判斷一個整數是否為奇數

加速 memchr 搜尋

延伸問題 2

延伸問題 3

測驗 3

2025q1 第 4 週測驗題

測驗 1

延伸問題 1

延伸問題 2

延伸問題 3

測驗 2

延伸問題 1

延伸問題 2

延伸問題 3

測驗 3

延伸問題 1

延伸問題 2

延伸問題 3

Read more

2025q1 Homework1 (lab0)

2025q1 Homework5 (assessment)

2025q1 Homework2 (quiz1+2)

kxo

測驗 `1`

測驗 `2`

加速 `memchr` 搜尋

測驗 `3`

測驗 `1`

測驗 `2`

測驗 `3`