2023q1 第 6 週測驗題

目的: 檢驗學員對並行和多執行緒程式設計、Linux: 不只挑選任務的排程器，和 C 語言: bitwise 操作的認知

作答表單: 測驗 1-2 (針對 Linux 核心「設計」課程)
作答表單: 測驗 3-4 (針對 Linux 核心「實作」課程)

測驗 `1`

以下嘗試用 lab0 提及的 setjmp 和 longjmp，搭配〈你所不知道的 C 語言: goto 和流程控制篇〉闡述的 coroutine，開發簡易的排程器，以達到〈並行和多執行緒程式設計〉提及任務切換機制。在網路伺服器的應用場景，快速切換網路連線，以縮減服務的延遲。示意:

    | Main loop |      | Connection 1 |      | Connection 2 |
    ----------------------------------------------------------
          |
          |
          v
         ***  --> resume --> ***
                             ***
                             ***
         ***  <--  yield <-- ***
         ***
         ***  ------------> resume ------------->   ***
                                                    ***
         ***  <------------  yield <-------------   ***
         ***
         ***
         ***
         ***  --> resume --> ***
                             ***

改繪自 Life of a HTTP request, as seen by my toy web server

給定程式的參考執行輸出如下:

Task 0: n = 70
Task 1: n = 70
Task 2: n = 70
Task 0 fib(0) = 0
Task 1 fib(1) = 1
...
Task 0 fib(68) = 72723460248141
Task 1: resume
Task 1 fib(69) = 117669030460994
Task 2: resume

...
Task 2 68
Task 2: resume
Task 2 69
Task 2: resume
Task 2: complete

對照 FIBONACCI 網站列出的數值

程式碼列表如下: (部分遮蔽)

/* Implementing coroutines with setjmp/longjmp */

#include <setjmp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "list.h"

struct task {
    jmp_buf env;
    struct list_head list;
    char task_name[10];
    int n;
    int i;
};

struct arg {
    int n;
    int i;
    char *task_name;
};

static LIST_HEAD(tasklist);
static void (**tasks)(void *);
static struct arg *args;
static int ntasks;
static jmp_buf sched;
static struct task *cur_task;

static void task_add(struct task *task)
{
    list_add_tail(&task->list, &tasklist);
}

static void task_switch()
{
    if (!list_empty(&tasklist)) {
        struct task *t = list_first_entry(&tasklist, struct task, list);
        list_del(&t->list);
        cur_task = t;
        longjmp(t->env, 1);
    }
}

void schedule(void)
{
    static int i;

    setjmp(sched);

    while (ntasks-- > 0) {
        struct arg arg = args[i];
        AAAA;
        printf("Never reached\n");
    }

    task_switch();
}

static long long fib_sequence(long long k)
{
    long long f[k + 2];
    f[0] = 0, f[1] = 1;

    for (int i = 2; i <= k; i++) {
        f[i] = f[i - 1] + f[i - 2];
    }

    return f[k];
}

/* A task yields control n times */

void task0(void *arg)
{
    struct task *task = malloc(sizeof(struct task));
    strcpy(task->task_name, ((struct arg *) arg)->task_name);
    task->n = ((struct arg *) arg)->n;
    task->i = ((struct arg *) arg)->i;
    INIT_LIST_HEAD(&task->list);

    printf("%s: n = %d\n", task->task_name, task->n);

    if (setjmp(task->env) == 0) {
        task_add(task);
        longjmp(sched, 1);
    }

    task = cur_task;

    for (; task->i < task->n; task->i += 2) {
        if (setjmp(task->env) == 0) {
            long long res = fib_sequence(task->i);
            printf("%s fib(%d) = %lld\n", task->task_name, task->i, res);
            task_add(task);
            BBBB;
        }
        task = cur_task;
        printf("%s: resume\n", task->task_name);
    }

    printf("%s: complete\n", task->task_name);
    longjmp(sched, 1);
}

void task1(void *arg)
{
    struct task *task = malloc(sizeof(struct task));
    strcpy(task->task_name, ((struct arg *) arg)->task_name);
    task->n = ((struct arg *) arg)->n;
    task->i = ((struct arg *) arg)->i;
    INIT_LIST_HEAD(&task->list);

    printf("%s: n = %d\n", task->task_name, task->n);

    if (setjmp(task->env) == 0) {
        task_add(task);
        longjmp(sched, 1);
    }

    task = cur_task;

    for (; task->i < task->n; task->i++) {
        if (setjmp(task->env) == 0) {
            printf("%s %d\n", task->task_name, task->i);
            task_add(task);
            CCCC;
        }
        task = cur_task;
        printf("%s: resume\n", task->task_name);
    }

    printf("%s: complete\n", task->task_name);
    longjmp(sched, 1);
}

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
int main(void)
{
    void (*registered_task[])(void *) = {task0, task0, task1};
    struct arg arg0 = {.n = 70, .i = 0, .task_name = "Task 0"};
    struct arg arg1 = {.n = 70, .i = 1, .task_name = "Task 1"};
    struct arg arg2 = {.n = 70, .i = 0, .task_name = "Task 2"};
    struct arg registered_arg[] = {arg0, arg1, arg2};
    tasks = registered_task;
    args = registered_arg;
    ntasks = ARRAY_SIZE(registered_task);

    schedule();

    return 0;
}

其中 task0 和 task1 分別計算 Fibonacci 數的偶數和奇數項，list.h 標頭檔來自 lab0-c。請補完程式碼，使程式碼運作符合預期。作答規範:

AAAA, BBBB, CCCC 均為表示式，三者應當包含 task
以最精簡的程式碼撰寫，都不該出現 , 和 ;

延伸問題:

解釋上述程式碼運作原理，舉出上述 coroutine 實作的限制 (注意 stack) 並予以改進
學習 minicoro 程式碼，針對你的處理器型態 (x86-64 或 aarch64)，抽取出其 Linux 相關且用 inline assembly 實作的程式碼出來，探討其手法並分析其效能表現

測驗 `2`

研讀 A (Bare-Bones) User-Level Thread Library 一文後，延伸測驗 1，我們可利用 SIGALRM signal 來實作搶佔式多工，下方的程式建立 3 個排序的任務，原理是藉由 SIGALRM signal 來模擬作業系統的 timer 中斷，設定的時間間隔為 10 ms，於是每 10 ms 觸發模擬的 timer 中斷，帶動排程器，實作 round-robin 排程策略。留意到 trampoline 的使用:

Trampolines (sometimes referred to as indirect jump vectors) are memory locations holding addresses pointing to interrupt service routines, I/O routines, etc. Execution jumps into the trampoline and then immediately jumps out, or bounces, hence the term trampoline.

預期程式執行輸出:

[1] sort: begin
[1] sort: start sorting
[2] sort: begin
[2] sort: start sorting
[3] sort: begin
[3] sort: start sorting
[2] sort: end
[3] sort: end
[1] sort: end

注意後 3 行的順序可能是

3 \to 2 \to 1

或

2 \to 3 \to 1

。程式碼可見 sched.c (部分屏蔽)

請補完程式碼，使其運作符合預期。作答規範:

DDDD 和 EEEE 為函式名稱
FFFF 是表示式，應包含 >> 運算子，但不包含小括號 (即 ( 和 ))

參考資訊:

Proper Locking Under a Preemptible Kernel: Keeping Kernel Code Preempt-Safe
What is a trampoline function?
Linux 核心搶佔
Functional Programming 風格的 C 語言實作

trampoline function 的展現

延伸問題:

解釋上述程式碼運作原理，以及對應到 Linux 核心原始碼的若干機制，如 preempt_disable, preempt_enable, local_irq_save, local_irq_restore 等等
探討上述程式碼的 signal 使用機制，指出其正確性或效率疑慮，並著手改進
研讀 cserv 高效網頁伺服器的分析和改進裡頭 coro 的實作，探討其原理

測驗 `3`

除法運算的成本往往高於加法和乘法，於是改進的策略被引入。其中一種策略是將除法改為乘法運算，例如

\frac{n}{d}

在分子和分母分別乘上

2^{N}

後，得到等價的

n \times \frac{\frac{2^{N}}{d}}{2^{N}}

，其中對 2 的 N 次方 (power of 2) 進行除法，在無號整數的操作就等同於右移 (shift) N 個位元，又當

d

(除數) 的數值是已知的狀況下，

\frac{2^{N}}{d}

可預先計算，也就是說，

\frac{n}{d}

可轉換為乘法和位移操作，進而減少運算成本。不過，顯然

\frac{2^{N}}{d}

無法總是用整數來表達 (僅在

d

是 power of 2 的時候才可)，因此，我們可先估算，並將差值補償回去。

重新檢視

\frac{n}{d} = n \times \frac{\frac{2^{N}}{d}}{2^{N}}

，當我們想將

n

除以

4

時，就相當於乘以

0.25

，所以我們可將

\frac{5}{4}

改為

5 \times 0.25

，我們得到整數的部分 (即

1

)，和小數部分 (即

0.25

)，後者乘以

4

就是

1

，也就是餘數。下方程式碼展示這概念:

const uint32_t D = 3;
#define M ((uint64_t)(UINT64_C(0xFFFFFFFFFFFFFFFF) / (D) + 1))

/* compute (n mod d) given precomputed M */
uint32_t quickmod(uint32_t n)
{   uint64_t quotient = ((__uint128_t) M * n) >> 64;
    return n - quotient * D;
}

其中 __uint128_t 是 GCC extension，用以表示 128 位元的無號數值。

GCC: C Extensions: 128-bit Integers

n

模除

3

的數學推導如下：

\begin{aligned} n % 3 & = n - ⌊ \frac{n}{3} ⌋ \times 3 \\ = n - ⌊ n \times \frac{\frac{2^{64}}{3}}{2^{64}} ⌋ \times 3 \\ = n - ⌊ (n \times \frac{2^{64}}{3}) \times \frac{1}{2^{64}} ⌋ \times 3 \\ = n - ((n \times M) >> 64) \times 3 \end{aligned}

其中

\frac{2^{64}}{3}

對應到 M 無條件捨去的除法運算再補一對應到「取上界」，故

M = ⌈ \frac{2^{64}}{3} ⌉

由 Facebook 公司所維護的 jemalloc 是個高效能的記憶體配置器 (memory allocator，即 malloc 和 free 函式對應的實作)，特別在多核多執行緒的環境有優異表現，在其原始程式碼 include/jemalloc/internal/div.h 和 src/div.c 也運用類似的技巧：已知

\frac{n}{d} = n \times \frac{\frac{2^{k}}{d}}{2^{k}}

但是 k 要取多少才能滿足我們要的精確度呢? src/div.c 提供證明。

假設

n, d, \frac{n}{d}

為整數

\begin{aligned} \frac{n}{d} & = p, p \in Z \\ = ⌊ ⌈ \frac{2^{k}}{d} ⌉ \times \frac{n}{2^{k}} ⌋, \forall k \in N \\ = ⌊ \frac{2^{k} + r}{d} \times \frac{n}{2^{k}} ⌋ (∵ r = - 2^{k} \mod d) \\ = ⌊ \frac{n}{d} + \frac{r}{d} \times \frac{n}{2^{k}} ⌋ \\ = \frac{n}{d} + ⌊ \frac{r}{d} \times \frac{n}{2^{k}} ⌋ (∵ \frac{n}{d} = p \in Z) \end{aligned}

證明

⌈ \frac{2^{k}}{d} ⌉ = \frac{2^{k} + r}{d}

⌈ \frac{2^{k}}{d} ⌉ = \frac{2^{k} + d - (2^{k} \mod d)}{d} = \frac{2^{k} + r}{d} = \frac{2^{k} + (- 2^{k} \mod d)}{d}

證明

- 2^{k} \mod d = d - (2^{k} \mod d)

\begin{aligned} \Rightarrow & 2^{k} \mod d = r \\ 2^{k} & = p d + r \\ - 2^{k} & = (- p) d + - r \\ - 2^{k} & = (- p - 1) d + d - r \\ - 2^{k} & = p^{'} d + (d - r) \\ \Rightarrow & (- 2^{k} \mod d) = d - r \end{aligned}

\frac{n}{d} = \frac{n}{d} + ⌊ \frac{r}{d} \times \frac{n}{2^{k}} ⌋

成立的條件為

\frac{r}{d} \times \frac{n}{2^{k}} < 1

因為
$r = 2^{k} \mod d$ ，所以
$\frac{r}{d} < １$
如果
$\frac{n}{2^{k}} < 1$ 則
$\frac{r}{d} \times \frac{n}{2^{k}} < 1$ 恆成立
假設 n 的範圍是 unsigned int 則 k 取 32 可以保證
$\frac{n}{2^{k}} < 1$

比較 jemalloc 的快速除法與通用除法運算

編譯器最佳化程度由 O0 到 O3 都做測試
測量 10000 次畫出時間分佈圖

測試結果：

編譯器選項 -O0
編譯器選項 -O1
編譯器選項 -O2
編譯器選項 -O3

\to

可發現 jemalloc 的除法運算較快。

fastdiv (部分程式碼遮蔽) 運用上述技巧，提出較通用除法快的實作。以下是測試程式碼:

#include <assert.h>
#include <stdio.h>

#include "fastdiv.h"

int main(int argc, char **argv)
{
    const uint64_t MAX_NUM = 1 << 16;
    const uint64_t MAX_DIV = 1 << 14;

    for (uint64_t d = 1U; d < MAX_DIV; ++d) {
        div_t fd;        /* general method */
        fastdiv(&fd, d); /* divide-by-d */
        const uint64_t mul = fd.mul, add = fd.add, shift = fd.shift;

        div_t fd2;                         /* bounded method optimization */
        fastdiv_bounded(&fd2, d, MAX_DIV); /* divide-by-d */
        const uint64_t mul2 = fd2.mul, add2 = fd2.add, shift2 = fd2.shift;
        assert(mul2 == 1U || add2 == 0U || shift2 == 0U);
        assert(add2 == 0U);
        for (uint64_t num = 0; num < MAX_NUM; ++num) {
            const uint64_t normal = num / d;
            const uint64_t general = (num * mul + add) >> shift;
            const uint64_t bounded = (num * mul2) >> shift2;
            assert(general == normal);
            assert(bounded == normal);
        }
    }
}

請補完程式碼，使 assert 巨集不會在執行時期觸發錯誤並運作符合預期。作答規範:

AAAA, BBBB, CCCC, DDDD 皆為整數
不該存在小括號 (即 ( 和 ))，以最精簡的形式書寫

延伸問題:

解釋上述程式碼運作原理，設計實驗來比較除法和 fastdiv 的效率，並試圖解釋其行為
以 jemalloc 為例，解說快速除法運算的應用場景。嘗試在 Linux 核心原始程式碼找出類似的技巧
GCC Bug #89845 提及針對整數除法的加速機制，參照到 Faster Remainder by Direct Computation Applications to Compilers and Software Libraries，嘗試解讀其手段並舉例說明
libdivine 提供更進階的除法加速機制，嘗試解釋其原理

測驗 `4`

延伸第 5 週測驗題，我們想開發一套快速的 heap allocator，其記憶體開銷約為 1.67%，並比照第 5 週提到的 TLSF，嘗試建立時間複雜度為

O (1)

的實作，注意只支援 32 位元的定址空間，亦即

2^{32} = 4

GiB，內部採用 best-fit 演算法。

參考執行輸出:

This 268435456 bytes heap requires 824 bytes for its base structure plus 4473928 bytes (1.67%) for bookkeeping.
There are 91 base sizes.
allocated 1048576 times (16 + 32 + 64 + 128 + 16) bytes
freed them all.
Allocated 2048 times 69392 bytes.
Freed them all.
Allocated 16777216 times 16 bytes.
Freed them all.
Allocated 8388608 times 24 bytes.
Freed them all.
Allocated 8388608 times 32 bytes.
Freed them all.
Allocated 5242880 times 48 bytes.
Freed them all.
Allocated 4194304 times 61 bytes.
Freed them all.
Allocated 3145728 times 65 bytes.
Freed them all.
Allocated 3145728 times 79 bytes.
Freed them all.
Allocated 3145728 times 80 bytes.
Freed them all.
Allocated 2097152 times 81 bytes.
Freed them all.
Allocated 32768 times 4368 bytes.
Freed them all.
Allocated 524288 times 345 bytes.
Freed them all.

其原始程式碼可見 heap.c (部分遮蔽)。請補完程式碼，使其運作符合預期。作答規範:

EEEE, FFFF 皆為整數
GGGG 為 hexadecimal literal
以最精簡的形式撰寫

延伸問題:

解釋上述程式碼運作原理，應探討何以
$O (1)$ 時間複雜度可落實
Linux 核心一度具備同為
$O (1)$ 時間複雜度的 xvmalloc，不過隨後就被 zsmalloc 取代，請依據 git log 和 LWN 等第一手材料，探討核心開發者的考量因素

2023q1 第 6 週測驗題

測驗 1

測驗 2

測驗 3

測驗 4

測驗 `1`

測驗 `2`

測驗 `3`

測驗 `4`