Try   HackMD

2023q1 Homework3 (fibdrv)

tags: linux2023

contributed by < xueyang0312 >

前期準備

  • gcc 版本
$ gcc --version gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
  • 檢查 Linux 核心版本
$ uname -r 5.15.0-60-generic
  • 確認 linux-headers 套件已正確安裝於開發環境
$ dpkg -L linux-headers-5.15.0-60-generic| grep -i "/lib/modules" /lib/modules /lib/modules/5.15.0-60-generic /lib/modules/5.15.0-60-generic/build
  • 檢驗目前的使用者身份
$ whoami xueyang $ sudo whoami root

自我檢查清單

  • 研讀上述 Linux 效能分析的提示 描述,在自己的實體電腦運作 GNU/Linux,做好必要的設定和準備工作
    從中也該理解為何不希望在虛擬機器中進行實驗;
  • 研讀上述費氏數列相關材料 (包含論文),摘錄關鍵手法,並思考 clz / ctz 一類的指令對 Fibonacci 數運算的幫助。請列出關鍵程式碼並解說
  • 複習 C 語言 數值系統bitwise operation,思考 Fibonacci 數快速計算演算法的實作中如何減少乘法運算的成本;
  • 學習指出針對大數運算的加速運算和縮減記憶體操作成本的舉措,紀錄你的認知和疑惑
  • 注意到 fibdrv.c 存在著 DEFINE_MUTEX, mutex_trylock, mutex_init, mutex_unlock, mutex_destroy 等字樣,什麼場景中會需要呢?撰寫多執行緒的 userspace 程式來測試,觀察 Linux 核心模組若沒用到 mutex,到底會發生什麼問題。嘗試撰寫使用 POSIX Thread 的程式碼來確認。
    搭配閱讀〈並行和多執行緒程式設計

計算 F93 (包含) 之後的 Fibonacci 數

數字字串加法

定義結構體 str_t, 以一個大小為 128 的字串來存計算結果。

typedef struct str {
    char numberStr[128];
} str_t;

add_str 是將 x + y 的值加起來,並將結果存到 result

static void add_str(char *x, char *y, char *result)
{
    size_t size_x = strlen(x), size_y = strlen(y);
    int i, sum, carry = 0;
    if (size_x > size_y) {
        for (i = 0; i < size_y; i++) {
            sum = (x[i] - '0') + (y[i] - '0') + carry;
            result[i] = '0' + sum % 10;
            carry = sum / 10;
        }

        for (i = size_y; i < size_x; i++) {
            sum = (x[i] - '0') + carry;
            result[i] = '0' + sum % 10;
            carry = sum / 10;
        }
    } else {
        for (i = 0; i < size_x; i++) {
            sum = (x[i] - '0') + (y[i] - '0') + carry;
            result[i] = '0' + sum % 10;
            carry = sum / 10;
        }

        for (i = size_x; i < size_y; i++) {
            sum = (y[i] - '0') + carry;
            result[i] = '0' + sum % 10;
            carry = sum / 10;
        }
    }

    if (carry)
        result[i++] = '0' + carry;
    result[i] = '\0';
}

先都以反轉的字串來處理,最後在將結果反轉回來
例如 0, 1, 1, 2, 3, 5, 8, 31, 12…

static long long fib_sequence(long long k, char *buf)
{
    /* FIXME: C99 variable-length array (VLA) is not allowed in Linux kernel. */
    // GFP_KERNEL is a flag used for memory allocation in the Linux kernel.
    str_t *f = kmalloc((k + 2) * sizeof(str_t), GFP_KERNEL);
    strncpy(f[0].numberStr, "0", 1);
    f[0].numberStr[1] = '\0';

    strncpy(f[1].numberStr, "1", 1);
    f[1].numberStr[1] = '\0';

    for (int i = 2; i <= k; i++) {
        add_str(f[i - 1].numberStr, f[i - 2].numberStr, f[i].numberStr);
    }
    size_t retSize = strlen(f[k].numberStr);
    reverse_str(f[k].numberStr, retSize);
    __copy_to_user(buf, f[k].numberStr, retSize);
    return retSize;
}

計算到 F500 (The first 500 Fibonacci numbers ) 也是正確的,結果如下:

$ sudo ./client
Reading from /dev/fibonacci at offset 499, returned the sequence 86168291600238450732788312165664788095941068326060883324529903470149056115823592713458328176574447204501.
Reading from /dev/fibonacci at offset 500, returned the sequence 139423224561697880139724382870407283950070256587697307264108962948325571622863290691557658876222521294125.

加速 Fibonacci 運算 「Fast Doubling」

fast_doubling_recursive

應用 Fast Doubling 手法,實做遞迴版本:

static long long fast_doubling_recursive(long long k, long long *f)
{
    if (k <= 2)
        return (k > 0) ? (f[k] = 1) : (f[k] = 0);

    int x = k / 2;
    long long a = fast_doubling_recursive(x + 1, f);
    long long b = fast_doubling_recursive(x, f);
    f[k] = (k % 2) ? a * a + b * b : b * (2 * a - b);
    return f[k];
}

static long long fib_sequence_fast_doubling(long long k)
{
    long long *f = kmalloc((k + 2) * sizeof(long long), GFP_KERNEL);
    memset(f, 0, (k + 2) * sizeof(long long));
    fast_doubling_recursive(k, f);

    return f[k];
}

搭配 bitwise 運算開發遞迴版的程式碼:


static long long fib_sequence_fast_doubling(long long k)
{
    if (k <= 2)
        return !!k;

    // fib(2n) = fib(n) * (2 * fib(n+1) − fib(n))
    // fib(2n+1) = fib(n) * fib(n) + fib(n+1) * fib(n+1)
    long long a = fib_sequence_fast_doubling(k >> 1);
    long long b = fib_sequence_fast_doubling((k >> 1) + 1);

    if (k & 1)
        return a * a + b * b;
    return a * ((b << 1) - a);
}

fast_doubling_iterative_clz

利用硬體指令 __builtin_clzll 計算出 k 開頭共有幾個 0,在以 63 - __builtin_clzll 為迴圈初始化。

  1. 從最高位元的 1 開始,此時
    n=1
    ,而:
    • fib(n)=1
    • fib(n+1)=1
  2. 若下一個位元不存在的話跳到第 3 步,否則(假設目前為
    n=k
    ):
    • 透過
      fib(k)
      以及
      fib(k+1)
      計算
      fib(2k)
      以及
      fib(2k+1)
    • 檢查下一個位元:
      • 0:
        n=2k
      • 1:
        n=2k+1
        ,此時需要
        fib(n+1)
        讓下一迭代能夠計算
        fib(2n)
        以及
        fib(2n+1)
    • 回到第 2 步
  3. 此時
    n
    為目標數,回傳
    fib(n)
static long long fib_sequence_fast_doubling_iterative(long long k)
{
    if (k <= 2)
        return !!k;
    
    uint8_t count = 63 - __builtin_clzll(k);
    uint64_t fib_n0 = 1, fib_n1 = 1;

    for (uint64_t i = count, fib_2n0,fib_2n1; i-- > 0;) {
        fib_2n0 = fib_n0 * ((fib_n1 << 1) - fib_n0);
        fib_2n1 = fib_n0 * fib_n0 + fib_n1 * fib_n1;

        if (k & (1UL << i)) {
            fib_n0 = fib_2n1; // 2k
            fib_n1 = fib_2n0 + fib_2n1; // 2K + 1
        } else {
            fib_n0 = fib_2n0;
            fib_n1 = fib_2n1;
        }
    }

    return fib_n0;
}

效能比較

可以看到 StringAdd 的方法,n 越大所耗費時間越多,因為需要透過 __copy_to_user 來將大數回傳到 user buffer。經過硬體加速指令 __builtin_clzll 所實做的 Fast_doubling_iterative_with_clz 版本,在四種實做方法所耗費時間是最少的。

但由於 long long 只有 64 bit 的關係, 我們分析的數據只能從 n = 0 ~ 92, 因此需要整合 bigNum 來增加數據的範圍。

Bignum 與 Fast doubling 結合

Bignum Data structure

typedef struct _bn {
    unsigned int *number;
    unsigned int size;
    int sign;
} bn;
  • number - 指向儲存的數值,之後會以 array 的形式來取用
  • size - 配置的記憶體大小,單位為 sizeof(unsigned int)
  • sign - 0 為正數、1 為負數

令一個 unsigned int *number 的數字陣列來儲存,unsigned int 最大值為 4294967295
當超過這個最大值該如何表示? 例如:4294967316

4294967316=1×(232)1+20×(232)0

所以,利用 bn 資料結構來計算大數運算,並且可以表示超過 unsigned long long int 的數字

2641

由於大數沒辦法直接以數值的形式列出,這裡改用字串來呈現,轉換的部分利用 ASCII 的特性來組合出 10 進位字串

char *bn_to_string(const bn *src)
{
    // log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
    // 2 is `+` or `-` ; sign is `-`.
    size_t len = (8 * sizeof(int) * src->size) / 3 + 2 + src->sign;
    char *s = kmalloc(len, GFP_KERNEL);
    char *p = s;

    memset(s, '0', len - 1);
    s[len - 1] = '\0';

    // iterate through each digit of the binary number from MSB to LSB
    for (int i = src->size - 1; i >= 0; i--) {
        for (unsigned int d = 1U << 31; d; d >>= 1) {
            // binary -> decimal string
            int carry = !!(d & src->number[i]);
            for (int j = len - 2; j >= 0; j--) {
                carry = 2 * (s[j] - '0') + carry;
                s[j] = "0123456789"[carry % 10];
                carry /= 10;
                if (!s[j] && !carry)
                    break;
            }
        }
    }

    while (p[0] == '0' && p[1] != '\0') {
        p++;
    }

    if (src->sign) {
        *(--p) = '-';
    }

    memmove(s, p, strlen(p) + 1);
    return s;
}

根據 Fast Doubling 最後結論為:

Given

F(k) and
F(k+1)
, we can calculate these

F(2k)=F(k)[2F(k+1)F(k)]
F(2k+1)=F(k)2+F(k+1)2

所以,必須實做以 bn 為資料結構的大數運算:加法、減法、乘法、左移

加法、減法

一開始會先比較 absign 若爲一樣,則直接透過 bn_do_add static function 將 ab 相加;若不一樣,則先比較兩者大小,並且確保 a

> b ,再透過 bn_do_sub static function 相減。

減法會將 b 直接變號,再透過 bn_add function 來處理相減結果。

void bn_add(const bn *a, const bn *b, bn *c)
{
    if (a->sign == b->sign) {
        // both positive and negative
        bn_do_add(a, b, c);
        c->sign = a->sign;
    } else {
        // different sign

        // a > 0, b < 0
        if (a->sign)
            SWAP(a, b);

        int cmp = bn_cmp(a, b);

        if (cmp > 0) {
            /* |a| > |b| and b < 0, hence c = a - |b| */
            bn_do_sub(a, b, c);
            c->sign = 0;
        } else if (cmp < 0) {
            /* |a| < |b| and b < 0, hence c = -(|b| - |a|) */
            bn_do_sub(b, a, c);
            c->sign = 1;
        } else {
            /* |a| == |b| */
            bn_resize(c, 1);
            c->number[0] = 0;
            c->sign = 0;
        }
    }
}

void bn_sub(const bn *a, const bn *b, bn *c)
{
    bn tmp = *b;
    tmp.sign ^= 1;  // a - b = a + (-b)
    bn_add(a, &tmp, c);
}

比較出 ab size 的最大值,並將結果 c 透過 bn_resize function 來重新分配大小,由於 numberpointer to unsigned int ,所以會直接 truncate carry

bit32 ~
bit63
,再將 carry 又移 32 bits。

bn_do_add 實作

/* |c| = |a| + |b| */
static void bn_do_add(const bn *a, const bn *b, bn *c)
{
    // max digits = max(sizeof(a) + sizeof(b)) + 1
    int d = MAX(bn_msb(a), bn_msb(b)) + 1;
    d = DIV_ROUNDUP(d, 32) + !d;
    bn_resize(c, d);

    unsigned long long carry = 0;
    for (int i = 0; i < c->size; i++) {
        unsigned int tmp1 = (i < a->size) ? a->number[i] : 0;
        unsigned int tmp2 = (i < b->size) ? b->number[i] : 0;

        carry += (unsigned long long) tmp1 + tmp2;
        c->number[i] = carry;
        carry >>= 32;
    }

    if (!c->number[c->size - 1] && c->size > 1)
        bn_resize(c, c->size - 1);
}

如果 carry

<0,則將 carry +
232
來確保
number[i]<2321
,並將 carry 設為 1,表示借位。

bn_do_sub 實作

/* |c| = |a| - |b|
 *  |a| > |b| must be true
 */
static void bn_do_sub(const bn *a, const bn *b, bn *c)
{
    // max digits = max(sizeof(a) + sizeof(b)) + 1
    int d = MAX(a->size, b->size);
    bn_resize(c, d);

    long long int carry = 0;
    for (int i = 0; i < c->size; i++) {
        unsigned int tmp1 = (i < a->size) ? a->number[i] : 0;
        unsigned int tmp2 = (i < b->size) ? b->number[i] : 0;

        carry = (long long int) tmp1 - tmp2 - carry;
        if (carry < 0) {
            c->number[i] = carry + (1LL << 32);
            carry = 1;
        } else {
            c->number[i] = carry;
            carry = 0;
        }
    }

    d = bn_clz(c) / 32;
    if (d == c->size)
        --d;
    bn_resize(c, c->size - d);
}

乘法

/* c = a * b
 *
 */
void bn_mul(const bn *a, const bn *b, bn *c)
{
    // max digits = sizeof(a) + sizeof(b)
    int d = bn_msb(a) + bn_msb(b);
    d = DIV_ROUNDUP(d, 32) + !d;
    bn *tmp;

    if (c == a || c == b) {
        tmp = c;
        c = bn_alloc(d);
    } else {
        tmp = NULL;
        bn_resize(c, d);
    }

    for (int i = 0; i < a->size; i++) {
        for (int j = 0; j < b->size; j++) {
            unsigned long long int carry = 0;
            carry = (unsigned long long int) a->number[i] * b->number[j];
            bn_mul_add(c, i + j, carry);
        }
    }

    c->sign = a->sign ^ b->sign;

    if (tmp) {
        bn_cpy(tmp, c);
        bn_free(c);
    }
}

bn_mul_add 實作

static void bn_mul_add(bn *c, int offset, unsigned long long int x)
{
    unsigned long long int carry = 0;
    for (int i = offset; i < c->size; i++) {
        carry += c->number[i] + (x & 0xFFFFFFFF);
        c->number[i] = carry;
        carry >>= 32;
        x >>= 32;
        if (!x && !carry)
            return;
    }
}

左移

void bn_lshift(const bn *src, size_t offset)
{
    size_t z = bn_clz(src);
    offset %= 32;  // only handle offset within 32 bits atm
    if (!offset)
        return;

    if (offset > z)
        bn_resize(src, src->size + 1);
    /* bit shift */
    for (int i = src->size - 1; i > 0; i--)
        src->number[i] =
            src->number[i] << offset | src->number[i - 1] >> (32 - offset);
    src->number[0] <<= offset;
}

相關操作

SWAP

#ifndef SWAP
#define SWAP(x, y)           \
    do {                     \
        typeof(x) __tmp = x; \
        x = y;               \
        y = __tmp;           \
    } while (0)
#endif

Divide round up

#ifndef DIV_ROUNDUP
#define DIV_ROUNDUP(x, len) (((x) + (len) -1) / (len))
#endif

Comparation of bn

/* compare length
 *  if |a| > |b| , return 1
 *  if |a| < |b| , return -1
 *  if |a| = |b| , return 0
 */
static int bn_cmp(const bn *a, const bn *b)
{
    if (a->size > b->size)
        return 1;
    else if (a->size < b->size)
        return -1;
    else {
        for (int i = a->size - 1; i >= 0; i--) {
            if (a->number[i] > b->number[i])
                return 1;
            if (a->number[i] < b->number[i])
                return -1;
        }
        return 0;
    }
}

Resize of bn

/* sizeof BigNum */
int bn_resize(bn *src, size_t size)
{
    if (!src)
        return -1;
    if (size == src->size)
        return 0;
    if (size == 0)
        return bn_free(src);

    src->number = krealloc(src->number, sizeof(int) * size, GFP_KERNEL);
    if (!src->number)
        return -1;
    if (size > src->size)
        memset(src->number + src->size, 0, sizeof(int) * (size - src->size));
    src->size = size;
    return 0;
}

Count leading zero of bn

static int bn_clz(const bn *src)
{
    int cnt = 0;
    for (int i = src->size - 1; i >= 0; i--) {
        if (src->number[i]) {
            cnt += __builtin_clz(src->number[i]);
            return cnt;
        } else
            cnt += 32;
    }

    return cnt;
}

Allocate size of bn

bn *bn_alloc(size_t size)
{
    bn *new = (bn *) kmalloc(sizeof(bn), GFP_KERNEL);
    new->number = kmalloc(sizeof(unsigned int) * size, GFP_KERNEL);
    memset(new->number, 0, sizeof(unsigned int) * size);
    new->size = size;
    new->sign = 0;
    return new;
}

Initialization bn

void bn_init(bn *src, size_t size, unsigned int value)
{
    src->number = kmalloc(sizeof(unsigned int) * size, GFP_KERNEL);
    src->number[0] = value;
    src->size = size;
    src->sign = 0;
}

bn_fib_fast_doubling_recursive

static bn bn_fib_helper(long long k, bn *fib, bn *c)
{
    if (k <= 2) {
        long long tmp = k;
        bn_init(&fib[k], 1, !!tmp);
        return fib[k];
    }

    bn a = bn_fib_helper((k >> 1), fib, c);
    bn b = bn_fib_helper((k >> 1) + 1, fib, c);

    bn_init(&fib[k], 1, 0);
    bn_init(&c[0], 1, 0);
    bn_init(&c[1], 1, 0);

    if (k & 1) {
        bn_mul(&a, &a, &c[0]);          // c0 = a * a
        bn_mul(&b, &b, &c[1]);          // c1 = b * b
        bn_add(&c[0], &c[1], &fib[k]);  // fib[k] = a * a + b * b
    } else {
        bn_cpy(&c[0], &b);
        bn_lshift(&c[0], 1);         // c0 = 2 * b
        bn_sub(&c[0], &a, &c[1]);    // c1 = 2 * b - a
        bn_mul(&a, &c[1], &fib[k]);  // fib[k] = a * (2 * b - a)
    }

    return fib[k];
}

static long long bn_fib_fast_doubling_recursive(long long k, char *buf)
{
    bn *fib = (bn *) kmalloc((k + 2) * sizeof(bn), GFP_KERNEL);
    bn *c = (bn *) kmalloc(2 * sizeof(bn), GFP_KERNEL);
    bn_fib_helper(k, fib, c);
    char *ret = bn_to_string(&fib[k]);
    size_t retSize = strlen(ret);
    __copy_to_user(buf, ret, retSize);
    return retSize;
}

bn_fib_iterative

static long long bn_fib_iterative(unsigned int n, char *buf)
{
    bn *dest = bn_alloc(1);
    if (n <= 2) {  // Fib(0) = 0, Fib(1) = 1
        dest->number[0] = !!n;
        return 1;
    }

    bn *a = bn_alloc(1);
    bn *b = bn_alloc(1);
    dest->number[0] = 1;

    for (unsigned int i = 1; i < n; i++) {
        bn_cpy(b, dest);        // b = dest
        bn_add(dest, a, dest);  // dest += a
        bn_swap(a, b);          // SWAP(a, b)
    }
    bn_free(a);
    bn_free(b);
    char *ret = bn_to_string(dest);
    bn_free(dest);

    size_t retSize = strlen(ret);
    __copy_to_user(buf, ret, retSize);
    return retSize;
}

bn_fib_fast_doubling_iterative_clz

static long long bn_fib_fast_doubling_iterative_clz(long long k, char *buf)
{
    bn *f1 = bn_alloc(1);
    if (k <= 2) {  // Fib(0) = 0, Fib(1) = 1
        f1->number[0] = !!k;
        char *ret = bn_to_string(f1);
        size_t retSize = strlen(ret);
        __copy_to_user(buf, ret, retSize);
        bn_free(f1);
        return retSize;
    }

    bn *f2 = bn_alloc(1);
    f1->number[0] = 1;  // fib[k]
    f2->number[0] = 1;  // fib[k+1]

    bn *k1 = bn_alloc(1);
    bn *k2 = bn_alloc(1);

    uint8_t count = 63 - __builtin_clzll(k);

    for (uint64_t i = count; i-- > 0;) {
        // fib[2k] = fib[k] * (fib[k + 1] * 2 - fib[k]);
        bn_cpy(k1, f2);
        bn_lshift(k1, 1);
        bn_sub(k1, f1, k1);
        bn_mul(f1, k1, k1);
        // fib[2k] = fib[k] * fib[k] + fib[k+1] * fib[k+1]

        bn_mul(f1, f1, f1);
        bn_mul(f2, f2, f2);
        bn_add(f1, f2, k2);

        if (k & (1UL << i)) {
            bn_cpy(f1, k2);  // 2k
            bn_add(k1, k2, f2);
        } else {
            bn_cpy(f1, k1);
            bn_cpy(f2, k2);
        }
    }

    char *ret = bn_to_string(f1);
    size_t retSize = strlen(ret);
    __copy_to_user(buf, ret, retSize);

    bn_free(k1);
    bn_free(k2);
    bn_free(f2);
    bn_free(f1);

    return retSize;
}

Linux 效能分析

引入 ktime API

修改 fib_read 以及 fib_write,在 fib_read 會呼叫 fib_time_proxy ,並且根據 mode 來計算指定方法的時間,並且在 fib_write 回傳測量時間。

static long long fib_time_proxy(long long k, char *buf, int mode)
{
    long long result = 0;
    switch (mode) {
    case 0:
        kt = ktime_get();
        result = fib_sequence_basic(k);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 1:
        kt = ktime_get();
        result = fib_sequence_string_add(k, buf);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 2:
        kt = ktime_get();
        result = fib_sequence_fast_doubling_recursive(k);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 3:
        kt = ktime_get();
        result = fib_sequence_fast_doubling_iterative(k);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 4:
        kt = ktime_get();
        result = bn_fib_fast_doubling_recursive(k, buf);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 5:
        kt = ktime_get();
        result = bn_fib_iterative(k, buf);
        kt = ktime_sub(ktime_get(), kt);
        break;
    case 6:
        kt = ktime_get();
        bn_fib_fast_doubling_iterative_clz(k, buf);
        kt = ktime_sub(ktime_get(), kt);
    default:
        break;
    }

    return result;
}
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    return (ssize_t) fib_time_proxy(*offset, buf, 6);
}

/* write operation is skipped */
static ssize_t fib_write(struct file *file,
                         const char *buf,
                         size_t size,
                         loff_t *offset)
{
    return ktime_to_ns(kt);
}

下圖為遞迴版本的 bn_fib_fast_doubling_iterative_clz 運算:

查看行程的 CPU affinity

  • 可使用 taskset 加上 -p 參數再加上該行程的 PID
  • 加上 -c 參數,讓 taskset 直接輸出 CPU 的核心列表
$ taskset -cp 1 pid 1's current affinity list: 0-15

限定 CPU 給特定的程式使用

  • 參考 KYWeng ,修改 /etc/default/grub 內的GRUB_CMDLINE_LINUX_DEFAULT 尾端,加入 isolcpus=15 來指定編號 15 的核心不被排程器賦予任務,修改完成後需 sudo update-grub 來更新設定,重開機後即生效
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=15"
  • 觀察 PID 1 的 affinity list 不包含該編號 7 的 CPU
taskset -cp 1 pid 1's current affinity list: 0-14
  • 也可以觀察 /sys/devices/system/cpu/isolated 是否為指定編號 7
cat /sys/devices/system/cpu/isolated 15

將程式固定在特定的 CPU 中執行

  • 先透過 insmod 將模組載入核心
$ sudo insmod fibdrv.ko

透過指定處理器親和性 (process affinity),可以將程式固定在特定的 CPU 中執行

$ sudo taskset -c 15 ./client

排除干擾效能分析的因素

sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
  • 設定 scaling_governor 為 performance。準備以下 shell script,檔名為 performance.sh :
for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor do echo performance > ${i} done
  • 之後再用 $ sudo sh performance.sh 執行

  • 針對 Intel 處理器,關閉 turbo mode

sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"

用統計手法去除極端值

Pytohn script實做以及結果
4ce43c4

優化 bignum 運算成本

Perf

  • perf stat

擷取自 ChatGPT 解釋

In perf stat, the -r option is used to specify the number of times to repeat the command or application and collect performance statistics.

And the -e option is used to specify the hardware performance events to monitor during the execution of a command or application. These events can include CPU cycles, cache misses, branch instructions, page faults, and more.

To use -e, you must specify the name of the hardware event you want to monitor using its event code. You can obtain a list of available events and their corresponding event codes by running the perf list command.

輸入 perf list 來查看有哪些 event 可以觀察

$ perf list
List of pre-defined events (to be used in -e):

  duration_time                                      [Tool event]

  branch-instructions OR cpu/branch-instructions/    [Kernel PMU event]
  branch-misses OR cpu/branch-misses/                [Kernel PMU event]
  bus-cycles OR cpu/bus-cycles/                      [Kernel PMU event]
  cache-misses OR cpu/cache-misses/                  [Kernel PMU event]
  cache-references OR cpu/cache-references/          [Kernel PMU event]
  cpu-cycles OR cpu/cpu-cycles/                      [Kernel PMU event]
  instructions OR cpu/instructions/                  [Kernel PMU event]
  mem-loads OR cpu/mem-loads/                        [Kernel PMU event]
  mem-stores OR cpu/mem-stores/                      [Kernel PMU event]
  ref-cycles OR cpu/ref-cycles/                      [Kernel PMU event]
  topdown-fetch-bubbles OR cpu/topdown-fetch-bubbles/ [Kernel PMU event]
  topdown-recovery-bubbles OR cpu/topdown-recovery-bubbles/ [Kernel PMU event]
:

優化 bn_fib_fast_doubling_iterative_clz

先利用 perf stat 紀錄尚未優化的效能

$ perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-instructions,branch-misses ./client
Performance counter stats for './client' (10 runs):

       9,2613,2781      cycles                                                        ( +-  0.04% )  (65.50%)
      16,4274,2825      instructions              #    1.77  insn per cycle           ( +-  0.03% )  (82.98%)
            5,4001      cache-misses              #    4.526 % of all cache refs      ( +-  4.62% )  (83.77%)
          114,4262      cache-references                                              ( +-  2.54% )  (83.77%)
       2,2879,8664      branch-instructions                                           ( +-  0.01% )  (83.77%)
     <not counted>      branch-misses                                                 (0.00%)

         0.0016970 +- 0.0000221 seconds time elapsed  ( +-  1.30% )

Some events weren't counted. Try disabling the NMI watchdog:
        echo 0 > /proc/sys/kernel/nmi_watchdog
        perf stat ...
        echo 1 > /proc/sys/kernel/nmi_watchdog

branch-misses 並沒有被計算到,根據 ChatGPT 解釋,根據提示先 disableing NMI watchdog。

The NMI(Non-Maskable Interrupt) watchdog is a mechanism in the Linux kernel that monitors the system for hangs or freezes. When it detects such a condition, it triggers an NMI interrupt, which cannot be masked or ignored by the system. This interrupt takes priority over other interrupts and can cause perf to miss some events.

尚未優化的 bn_fib_fast_doubling_iterative_clz 目前效能如下:

Performance counter stats for './client' (10 runs):

       9,2613,2781      cycles                                                        ( +-  0.04% )  (65.50%)
      16,4274,2825      instructions              #    1.77  insn per cycle           ( +-  0.03% )  (82.98%)
            5,4001      cache-misses              #    4.526 % of all cache refs      ( +-  4.62% )  (83.77%)
          114,4262      cache-references                                              ( +-  2.54% )  (83.77%)
       2,2879,8664      branch-instructions                                           ( +-  0.01% )  (83.77%)
         1312,9651      branch-misses             #    5.73% of all branches          ( +-  0.04% )  (83.18%)

          0.321210 +- 0.000144 seconds time elapsed  ( +-  0.04% )

探討 mutex 對 linux kernel module 影響

  • 注意到 fibdrv.c 存在著 DEFINE_MUTEX, mutex_trylock, mutex_init, mutex_unlock, mutex_destroy 等字樣,什麼場景中會需要呢?撰寫多執行緒的 userspace 程式來測試,觀察 Linux 核心模組若沒用到 mutex,到底會發生什麼問題。嘗試撰寫使用 POSIX Thread 的程式碼來確認。
    搭配閱讀〈並行和多執行緒程式設計

在 fibdrv.c 中,分別在 fib_open 以及 fib_release 使用 mutex_trylock 以及 mutex_unlock
mutex_trylock 是 linux kernel 提供的 mutex API, 並不像 mutex_lock 一樣會 block 住,而是會立即 return 成功或者失敗

static int fib_open(struct inode *inode, struct file *file)
{
    if (!mutex_trylock(&fib_mutex)) {
        printk(KERN_ALERT "fibdrv is in use");
        return -EBUSY;
    }
    return 0;
}
static int fib_release(struct inode *inode, struct file *file)
{
    mutex_unlock(&fib_mutex);
    return 0;
}

當有 2 個 thread or process 以上嘗試同時 open fib character device 的話,只有一個能成功 open,只有當成功 open 的 thread or process,關閉 file descriptor,其他 thread or process 才能再次開啟。

以下程式來驗證:

void *read_character_device(void *arg)
{
    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    char read_buf[100];
    for (int i = 82; i < OFFSET; i++) {
        lseek(fd, i, SEEK_SET);
        long long sz = read(fd, read_buf, sizeof(read_buf));
        printf("Fib(%d):%lld\r\n",i ,sz);
    }
    close(fd);
}

int main()
{
    pthread_t worker_thread[2];
    pthread_create(&worker_thread[0], NULL, read_character_device, NULL);
    pthread_create(&worker_thread[1], NULL, read_character_device, NULL);
    for(int i = 0; i < 2; i++)
        pthread_join(worker_thread[i], NULL);

    return 0;
}

可以觀察到第二個 thread 無法成功 open,因為第一個 thread 已經取得 mutex 了

Fib(82):61305790721611591
Fib(83):99194853094755497
Fib(84):160500643816367088
Fib(85):259695496911122585
Fib(86):420196140727489673
Fib(87):679891637638612258
Fib(88):1100087778366101931
Fib(89):1779979416004714189
Failed to open character device: Device or resource busy
Fib(90):2880067194370816120
Fib(91):4660046610375530309

查看 dmesg

dmesg

...
[293052.774656] fibdrv is in use

若將 file descriptor 讓 thread 共享,所以 struct file 當中的 f_pos 會成為共用資源。

static int fd;

void *read_character_device(void *arg)
{

    char read_buf[100];
    int thr_number = *((int *) arg);
    char read_buf[100];
    for (int i = 82; i < OFFSET; i++) {
        lseek(fd, i, SEEK_SET);
        sleep(thr_number);
        long long sz = read(fd, read_buf, sizeof(read_buf));
        printf("Thread %d Fib(%d):%lld\r\n", thr_number, i , sz);
    }
}

int main()
{
    fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }
    pthread_t worker_thread[2];
    int thread_number_1 = 1;
    pthread_create(&worker_thread[0], NULL, read_character_device, (void *) &thread_number_1);
    int thread_number_2 = 2;
    pthread_create(&worker_thread[1], NULL, read_character_device, (void *) &thread_number_2);
    for(int i = 0; i < 2; i++)
        pthread_join(worker_thread[i], NULL);
    
    close(fd);
    return 0;
}

thread 2 以 lseek 設定好目標 offset 後會休眠 2 秒,但在休眠的期間 thread 1 又更改了 offset,導致 thread 進行 read 時使用的 offset 不是當初設定的數值,產生 race condition。

$ sudo ./client

Thread 1 Fib(82):61305790721611591
Thread 2 Fib(82):99194853094755497
Thread 1 Fib(83):99194853094755497
Thread 1 Fib(84):160500643816367088
Thread 2 Fib(83):259695496911122585
Thread 1 Fib(85):160500643816367088
Thread 1 Fib(86):420196140727489673
Thread 2 Fib(84):679891637638612258

探討 character device

設備號的管理

__init 加入印出MAJOR, MINOR 兩個設備號碼:

static dev_t fib_dev = 0;
static struct cdev *fib_cdev;
static struct class *fib_class;

static int __init init_fib_dev(void)
{
    int rc = 0;

    mutex_init(&fib_mutex);
    mutex_init(&offset_mutex);

    // Let's register the device
    // This will dynamically allocate the major number
    rc = alloc_chrdev_region(&fib_dev, 0, 1, DEV_FIBONACCI_NAME);

    if (rc < 0) {
        printk(KERN_ALERT
               "Failed to register the fibonacci char device. rc = %i",
               rc);
        return rc;
    }
    
    ...
        
    printk("succeeded register char device : %s\n", DEV_FIBONACCI_NAME);
    printk("Major number = %\n", MAJOR(fib_dev));
    return rc;
    
    ...
}

dmesg 查看成功註冊 fibonacci character device 訊息

[208268.352753] succeeded register char device : fibonacci
[208268.352755] Major number = 509

Major number 為 509,這個數值可以對應 $ cat /proc/devices 結果。

Character devices:
  1 mem
  4 /dev/vc/0
  4 tty
  4 ttyS
  5 /dev/tty
  5 /dev/console
  5 /dev/ptmx
  5 ttyprintk
  6 lp
  7 vcs
 10 misc
 13 input
 21 sg
 29 fb
 89 i2c
 99 ppdev
108 ppp
116 alsa
128 ptm
136 pts
180 usb
189 usb_device
202 cpu/msr
...
509 fibonacci
510 aux
511 cec

在 <linux/kdev_t.h> 定義 MAJOR, MINOR macro:

#define MAJOR(dev)	((dev)>>8)
#define MINOR(dev)	((dev) & 0xff)
#define MKDEV(ma,mi)	((ma)<<8 | (mi))

kernel 必須避免兩個設備驅動使用到同一個主設備號的情況,linux kernel 提供兩種 interface function 完成設備號的申請:

int register_chrdev_region(dev_t from, unsigned count, const char *name)

register_chrdev_region() 需要指定主設備號,在使用這 function 前,Programmer 要保證分配的主設備號沒有被人使用。
所以 linux 提供了另一個 function:

int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
			const char *name)

alloc_chrdev_region() 會自動分配一個主設備號,可以避免和系統佔用的主設備號重複。
在卸載 module 時,需要把主設備號釋放給系統:

void unregister_chrdev_region(dev_t from, unsigned count)

Abstract of character device driver

首先,先解釋 struct cdev 資料結構

#ifndef _LINUX_CDEV_H
#define _LINUX_CDEV_H

struct cdev {
	struct kobject kobj;
	struct module *owner;
	const struct file_operations *ops;
	struct list_head list;
	dev_t dev;
	unsigned int count;
} __randomize_layout;

#endif
  • kobj:可被管理的 kernel object,可以是任何一個內核中需要被管理的對象,例如 device、driver、bus 等等。
  • owner:字符設備驅動程序所在的 module pointer。
  • ops:定義一個字符設備的操作函式集合。
  • list:用來將字符設備串成一個linked list。
  • dev:字符設備的設備號碼,由主設備和次設備組成。
  • count:同屬一個主設備號的次設備號的個數
static dev_t fib_dev = 0;
static struct cdev *fib_cdev;
static struct class *fib_class;

const struct file_operations fib_fops = {
    .owner = THIS_MODULE,
    .read = fib_read,
    .write = fib_write,
    .open = fib_open,
    .release = fib_release,
    .llseek = fib_device_lseek,
};

static int __init init_fib_dev(void)
{
    ...
    fib_cdev = cdev_alloc();
    if (fib_cdev == NULL) {
        printk(KERN_ALERT "Failed to alloc cdev");
        rc = -1;
        goto failed_cdev;
    }
    fib_cdev->ops = &fib_fops;
    rc = cdev_add(fib_cdev, fib_dev, 1);

    if (rc < 0) {
        printk(KERN_ALERT "Failed to add cdev");
        rc = -2;
        goto failed_cdev;
    }

    fib_class = class_create(THIS_MODULE, DEV_FIBONACCI_NAME);

    if (!fib_class) {
        printk(KERN_ALERT "Failed to create device class");
        rc = -3;
        goto failed_class_create;
    }
    if (!device_create(fib_class, NULL, fib_dev, NULL, DEV_FIBONACCI_NAME))     {
        printk(KERN_ALERT "Failed to create device");
        rc = -4;
        goto failed_device_create;
    }
failed_device_create:
    class_destroy(fib_class);
failed_class_create:
    cdev_del(fib_cdev);
}
failed_cdev:
    unregister_chrdev_region(fib_dev, 1);
    return rc;

cdev_alloc() : 分配一個記憶體空間,並做初始化。

/**
 * cdev_alloc() - allocate a cdev structure
 *
 * Allocates and returns a cdev structure, or NULL on failure.
 */
struct cdev *cdev_alloc(void)
{
	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
	if (p) {
		INIT_LIST_HEAD(&p->list);
		kobject_init(&p->kobj, &ktype_cdev_dynamic);
	}
	return p;
}

cdev_alloc 也可以替換成: kmalloc 配置記憶體給 fib_cdev ,在呼叫 cdev_init(struct cdev *cdev, const struct file_operations *fops) ,但是 cdev_init 要傳入 file_operations

cdev_add():將 char device 加入至系統。

/**
 * cdev_add() - add a char device to the system
 * @p: the cdev structure for the device
 * @dev: the first device number for which this device is responsible
 * @count: the number of consecutive minor numbers corresponding to this
 *         device
 *
 * cdev_add() adds the device represented by @p to the system, making it
 * live immediately.  A negative error code is returned on failure.
 */
int cdev_add(struct cdev *p, dev_t dev, unsigned count)
{
	int error;

	p->dev = dev;
	p->count = count;

	if (WARN_ON(dev == WHITEOUT_DEV))
		return -EBUSY;

	error = kobj_map(cdev_map, dev, count, NULL,
			 exact_match, exact_lock, p);
	if (error)
		return error;

	kobject_get(p->kobj.parent);

	return 0;
}

到目前為止,尚未在 /dev/ 目錄下建立 device,有兩種方法可以建立

  1. 透過 mknod 手動建立名為 fibonacci 的 device driver
  2. 透過 class_createdevice_create 來在 /dev/ 目錄下建立 fibonacci char device:
    在呼叫 device_create 時,要傳入 struct class 當作 argument,所以必須先 create class, 而這個 class 會放置在 sysfs 下面(/sys/class/fibonacci),
* device_create - creates a device and registers it with sysfs
struct device *device_create(struct class *class, struct device *parent,
			     dev_t devt, void *drvdata, const char *fmt, ...)

就不需要透過 mknod 來手動建立 device driver, 在呼叫 device_create 成功後,就會在 /dev/ 目錄下建立 char device,也就是分配 inode 節點。

總結剛剛所述,在 fibdrv.c 中定義了一系列對於此 char device 的操作:

const struct file_operations fib_fops = {
    .owner = THIS_MODULE,
    .read = fib_read,
    .write = fib_write,
    .open = fib_open,
    .release = fib_release,
    .llseek = fib_device_lseek,
};

定義完操作後變將這些操作關聯到相對應的 char device 上,並註冊 char device 到系統當中,
最後就是建立 device node,讓 user 可以透過對這個 device node 操作,來和 device 互動

Linux Virtual File System Interface

閱讀 Linux 核心設計: 檔案系統概念及實作手法

VFS 定義了在實體檔案系統上更高一層的介面,讓應用程式得以透過 VFS 定義好的介面存取底層資料,不用考慮底層是如何實作。有了 VFS,增加擴充新的檔案系統非常容易,只要實作出 VFS 規範的介面。

linux system call 到 device driver file_ops 過程