Linux 核心專題: fibdrv 改進

執行人: p96114175
GitHub
專題解說錄影

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

提問清單

任務簡述

擴充 fibdrv，強化其並行處理能力，預計達成:

有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷
藉由 hashtable 或 cache 一類的機制，儲存已計算的 Fibonacci 數
引入 workqueue，將運算要求分派給個別 CPU 核，並確保降低非必要的同步處理成本
修訂 fibdrv 和應用程式之間的 API，使其適合用於同步處理

TODO: 儲存已計算的 Fibonacci 數

考慮到大數運算的特性，當以 key-value 形式保存時，不是儲存單純的整數值，而是指向特定結構的指標，於是當 fibdrv 嘗試釋放佔用的記憶體空間時，應有對應的操作
考慮到 fast doubling 和 Fibonacci 數的特性，不用保存連續數值，而是關注第 N 個和第 2N 個 Fibonacci 數的關聯，儘量降低記憶體開銷
應當善用 Linux 核心的 hashtable 或相關的資料結構

引入 workqueue，確保並行處理的效益

學習 ktcp，引入 kthread 和 CMWQ 到 fibdrv，確保 Fibonacci 數的運算可發揮硬體能力
確保並行處理的效益，不僅要確認結果正確，還要讓並行的 fibdrv 得以更有效的運算
思考如何運用 Linux 核心的 RCU (或其他 lock-free 機制) 有效釋放並行處理過程中的記憶體資源

有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷

閱讀大數運算協助計算出 Fibonacci 第一百萬個

以下為欲支援運算的函式，部分參考教材

bn_clz

/* count leading zeros of src*/
static int bn_clz(const bn *src)
{
    int cnt = 0;
    for (int i = src->size - 1; i >= 0; i--) {
        if (src->number[i]) {
            // prevent undefined behavior when src = 0
            cnt += __builtin_clz(src->number[i]);
            return cnt;
        } else {
            cnt += 32;
        }
    }
    return cnt;
}

bn_to_string

該函式用於將 bn 轉換為十進制字串表示

用 (8 * sizeof(int) * src.size) / 3 + 2 + src.sign 算出需要的字元存放空間，再由 kmalloc() 分配這段空間

使用 memset() 對字元陣列 s 所有元素初始化為 0，最後一個元素設置為 \0 表示字串的結束

/* 
 * output bn to decimal string
 * Note: the returned string should be freed with kfree()
 */
char *bn_to_string(bn src)
{
    // log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
    size_t len = (8 * sizeof(int) * src.size) / 3 + 2 + src.sign;
    char *s = kmalloc(len, GFP_KERNEL);
    char *p = s;

    memset(s, '0', len - 1);
    s[len - 1] = '\0';

    for (int i = src.size - 1; i >= 0; i--) {
        for (unsigned int d = 1U << 31; d; d >>= 1) {
            /* binary -> decimal string */
            int carry = !!(d & src.number[i]);
            for (int j = len - 2; j >= 0; j--) {
                s[j] += s[j] - '0' + carry; // double it
                carry = (s[j] > '9');
                if (carry)
                    s[j] -= 10;
            }
        }
    }
    // skip leading zero
    while (p[0] == '0' && p[1] != '\0') { 
        p++;
    }
    if (src.sign)
        *(--p) = '-';
    memmove(s, p, strlen(p) + 1);
    return s;
}

在 fibdrv.c 新增以下內容

#define MAX_LENGTH 1000000
...
/* calc n-th Fibonacci number and save into dest */
void bn_fib(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) { //Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *a = bn_alloc(1);
    bn *b = bn_alloc(1);
    dest->number[0] = 1;

    for (unsigned int i = 1; i < n; i++) {
        bn_swap(b, dest);
        bn_add(a, b, dest);
        bn_swap(a, b);
    }
    bn_free(a);
    bn_free(b);
}
void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) { //Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *f1 = dest; /* F(k) */
    bn *f2 = bn_alloc(1); /* F(k+1) */
    f1->number[0] = 0;
    f2->number[0] = 1;
    bn *k1 = bn_alloc(1);
    bn *k2 = bn_alloc(1);

    for (unsigned int i = 1U << 31; i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        bn_cpy(k1, f2);
        bn_lshift(k1, 1);
        bn_sub(k1, f1, k1);
        bn_mult(k1, f1, k1);
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_mult(f1, f1, f1);
        bn_mult(f2, f2, f2);
        bn_cpy(k2, f1);
        bn_add(k2, f2, k2);
        if (n & i) {
            bn_cpy(f1, k2);
            bn_cpy(f2, k1);
            bn_add(f2, k2, f2);
        } else {
            bn_cpy(f1, k1);
            bn_cpy(f2, k2);
        }
    }
    bn_free(f2);
    bn_free(k1);
    bn_free(k2);
}
...
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn *fib = bn_alloc(1);
    bn_fib_doubling(fib, *offset);
    char *s = bn_to_string(*fib);
    copy_to_user(buf, s, strlen(s) + 1);
    bn_free(fib);
    kfree(s);
    return 0;
}

建立名為 million.c 的檔案，負責接收回傳回來的資料

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <inttypes.h>

#define FIB_DEV "/dev/fibonacci"

int main()
{
    // char buf[1000000];
    char buf[1000000];
    // char write_buf[] = "testing writing";
    int offset = 1000000;

    int fd = open(FIB_DEV, O_RDWR);

    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }
    for (int i = offset; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        read(fd, buf, sizeof(buf));
        // char *p = bn_to_string(buf, len);
        printf("reading from " FIB_DEV
               " at ofset %d, returned the sequence "
               "%s.\n",
               i, buf);
    }
    // free(buf);
    close(fd);
    return 0;
}

執行以下命令

$ gcc million.c -o million
$ sudo perf stat -e cache-misses,cache-references,instructions,cycles ./million

得到以下結果

reading from /dev/fibonacci at ofset 10000, returned the sequence 33644764876431783266621612005107543310302148460680063906564769974680081442166662368155595513633734025582065332680836159373734790483865268263040892463056431887354544369559827491606602099884183933864652731300088830269235......970975069382648706613264507665074611512677522748621598642530711298441182622661057163515069260029861704945425047491378115154139941550671256271197133252763631939606902895650288268608362241082050562430701794976171121233066073310059947366875.
*** stack smashing detected ***: terminated
./million: 中止

 Performance counter stats for './million':

           381,200      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
         3,151,919      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
   762,443,945,177      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
   278,306,077,550      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

      57.240695802 seconds time elapsed

       0.011964000 seconds user
      56.988321000 seconds sys

隨後用 perf 進行 f(0)~f(10000) 分析，初步分析出 bn_to_string 這個函式有問題，佔總執行時間 98.87% ，而 bn_fib_fdoubling 僅占 1.07% ，裏頭有

57.24 s \times 98.87

在執行 bn_to_string

reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921353026792789567010512765782716356.....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.

 Performance counter stats for './million':

            30,774      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
           119,111      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
    22,703,034,346      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
     8,320,050,649      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

       1.690718310 seconds time elapsed

       0.000000000 seconds user
       1.685669000 seconds sys

reading from /dev/fibonacci at ofset 1000000, returned the sequence 19532821287077577316320149475962563324435429965918733969534051945716252578870156947666419876341501461288795243352202360846255109120195602337440154381151966361569199621256428943033701138278006380027674115279274666698655783793188228320612714975832303348548934895725992307229129019282092643316275217308614600179125820426996599360209593392020051848620284024473431398113674187202038684801753185386211128........03373870955680756568226835379339839824880227237703197854614809323023472557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.
Performance counter stats for './million':

           240,028      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
         3,482,714      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
 2,269,015,989,781      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
   832,563,412,079      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

     167.505017193 seconds time elapsed

       0.000000000 seconds user
     167.471991000 seconds sys

上圖 f(100000)、f(1000000) 的結果和 WolframAlpha 的結果一致
f(100000)如下

f(1000000)如下

可將 bn_to_string 搬到使用者層級來處理，讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

好的老師，我試試看

Samples: 235K of event 'cpu_core/cycles/', Event count (approx.): 278841269506
  Children      Self  Command  Shared Object      Symbol
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] __x64_sys_read         ◆
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] ksys_read              ▒
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] vfs_read               ▒
-   99.95%     0.00%  million  [fibdrv]           [k] fib_read               ▒
   - fib_read                                                                ▒
        98.87% bn_to_string                                                  ▒
      + 1.07% bn_fib_fdoubling                                               ▒
    98.87%    98.84%  million  [fibdrv]           [k] bn_to_string           ▒
+    1.07%     0.00%  million  [fibdrv]           [k] bn_fib_fdoubling       ▒
     1.04%     1.04%  million  [fibdrv]           [k] bn_mult                ▒     0.04%     0.00%  million  libc-2.31.so       [.] __printf (inlined)     ▒
     0.04%     0.00%  million  libc-2.31.so       [.] __vfprintf_internal    ▒
     0.04%     0.00%  million  libc-2.31.so       [.] _IO_new_file_xsputn (in▒
     0.04%     0.00%  million  [kernel.kallsyms]  [k] asm_sysvec_apic_timer_i▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] sysvec_apic_timer_inter▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] __sysvec_apic_timer_int▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] hrtimer_interrupt      ▒
     0.03%     0.00%  million  [unknown]          [k] 0x3733353432373336     ▒

將 bn_to_string 搬到使用者層級來處理，讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示

fibdrv.c

以下為調整過後的版本

/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn *fib = bn_alloc(1);
    bn_fib_fdoubling(fib, *offset);
    int len = fib->size;
    copy_to_user(buf, fib->number, sizeof(unsigned int)*len);
    bn_free(fib);
    return len;
}

million.c

以下為調整過後的版本

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

// #include "BigNumber.h"
char *bn_to_string(void *str, size_t size)
{
    // log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
    unsigned int *number = (unsigned int *) str;
    size_t len = (8 * sizeof(int) * size) / 3 + 2;
    char *s = malloc(len);
    char *p = s;

    memset(s, '0', len - 1);
    s[len - 1] = '\0';

    for (int i = size - 1; i >= 0; i--) {
        for (unsigned int d = 1U << 31; d; d >>= 1) {
            /* binary -> decimal string */
            int carry = !!(d & number[i]);
            for (int j = len - 2; j >= 0; j--) {
                s[j] += s[j] - '0' + carry;  // double it
                carry = (s[j] > '9');
                if (carry)
                    s[j] -= 10;
            }
        }
    }
    // skip leading zero
    while (p[0] == '0' && p[1] != '\0') {
        p++;
    }
    // if (src.sign)
    //     *(--p) = '-';
    memmove(s, p, strlen(p) + 1);
    return s;
}

int main()
{
    char buf[100000];
    // char write_buf[] = "testing writing";
    int offset = 100000;

    int fd = open(FIB_DEV, O_RDWR);

    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }
    for (int i = offset; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        int len = read(fd, buf, sizeof(buf));
        char *p = bn_to_string(buf, len);
        printf("reading from " FIB_DEV
               " at ofset %d, returned the sequence "
               "%s.\n",
               i, p);
    }
    close(fd);
    return 0;
}

在million.c 我發現有重複計算的情況，所以我直接將變數 i 調至我們要的目標值，便可減少重複計算的缺點。

執行以下命令:

$ perf stat -e cache-misses,cache-references,instructions,cycles ./million

得到以下結果，可以發現確實改善重複計算的問題

reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.

 Performance counter stats for './million':

            40,088      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
           215,685      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
    62,423,779,622      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
    24,461,981,416      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

       4.914407094 seconds time elapsed

       4.873887000 seconds user
       0.035984000 seconds sys

reading from /dev/fibonacci at ofset 1000000, returned the sequence 1953282128707757731632014947596256332443542996591873396953405194571625257887015694766641987634150146128879524335220236084625510912019560....72557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.

 Performance counter stats for './million':

           923,247      cpu_core/cache-misses/                                        (100.00%)
       134,057,082      cpu_atom/cache-misses/                                        (0.00%)
        13,412,663      cpu_core/cache-references/                                     (100.00%)
     1,201,151,457      cpu_atom/cache-references/                                     (0.00%)
 6,239,775,530,849      cpu_core/instructions/                                        (100.00%)
 3,294,260,877,504      cpu_atom/instructions/                                        (0.00%)
 2,464,834,820,774      cpu_core/cycles/                                              (100.00%)
 1,845,415,329,159      cpu_atom/cycles/                                              (0.00%)

     488.835328958 seconds time elapsed

     487.016463000 seconds user
       1.775938000 seconds sys

將 bn_to_string 移至 user level 後，其中

f (100000)

及

f (1000000)

的運算結果和 WolframAlpha 的結果一致，達成其中一個目標計算出第一百萬個的 fibonacci

執行以下命令:

$ perf report

得到以下結果，但是 bn_to_string 的問題仍需要改進

Samples: 20K of event 'cpu_core/cycles/', Event count (approx.): 24424653112
  Children      Self  Command  Shared Object      Symbol
+   99.98%     0.00%  million  [unknown]          [k] 0xffffffffffffffff       ◆
+   99.98%     0.00%  million  million            [.] main                     ▒
-   99.61%    99.59%  million  million            [.] bn_to_string             ▒
     99.59% 0xffffffffffffffff                                                 ▒
        main                                                                   ▒
        bn_to_string                                                           ▒
     0.38%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff94000099       ▒
     0.38%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93f666d9       ▒
     0.37%     0.00%  million  libc-2.31.so       [.] __GI___read (inlined)    ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93585d6a       ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93585cc7       ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff935837bd       ▒
     0.37%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfee0       ▒
     0.14%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe1f       ▒
     0.12%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe03       ▒
     0.11%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe11       ▒
     0.07%     0.07%  million  [fibdrv]           [k] 0x0000000000000a31       ▒
     0.07%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfa34       ▒
     0.05%     0.05%  million  [fibdrv]           [k] 0x0000000000000a29       ▒
     0.05%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfa2b

針對 `bn_fib_fdoubling` 進行改良

原先 bn_cpy 採取從 src 記憶體區域複製至 dest 記憶體區域。
memcpy

The memcpy() function copies n bytes from memory area src to
memory area dest. The memory areas must not overlap. Use
memmove(3) if the memory areas do overlap.

後來採用 bn_swap 達成交換 bn 位置，如下

void bn_swap(bn *a, bn *b)
{
    bn tmp = *a;
    *a = *b;
    *b = tmp;
}

改良前

void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    ...
    for (unsigned int i = 1U << 31; i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        bn_cpy(k1, f2);
        bn_lshift(k1, 1, k1);
        bn_sub(k1, f1, k1);
        bn_mult(k1, f1, k1);
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_mult(f1, f1, f1);
        bn_mult(f2, f2, f2);
        bn_cpy(k2, f1);
        bn_add(k2, f2, k2);
        if (n & i) {
            bn_cpy(f1, k2);
            bn_cpy(f2, k1);
            bn_add(f2, k2, f2);
        } else {
            bn_cpy(f1, k1);
            bn_cpy(f2, k2);
        }
    }
    ...
}

改良後

void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    ...
    for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_lshift(f2, 1, k1);// k1 = 2 * F(k+1)
        bn_sub(k1, f1, k1);  // k1 = 2 * F(k+1) – F(k)
        bn_mult(k1, f1, k2); // k2 = k1 * f1 = F(2k)
        bn_mult(f1, f1, k1); // k1 = F(k)^2
        bn_swap(f1, k2);     // f1 <-> k2, f1 = F(2k) now
        bn_mult(f2, f2, k2); // k2 = F(k+1)^2
        bn_add(k1, k2, f2);  // f2 = f1^2 + f2^2 = F(2k+1) now
        if (n & i) {
            bn_swap(f1, f2);    // f1 = F(2k+1)
            bn_add(f1, f2, f2); // f2 = F(2k+2)
        }
    }
    ...
}

改良前

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            16,024      cpu_core/cache-misses/    (0.00%)
            43,849      cpu_core/cache-references/    (0.00%)
         8,404,954      cpu_core/instructions/    (0.00%)
         4,012,031      cpu_core/cycles/    (0.00%)

       0.005958262 seconds time elapsed

       0.005661000 seconds user
       0.000000000 seconds sys

改良後

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            27,210      cpu_core/cache-misses/    (0.00%)
            42,862      cpu_core/cache-references/    (0.00%)
         8,325,720      cpu_core/instructions/    (0.00%)
         4,075,598      cpu_core/cycles/    (0.00%)

       0.005514773 seconds time elapsed

       0.005042000 seconds user
       0.000000000 seconds sys

以時間來看，改進了 0.000443489 seconds，約為 7 %，表示 swap 這個策略比 memcpy() 來的好
instructions 也從原先 8,404,954 下降至 8,325,720，約為 0.9%

使用 `Q-Matrix` 進行改良

學習 Q-Matrix 如何使用
參考公式如下

\begin{aligned} F (2 k - 1) & = F (k)^{2} + F (k - 1)^{2} \\ F (2 k) & = F (k) [2 F (k - 1) + F (k)] \end{aligned}

得到以下測試結果

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            9,620      cpu_core/cache-misses/    (0.00%)
            48,474      cpu_core/cache-references/    (0.00%)
         8,335,893      cpu_core/instructions/    (0.00%)
         4,094,925      cpu_core/cycles/    (0.00%)

       0.005139802 seconds time elapsed

       0.004427000 seconds user
       0.000000000 seconds sys

時間從 0.005514773 seconds 降至 0.005139802 seconds，約為 6%

Implement Hash Table to save Fibonacci.

參考 Linux 核心的 hash table 實作來設計 Hash table

儘量使用 Linux 核心提供的標頭檔和 API。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

目前已使用老師說的標頭檔和API了

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

huaxin

資料結構

struct hlist_node { 
    struct hlist_node *next, **pprev; 
};
struct hlist_head { 
    struct hlist_node *first; 
};
typedef struct { 
    int bits; 
    struct hlist_head *ht; 
} map_t;
struct hash_key {
    int key;
    void *data;
    struct hlist_node node;
};

宣告出 pprev （指標的指標) 並不指向上個節點，而是指向上個節點的 next 指標，當要刪除的節點是首個節點，可通過 *pprev = next 直接修改指標的指向。詳細內容請看Linux 核心的 hash table 實作

為保存我們需要的數值，定義出以下 struct

struct bn_hashtable {
    unsigned int id;
    struct _bn *bn_object;
    struct hlist_node node;
};

示意圖如下:

目前想先從 n = 100做起，所以先定義出 128 個 buckets

#define DEFAULT_HASHTABLE_LENGTH 7
// define a hash table with 2^7(=128) buckets
DEFINE_HASHTABLE(htable, DEFAULT_HASHTABLE_LENGTH);
// => struct hlist_head htable[128] = { [0 ... 127] = HLIST_HEAD_INIT };

目前考慮到 fast doubling 和 Fibonacci 數的特性，不用保存連續數值，而是關注第 N 個和第 2N 個 Fibonacci 數的關聯，想先保存第 N 和第 2N 個數值
參考 Matrix Difference Equation for Fibonacci Sequence，把 Fibonacci Q-Matrix 公式重新推導一次，已理解以下 Fast Doubling 的由來

\begin{aligned} F_{2 n + 1} & = {F_{n + 1}}^{2} + {F_{n}}^{2} \\ F_{2 n} & = F_{n} (F_{n + 1} + F_{n - 1}) \\ = F_{n} (F_{n + 1} + (F_{n + 1} - F_{n})) \\ = F_{n} (2 F_{n + 1} - F_{n}) \end{aligned}

參考 Calculating Fibonacci Numbers by Fast Doubling

學習下方方法，可以幫助我們找到目標值，如下

\begin{aligned} (\begin{array}{c} a = F_{0} \\ b = F_{1} \end{array}) & \overset{2 n + 1, 2 n + 2}{\to} (\begin{array}{c} F_{1} \\ F_{2} \end{array}) \overset{2 n, 2 n + 1}{\to} (\begin{array}{c} F_{2} \\ F_{3} \end{array}) \overset{2 n + 1, 2 n + 2}{\to} (\begin{array}{c} F_{5} \\ F_{6} \end{array}) \overset{2 n, 2 n + 1}{\to} (\begin{array}{c} F_{10} \\ F_{11} \end{array}) \end{aligned}

舉 f(100) 為例子，說明 bn_fib_doubling 如何運算出 count， n = 100，經i = 1U << (31 - __builtin_clz(n))，會先將 100 轉為 1100100(binary)，再執行 __builtin_clz(1100100)，可以得到 24，結果 i 可得 1000000。
Other Builtins

Built-in Function: int __builtin_clz (unsigned int x)
Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.

經過 (n & i) ? (count << 1) + 1 : count << 1 可得到每一次 for loop 的 count 值，依序為
0000001 = (1100100 & 1000000) ? (0000000 << 1) + 1 : 0000000 << 1
0000011 = (1100100 & 0100000) ? (0000001 << 1) + 1 : 0000001 << 1
0000110 = (1100100 & 0010000) ? (0000011 << 1) + 1 : 0000011 << 1
0001100 = (1100100 & 0001000) ? (0000110 << 1) + 1 : 0000110 << 1
0011001 = (1100100 & 0000100) ? (0001100 << 1) + 1 : 0001100 << 1
0110010 = (1100100 & 0000010) ? (0011001 << 1) + 1 : 0011001 << 1
1100100 = (1100100 & 0000001) ? (0110010 << 1) + 1 : 0110010 << 1
算出 count 之後，將用於 hashtable 中，儲存我們每一階段計算好的 fibonacci ，有f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100)

void bn_fib_doubling(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) {  // Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *f1 = dest;        /* F(k) */
    bn *f2 = bn_alloc(1); /* F(k+1) */
    f1->number[0] = 0;
    f2->number[0] = 1;
    bn *k1 = bn_alloc(1);
    bn *k2 = bn_alloc(1);
    // Follow the rule as below
    // f(0)  -----> f(1)  -----> f(2)  -----> f(5)  -----> f(10)  ......
    //        2n+1          2n          2n+1          2n
    unsigned int count = 0;
    struct bn_hashtable *hash_t =
        kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
    hash_t->id = count;
    bn *tmp = bn_alloc(1);
    bn_cpy(tmp, dest);
    hash_t->bn_object = tmp;
    hash_add(htable, &hash_t->node, hash_t->id);
    // Take for example f(100) = 1100100 , initial i is 1100100, following i as
    // 1000000, 0100000, 0000000, 0000000, 0000100, 0000000, 0000000
    for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
        count = (n & i) ? (count << 1) + 1 : count << 1;
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_lshift(f2, 1, k1);  // k1 = 2 * F(k+1)
        bn_sub(k1, f1, k1);    // k1 = 2 * F(k+1) – F(k)
        bn_mult(k1, f1, k2);   // k2 = k1 * f1 = F(2k)
        bn_mult(f1, f1, k1);   // k1 = F(k)^2
        bn_swap(f1, k2);       // f1 <-> k2, f1 = F(2k) now
        bn_mult(f2, f2, k2);   // k2 = F(k+1)^2
        bn_add(k1, k2, f2);    // f2 = f1^2 + f2^2 = F(2k+1) now
        // Below description is for "if (n & i)".
        // if i is 1000000, then do 1100100 & 1000000. Finally, I would get
        // true. if i is 0100000, then do 1100100 & 0100000. Finally, I would
        // get true. if i is 0000000, then do 1100100 & 0000000. Finally, I
        // would get false. if i is 0000000, then do 1100100 & 0000000. Finally,
        // I would get false. if i is 0000100, then do 1100100 & 0000100.
        // Finally, I would get true. if i is 1000000, then do 1100100 &
        // 1000000. Finally, I would get false.
        if (n & i) {             // odd
            bn_swap(f1, f2);     // f1 = F(2k+1)
            bn_add(f1, f2, f2);  // f2 = F(2k+2)
        }
        struct bn_hashtable *hash_t_loop =
            kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
        hash_t_loop->id = count;
        bn *tmp_loop = bn_alloc(1);
        bn_cpy(tmp_loop, dest);
        hash_t_loop->bn_object = tmp_loop;
        hash_add(htable, &hash_t_loop->node, hash_t_loop->id);
    }
    bn_free(f2);
    bn_free(k1);
    bn_free(k2);
}

當我們要嘗試驗證我們儲存的資料是否正確，我們需要先確認好儲存位置，分別為f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100)，再針對key 設為 0,1,3,6,12,25,50,100

/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    int len = 0;
    bn *fib = bn_alloc(1);
    bn_fib_doubling(fib, *offset);
    unsigned int key = 50; // 以f(100)為例，這裡會設定 key 為 0, 1, 3, 6, 12, 25,50,100 ,驗證是否儲存正確
    struct bn_hashtable *hash_t_;
    hash_for_each_possible(htable, hash_t_, node, key) {
        if(hash_t_->id == key) { 
            len = hash_t_->bn_object->size;
            copy_to_user(buf, hash_t_->bn_object->number, sizeof(unsigned int)*len);
            printk(KERN_INFO "get target %d \n", key);
            bn_free(fib);
            return len;
        }
    }
    kfree(hash_t_);
    bn_free(fib);
    
    return len;
}

可得到以下結果，確定 f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100) 在 hashtable 中儲存是正確的

reading from /dev/fibonacci at ofset 100, returned the sequence 0.
reading from /dev/fibonacci at ofset 100, returned the sequence 1.
reading from /dev/fibonacci at ofset 100, returned the sequence 2.
reading from /dev/fibonacci at ofset 100, returned the sequence 8.
reading from /dev/fibonacci at ofset 100, returned the sequence 144.
reading from /dev/fibonacci at ofset 100, returned the sequence 75025.
reading from /dev/fibonacci at ofset 100, returned the sequence 12586269025.
reading from /dev/fibonacci at ofset 100, returned the sequence 354224848179261915075.

文字訊息「不要」用圖片展現，尊重視覺障礙者閱讀的權益。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

上圖結果和 WolframAlpha 一致
也可以透過 dmeg 檢查我們的是否成功儲存我們的數值，命令如下

dmesg |grep target

[187815.683518] get target 1
[187822.400863] get target 3
[187829.297017] get target 6
[187836.337402] get target 12
[187843.613414] get target 25
[187851.664775] get target 50
[187857.644399] get target 100

我們考慮 fast doubling 和 Fibonacci 數的特性，所以不保存連續數值，而是關注第 N 個和第 2N 個 Fibonacci 數的關聯，儲存第 N 個和第 2N 個 Fibonacci 數，降低記憶體開銷。

注意用語，參見:

當 fibdrv 嘗試釋放佔用的記憶體空間時，應有對應的操作

針對我們的 hashtable 逐一釋放記憶體

/*release hashtable*/
static void hashtable_release(void)
{
    struct bn_hashtable *pos = NULL;
    struct hlist_node *n = NULL;
    int bucket;

    for (bucket = 0; bucket < (1U << DEFAULT_HASHTABLE_LENGTH); ++bucket) {
        hlist_for_each_entry_safe(pos, n, &htable[bucket], node)
        {
            bn_free(pos->bn_object);
            hlist_del(&pos->node);
            kfree(pos);
        }
    }
}

參考 The __init and __exit Macros其中提到以下內容

The __exit macro causes the omission of the function when the module is built into the kernel, and like __exit, has no effect for loadable modules. Again, if you consider when the cleanup function runs, this makes complete sense;

於是便把 hashtable_release 放在 __exit 內部

static void __exit exit_fib_dev(void)
{
    // mutex_destroy(&fib_mutex);
    device_destroy(fib_class, fib_dev);
    class_destroy(fib_class);
    cdev_del(fib_cdev);
    unregister_chrdev_region(fib_dev, 1);
+   hashtable_release();
}

引入 workqueue，將運算要求分派給個別 CPU 核，並確保降低非必要的同步處理成本

研究中…
〈Concurrency Managed Workqueue(cmwq)〉提到，cmwq 是 wq 的改進，專注以下目標

Maintain compatibility with the original workqueue API
Use per-CPU unified worker pools shared by all wq to provide flexible level of concurrency on demand without wasting a lot of resource.
Automatically regulate worker pool and level of concurrency

Linux 核心專題: fibdrv 改進

任務簡述

TODO: 儲存已計算的 Fibonacci 數

引入 workqueue，確保並行處理的效益

有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷

bn_clz

bn_to_string

fibdrv.c

million.c

針對 bn_fib_fdoubling 進行改良

使用 Q-Matrix 進行改良

Implement Hash Table to save Fibonacci.

資料結構

當 fibdrv 嘗試釋放佔用的記憶體空間時，應有對應的操作

引入 workqueue，將運算要求分派給個別 CPU 核，並確保降低非必要的同步處理成本

針對 `bn_fib_fdoubling` 進行改良

使用 `Q-Matrix` 進行改良