Try   HackMD

Linux 核心專題: fibdrv 改進

執行人: p96114175
GitHub
專題解說錄影

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
提問清單

  • ?

任務簡述

擴充 fibdrv,強化其並行處理能力,預計達成:

  • 有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷
  • 藉由 hashtable 或 cache 一類的機制,儲存已計算的 Fibonacci 數
  • 引入 workqueue,將運算要求分派給個別 CPU 核,並確保降低非必要的同步處理成本
  • 修訂 fibdrv 和應用程式之間的 API,使其適合用於同步處理

TODO: 儲存已計算的 Fibonacci 數

  1. 考慮到大數運算的特性,當以 key-value 形式保存時,不是儲存單純的整數值,而是指向特定結構的指標,於是當 fibdrv 嘗試釋放佔用的記憶體空間時,應有對應的操作
  2. 考慮到 fast doubling 和 Fibonacci 數的特性,不用保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,儘量降低記憶體開銷
  3. 應當善用 Linux 核心的 hashtable 或相關的資料結構

引入 workqueue,確保並行處理的效益

  1. 學習 ktcp,引入 kthread 和 CMWQ 到 fibdrv,確保 Fibonacci 數的運算可發揮硬體能力

  2. 確保並行處理的效益,不僅要確認結果正確,還要讓並行的 fibdrv 得以更有效的運算

  3. 思考如何運用 Linux 核心的 RCU (或其他 lock-free 機制) 有效釋放並行處理過程中的記憶體資源

有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷

閱讀大數運算 協助計算出 Fibonacci 第一百萬個

以下為欲支援運算的函式,部分參考教材

bn_clz

/* count leading zeros of src*/
static int bn_clz(const bn *src)
{
    int cnt = 0;
    for (int i = src->size - 1; i >= 0; i--) {
        if (src->number[i]) {
            // prevent undefined behavior when src = 0
            cnt += __builtin_clz(src->number[i]);
            return cnt;
        } else {
            cnt += 32;
        }
    }
    return cnt;
}

bn_to_string

該函式用於將 bn 轉換為十進制字串表示

(8 * sizeof(int) * src.size) / 3 + 2 + src.sign 算出需要的字元存放空間,再由 kmalloc() 分配這段空間

使用 memset() 對字元陣列 s 所有元素初始化為 0,最後一個元素設置為 \0 表示字串的結束

/* 
 * output bn to decimal string
 * Note: the returned string should be freed with kfree()
 */
char *bn_to_string(bn src)
{
    // log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
    size_t len = (8 * sizeof(int) * src.size) / 3 + 2 + src.sign;
    char *s = kmalloc(len, GFP_KERNEL);
    char *p = s;

    memset(s, '0', len - 1);
    s[len - 1] = '\0';

    for (int i = src.size - 1; i >= 0; i--) {
        for (unsigned int d = 1U << 31; d; d >>= 1) {
            /* binary -> decimal string */
            int carry = !!(d & src.number[i]);
            for (int j = len - 2; j >= 0; j--) {
                s[j] += s[j] - '0' + carry; // double it
                carry = (s[j] > '9');
                if (carry)
                    s[j] -= 10;
            }
        }
    }
    // skip leading zero
    while (p[0] == '0' && p[1] != '\0') { 
        p++;
    }
    if (src.sign)
        *(--p) = '-';
    memmove(s, p, strlen(p) + 1);
    return s;
}

fibdrv.c 新增以下內容

#define MAX_LENGTH 1000000
...
/* calc n-th Fibonacci number and save into dest */
void bn_fib(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) { //Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *a = bn_alloc(1);
    bn *b = bn_alloc(1);
    dest->number[0] = 1;

    for (unsigned int i = 1; i < n; i++) {
        bn_swap(b, dest);
        bn_add(a, b, dest);
        bn_swap(a, b);
    }
    bn_free(a);
    bn_free(b);
}
void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) { //Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *f1 = dest; /* F(k) */
    bn *f2 = bn_alloc(1); /* F(k+1) */
    f1->number[0] = 0;
    f2->number[0] = 1;
    bn *k1 = bn_alloc(1);
    bn *k2 = bn_alloc(1);

    for (unsigned int i = 1U << 31; i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        bn_cpy(k1, f2);
        bn_lshift(k1, 1);
        bn_sub(k1, f1, k1);
        bn_mult(k1, f1, k1);
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_mult(f1, f1, f1);
        bn_mult(f2, f2, f2);
        bn_cpy(k2, f1);
        bn_add(k2, f2, k2);
        if (n & i) {
            bn_cpy(f1, k2);
            bn_cpy(f2, k1);
            bn_add(f2, k2, f2);
        } else {
            bn_cpy(f1, k1);
            bn_cpy(f2, k2);
        }
    }
    bn_free(f2);
    bn_free(k1);
    bn_free(k2);
}
...
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn *fib = bn_alloc(1);
    bn_fib_doubling(fib, *offset);
    char *s = bn_to_string(*fib);
    copy_to_user(buf, s, strlen(s) + 1);
    bn_free(fib);
    kfree(s);
    return 0;
}

建立名為 million.c 的檔案,負責接收回傳回來的資料

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <inttypes.h>

#define FIB_DEV "/dev/fibonacci"

int main()
{
    // char buf[1000000];
    char buf[1000000];
    // char write_buf[] = "testing writing";
    int offset = 1000000;

    int fd = open(FIB_DEV, O_RDWR);

    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }
    for (int i = offset; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        read(fd, buf, sizeof(buf));
        // char *p = bn_to_string(buf, len);
        printf("reading from " FIB_DEV
               " at ofset %d, returned the sequence "
               "%s.\n",
               i, buf);
    }
    // free(buf);
    close(fd);
    return 0;
}

執行以下命令

$ gcc million.c -o million
$ sudo perf stat -e cache-misses,cache-references,instructions,cycles ./million

得到以下結果

reading from /dev/fibonacci at ofset 10000, returned the sequence 33644764876431783266621612005107543310302148460680063906564769974680081442166662368155595513633734025582065332680836159373734790483865268263040892463056431887354544369559827491606602099884183933864652731300088830269235......970975069382648706613264507665074611512677522748621598642530711298441182622661057163515069260029861704945425047491378115154139941550671256271197133252763631939606902895650288268608362241082050562430701794976171121233066073310059947366875.
*** stack smashing detected ***: terminated
./million: 中止

 Performance counter stats for './million':

           381,200      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
         3,151,919      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
   762,443,945,177      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
   278,306,077,550      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

      57.240695802 seconds time elapsed

       0.011964000 seconds user
      56.988321000 seconds sys

隨後用 perf 進行 f(0)~f(10000) 分析,初步分析出 bn_to_string 這個函式有問題,佔總執行時間 98.87% , 而 bn_fib_fdoubling 僅占 1.07% ,裏頭有

57.24s×98.87 在執行 bn_to_string

reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921353026792789567010512765782716356.....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.

 Performance counter stats for './million':

            30,774      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
           119,111      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
    22,703,034,346      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
     8,320,050,649      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

       1.690718310 seconds time elapsed

       0.000000000 seconds user
       1.685669000 seconds sys

reading from /dev/fibonacci at ofset 1000000, returned the sequence 19532821287077577316320149475962563324435429965918733969534051945716252578870156947666419876341501461288795243352202360846255109120195602337440154381151966361569199621256428943033701138278006380027674115279274666698655783793188228320612714975832303348548934895725992307229129019282092643316275217308614600179125820426996599360209593392020051848620284024473431398113674187202038684801753185386211128........03373870955680756568226835379339839824880227237703197854614809323023472557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.
Performance counter stats for './million':

           240,028      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
         3,482,714      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
 2,269,015,989,781      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
   832,563,412,079      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

     167.505017193 seconds time elapsed

       0.000000000 seconds user
     167.471991000 seconds sys

上圖 f(100000)、f(1000000) 的結果和 WolframAlpha 的結果一致
f(100000)如下

f(1000000)如下

可將 bn_to_string 搬到使用者層級來處理,讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

好的老師,我試試看

Samples: 235K of event 'cpu_core/cycles/', Event count (approx.): 278841269506
  Children      Self  Command  Shared Object      Symbol
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] __x64_sys_read         ◆
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] ksys_read              ▒
+   99.95%     0.00%  million  [kernel.kallsyms]  [k] vfs_read               ▒
-   99.95%     0.00%  million  [fibdrv]           [k] fib_read               ▒
   - fib_read                                                                ▒
        98.87% bn_to_string                                                  ▒
      + 1.07% bn_fib_fdoubling                                               ▒
    98.87%    98.84%  million  [fibdrv]           [k] bn_to_string           ▒
+    1.07%     0.00%  million  [fibdrv]           [k] bn_fib_fdoubling       ▒
     1.04%     1.04%  million  [fibdrv]           [k] bn_mult                ▒     0.04%     0.00%  million  libc-2.31.so       [.] __printf (inlined)     ▒
     0.04%     0.00%  million  libc-2.31.so       [.] __vfprintf_internal    ▒
     0.04%     0.00%  million  libc-2.31.so       [.] _IO_new_file_xsputn (in▒
     0.04%     0.00%  million  [kernel.kallsyms]  [k] asm_sysvec_apic_timer_i▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] sysvec_apic_timer_inter▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] __sysvec_apic_timer_int▒
     0.03%     0.00%  million  [kernel.kallsyms]  [k] hrtimer_interrupt      ▒
     0.03%     0.00%  million  [unknown]          [k] 0x3733353432373336     ▒

bn_to_string 搬到使用者層級來處理,讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示

fibdrv.c

以下為調整過後的版本

/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn *fib = bn_alloc(1);
    bn_fib_fdoubling(fib, *offset);
    int len = fib->size;
    copy_to_user(buf, fib->number, sizeof(unsigned int)*len);
    bn_free(fib);
    return len;
}

million.c

以下為調整過後的版本

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

// #include "BigNumber.h"
char *bn_to_string(void *str, size_t size)
{
    // log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
    unsigned int *number = (unsigned int *) str;
    size_t len = (8 * sizeof(int) * size) / 3 + 2;
    char *s = malloc(len);
    char *p = s;

    memset(s, '0', len - 1);
    s[len - 1] = '\0';

    for (int i = size - 1; i >= 0; i--) {
        for (unsigned int d = 1U << 31; d; d >>= 1) {
            /* binary -> decimal string */
            int carry = !!(d & number[i]);
            for (int j = len - 2; j >= 0; j--) {
                s[j] += s[j] - '0' + carry;  // double it
                carry = (s[j] > '9');
                if (carry)
                    s[j] -= 10;
            }
        }
    }
    // skip leading zero
    while (p[0] == '0' && p[1] != '\0') {
        p++;
    }
    // if (src.sign)
    //     *(--p) = '-';
    memmove(s, p, strlen(p) + 1);
    return s;
}

int main()
{
    char buf[100000];
    // char write_buf[] = "testing writing";
    int offset = 100000;

    int fd = open(FIB_DEV, O_RDWR);

    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }
    for (int i = offset; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        int len = read(fd, buf, sizeof(buf));
        char *p = bn_to_string(buf, len);
        printf("reading from " FIB_DEV
               " at ofset %d, returned the sequence "
               "%s.\n",
               i, p);
    }
    close(fd);
    return 0;
}

million.c 我發現有重複計算的情況,所以我直接將變數 i 調至我們要的目標值,便可減少重複計算的缺點。

執行以下命令:

$ perf stat -e cache-misses,cache-references,instructions,cycles ./million

得到以下結果,可以發現確實改善重複計算的問題

reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.

 Performance counter stats for './million':

            40,088      cpu_core/cache-misses/                                      
     <not counted>      cpu_atom/cache-misses/                                        (0.00%)
           215,685      cpu_core/cache-references/                                   
     <not counted>      cpu_atom/cache-references/                                     (0.00%)
    62,423,779,622      cpu_core/instructions/                                      
     <not counted>      cpu_atom/instructions/                                        (0.00%)
    24,461,981,416      cpu_core/cycles/                                            
     <not counted>      cpu_atom/cycles/                                              (0.00%)

       4.914407094 seconds time elapsed

       4.873887000 seconds user
       0.035984000 seconds sys

reading from /dev/fibonacci at ofset 1000000, returned the sequence 1953282128707757731632014947596256332443542996591873396953405194571625257887015694766641987634150146128879524335220236084625510912019560....72557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.

 Performance counter stats for './million':

           923,247      cpu_core/cache-misses/                                        (100.00%)
       134,057,082      cpu_atom/cache-misses/                                        (0.00%)
        13,412,663      cpu_core/cache-references/                                     (100.00%)
     1,201,151,457      cpu_atom/cache-references/                                     (0.00%)
 6,239,775,530,849      cpu_core/instructions/                                        (100.00%)
 3,294,260,877,504      cpu_atom/instructions/                                        (0.00%)
 2,464,834,820,774      cpu_core/cycles/                                              (100.00%)
 1,845,415,329,159      cpu_atom/cycles/                                              (0.00%)

     488.835328958 seconds time elapsed

     487.016463000 seconds user
       1.775938000 seconds sys

bn_to_string 移至 user level 後,其中

f(100000)
f(1000000)
的運算結果和 WolframAlpha 的結果一致,達成其中一個目標計算出第一百萬個的 fibonacci

執行以下命令:

$ perf report 

得到以下結果,但是 bn_to_string 的問題仍需要改進

Samples: 20K of event 'cpu_core/cycles/', Event count (approx.): 24424653112
  Children      Self  Command  Shared Object      Symbol
+   99.98%     0.00%  million  [unknown]          [k] 0xffffffffffffffff       ◆
+   99.98%     0.00%  million  million            [.] main                     ▒
-   99.61%    99.59%  million  million            [.] bn_to_string             ▒
     99.59% 0xffffffffffffffff                                                 ▒
        main                                                                   ▒
        bn_to_string                                                           ▒
     0.38%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff94000099       ▒
     0.38%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93f666d9       ▒
     0.37%     0.00%  million  libc-2.31.so       [.] __GI___read (inlined)    ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93585d6a       ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff93585cc7       ▒
     0.37%     0.00%  million  [kernel.kallsyms]  [k] 0xffffffff935837bd       ▒
     0.37%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfee0       ▒
     0.14%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe1f       ▒
     0.12%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe03       ▒
     0.11%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfe11       ▒
     0.07%     0.07%  million  [fibdrv]           [k] 0x0000000000000a31       ▒
     0.07%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfa34       ▒
     0.05%     0.05%  million  [fibdrv]           [k] 0x0000000000000a29       ▒
     0.05%     0.00%  million  [fibdrv]           [k] 0xffffffffc08cfa2b  

針對 bn_fib_fdoubling 進行改良

原先 bn_cpy 採取從 src 記憶體區域複製至 dest 記憶體區域。
memcpy

The memcpy() function copies n bytes from memory area src to
memory area dest. The memory areas must not overlap. Use
memmove(3) if the memory areas do overlap.

後來採用 bn_swap 達成交換 bn 位置,如下

void bn_swap(bn *a, bn *b)
{
    bn tmp = *a;
    *a = *b;
    *b = tmp;
}

改良前

void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    ...
    for (unsigned int i = 1U << 31; i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        bn_cpy(k1, f2);
        bn_lshift(k1, 1, k1);
        bn_sub(k1, f1, k1);
        bn_mult(k1, f1, k1);
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_mult(f1, f1, f1);
        bn_mult(f2, f2, f2);
        bn_cpy(k2, f1);
        bn_add(k2, f2, k2);
        if (n & i) {
            bn_cpy(f1, k2);
            bn_cpy(f2, k1);
            bn_add(f2, k2, f2);
        } else {
            bn_cpy(f1, k1);
            bn_cpy(f2, k2);
        }
    }
    ...
}

改良後

void bn_fib_fdoubling(bn *dest, unsigned int n)
{
    ...
    for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_lshift(f2, 1, k1);// k1 = 2 * F(k+1)
        bn_sub(k1, f1, k1);  // k1 = 2 * F(k+1) – F(k)
        bn_mult(k1, f1, k2); // k2 = k1 * f1 = F(2k)
        bn_mult(f1, f1, k1); // k1 = F(k)^2
        bn_swap(f1, k2);     // f1 <-> k2, f1 = F(2k) now
        bn_mult(f2, f2, k2); // k2 = F(k+1)^2
        bn_add(k1, k2, f2);  // f2 = f1^2 + f2^2 = F(2k+1) now
        if (n & i) {
            bn_swap(f1, f2);    // f1 = F(2k+1)
            bn_add(f1, f2, f2); // f2 = F(2k+2)
        }
    }
    ...
}

改良前

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            16,024      cpu_core/cache-misses/    (0.00%)
            43,849      cpu_core/cache-references/    (0.00%)
         8,404,954      cpu_core/instructions/    (0.00%)
         4,012,031      cpu_core/cycles/    (0.00%)

       0.005958262 seconds time elapsed

       0.005661000 seconds user
       0.000000000 seconds sys

改良後

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            27,210      cpu_core/cache-misses/    (0.00%)
            42,862      cpu_core/cache-references/    (0.00%)
         8,325,720      cpu_core/instructions/    (0.00%)
         4,075,598      cpu_core/cycles/    (0.00%)

       0.005514773 seconds time elapsed

       0.005042000 seconds user
       0.000000000 seconds sys

  • 以時間來看,改進了 0.000443489 seconds,約為 7 %,表示 swap 這個策略比 memcpy() 來的好
  • instructions 也從原先 8,404,954 下降至 8,325,720,約為 0.9%

使用 Q-Matrix 進行改良

學習 Q-Matrix 如何使用
參考公式如下

F(2k1)=F(k)2+F(k1)2F(2k)=F(k)[2F(k1)+F(k)]

得到以下測試結果

reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.

 Performance counter stats for './million':

            9,620      cpu_core/cache-misses/    (0.00%)
            48,474      cpu_core/cache-references/    (0.00%)
         8,335,893      cpu_core/instructions/    (0.00%)
         4,094,925      cpu_core/cycles/    (0.00%)

       0.005139802 seconds time elapsed

       0.004427000 seconds user
       0.000000000 seconds sys
  • 時間從 0.005514773 seconds 降至 0.005139802 seconds, 約為 6%

Implement Hash Table to save Fibonacci.

參考 Linux 核心的 hash table 實作 來設計 Hash table

儘量使用 Linux 核心提供的標頭檔和 API。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

目前已使用老師說的標頭檔和API了

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
huaxin

資料結構

struct hlist_node { 
    struct hlist_node *next, **pprev; 
};
struct hlist_head { 
    struct hlist_node *first; 
};
typedef struct { 
    int bits; 
    struct hlist_head *ht; 
} map_t;
struct hash_key {
    int key;
    void *data;
    struct hlist_node node;
};

宣告出 pprev (指標的指標) 並不指向上個節點,而是指向上個節點的 next 指標,當要刪除的節點是首個節點,可通過 *pprev = next 直接修改指標的指向。詳細內容請看Linux 核心的 hash table 實作

為保存我們需要的數值,定義出以下 struct

struct bn_hashtable {
    unsigned int id;
    struct _bn *bn_object;
    struct hlist_node node;
};

示意圖如下:







G


cluster_A

id


cluster_bn

bn


cluster_1

hash_key 1


cluster_3

hash_key 3



map

hlist_head.first

 

 

 

 

 

 

 

 



hn1

hlist_node

pprev

next



map:ht1->hn1





hn3

hlist_node

pprev

next



map:ht5->hn3





null1
NULL



null2
NULL



bn1

number

size



hn1:s->map:ht1





hn1:next->null1





hn3:s->map:ht5





hn3:next->null2





目前想先從 n = 100做起,所以先定義出 128 個 buckets

#define DEFAULT_HASHTABLE_LENGTH 7
// define a hash table with 2^7(=128) buckets
DEFINE_HASHTABLE(htable, DEFAULT_HASHTABLE_LENGTH);
// => struct hlist_head htable[128] = { [0 ... 127] = HLIST_HEAD_INIT };

目前考慮到 fast doubling 和 Fibonacci 數的特性,不用保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,想先保存 第 N 和第 2N 個數值
參考 Matrix Difference Equation for Fibonacci Sequence,把 Fibonacci Q-Matrix 公式重新推導一次,已理解以下 Fast Doubling 的由來

F2n+1=Fn+12+Fn2F2n=Fn(Fn+1+Fn1)=Fn(Fn+1+(Fn+1Fn))=Fn(2Fn+1Fn)
參考 Calculating Fibonacci Numbers by Fast Doubling

學習下方方法,可以幫助我們找到目標值,如下

(a=F0b=F1)2n+1,2n+2(F1F2)2n,2n+1(F2F3)2n+1,2n+2(F5F6)2n,2n+1(F10F11)

舉 f(100) 為例子,說明 bn_fib_doubling 如何運算出 count, n = 100,經i = 1U << (31 - __builtin_clz(n)),會先將 100 轉為 1100100(binary),再執行 __builtin_clz(1100100),可以得到 24,結果 i 可得 1000000。
Other Builtins

Built-in Function: int __builtin_clz (unsigned int x)
Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.

經過 (n & i) ? (count << 1) + 1 : count << 1 可得到每一次 for loop 的 count 值,依序為
0000001 = (1100100 & 1000000) ? (0000000 << 1) + 1 : 0000000 << 1
0000011 = (1100100 & 0100000) ? (0000001 << 1) + 1 : 0000001 << 1
0000110 = (1100100 & 0010000) ? (0000011 << 1) + 1 : 0000011 << 1
0001100 = (1100100 & 0001000) ? (0000110 << 1) + 1 : 0000110 << 1
0011001 = (1100100 & 0000100) ? (0001100 << 1) + 1 : 0001100 << 1
0110010 = (1100100 & 0000010) ? (0011001 << 1) + 1 : 0011001 << 1
1100100 = (1100100 & 0000001) ? (0110010 << 1) + 1 : 0110010 << 1
算出 count 之後,將用於 hashtable 中,儲存我們每一階段計算好的 fibonacci ,有f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100)

void bn_fib_doubling(bn *dest, unsigned int n)
{
    bn_resize(dest, 1);
    if (n < 2) {  // Fib(0) = 0, Fib(1) = 1
        dest->number[0] = n;
        return;
    }

    bn *f1 = dest;        /* F(k) */
    bn *f2 = bn_alloc(1); /* F(k+1) */
    f1->number[0] = 0;
    f2->number[0] = 1;
    bn *k1 = bn_alloc(1);
    bn *k2 = bn_alloc(1);
    // Follow the rule as below
    // f(0)  -----> f(1)  -----> f(2)  -----> f(5)  -----> f(10)  ......
    //        2n+1          2n          2n+1          2n
    unsigned int count = 0;
    struct bn_hashtable *hash_t =
        kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
    hash_t->id = count;
    bn *tmp = bn_alloc(1);
    bn_cpy(tmp, dest);
    hash_t->bn_object = tmp;
    hash_add(htable, &hash_t->node, hash_t->id);
    // Take for example f(100) = 1100100 , initial i is 1100100, following i as
    // 1000000, 0100000, 0000000, 0000000, 0000100, 0000000, 0000000
    for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
        count = (n & i) ? (count << 1) + 1 : count << 1;
        /* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
        /* F(2k+1) = F(k)^2 + F(k+1)^2 */
        bn_lshift(f2, 1, k1);  // k1 = 2 * F(k+1)
        bn_sub(k1, f1, k1);    // k1 = 2 * F(k+1) – F(k)
        bn_mult(k1, f1, k2);   // k2 = k1 * f1 = F(2k)
        bn_mult(f1, f1, k1);   // k1 = F(k)^2
        bn_swap(f1, k2);       // f1 <-> k2, f1 = F(2k) now
        bn_mult(f2, f2, k2);   // k2 = F(k+1)^2
        bn_add(k1, k2, f2);    // f2 = f1^2 + f2^2 = F(2k+1) now
        // Below description is for "if (n & i)".
        // if i is 1000000, then do 1100100 & 1000000. Finally, I would get
        // true. if i is 0100000, then do 1100100 & 0100000. Finally, I would
        // get true. if i is 0000000, then do 1100100 & 0000000. Finally, I
        // would get false. if i is 0000000, then do 1100100 & 0000000. Finally,
        // I would get false. if i is 0000100, then do 1100100 & 0000100.
        // Finally, I would get true. if i is 1000000, then do 1100100 &
        // 1000000. Finally, I would get false.
        if (n & i) {             // odd
            bn_swap(f1, f2);     // f1 = F(2k+1)
            bn_add(f1, f2, f2);  // f2 = F(2k+2)
        }
        struct bn_hashtable *hash_t_loop =
            kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
        hash_t_loop->id = count;
        bn *tmp_loop = bn_alloc(1);
        bn_cpy(tmp_loop, dest);
        hash_t_loop->bn_object = tmp_loop;
        hash_add(htable, &hash_t_loop->node, hash_t_loop->id);
    }
    bn_free(f2);
    bn_free(k1);
    bn_free(k2);
}

當我們要嘗試驗證我們儲存的資料是否正確,我們需要先確認好儲存位置,分別為f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100),再針對key 設為 0,1,3,6,12,25,50,100

/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    int len = 0;
    bn *fib = bn_alloc(1);
    bn_fib_doubling(fib, *offset);
    unsigned int key = 50; // 以f(100)為例,這裡會設定 key 為 0, 1, 3, 6, 12, 25,50,100 ,驗證是否儲存正確
    struct bn_hashtable *hash_t_;
    hash_for_each_possible(htable, hash_t_, node, key) {
        if(hash_t_->id == key) { 
            len = hash_t_->bn_object->size;
            copy_to_user(buf, hash_t_->bn_object->number, sizeof(unsigned int)*len);
            printk(KERN_INFO "get target %d \n", key);
            bn_free(fib);
            return len;
        }
    }
    kfree(hash_t_);
    bn_free(fib);
    
    return len;
}

可得到以下結果,確定 f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100) 在 hashtable 中儲存是正確的

reading from /dev/fibonacci at ofset 100, returned the sequence 0.
reading from /dev/fibonacci at ofset 100, returned the sequence 1.
reading from /dev/fibonacci at ofset 100, returned the sequence 2.
reading from /dev/fibonacci at ofset 100, returned the sequence 8.
reading from /dev/fibonacci at ofset 100, returned the sequence 144.
reading from /dev/fibonacci at ofset 100, returned the sequence 75025.
reading from /dev/fibonacci at ofset 100, returned the sequence 12586269025.
reading from /dev/fibonacci at ofset 100, returned the sequence 354224848179261915075.

文字訊息「不要」用圖片展現,尊重視覺障礙者閱讀的權益。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

上圖結果和 WolframAlpha 一致
也可以透過 dmeg 檢查我們的是否成功儲存我們的數值,命令如下

dmesg |grep target
[187815.683518] get target 1
[187822.400863] get target 3
[187829.297017] get target 6
[187836.337402] get target 12
[187843.613414] get target 25
[187851.664775] get target 50
[187857.644399] get target 100

我們考慮 fast doubling 和 Fibonacci 數的特性,所以不保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,儲存第 N 個和第 2N 個 Fibonacci 數,降低記憶體開銷。

注意用語,參見:

當 fibdrv 嘗試釋放佔用的記憶體空間時,應有對應的操作

針對我們的 hashtable 逐一釋放記憶體

/*release hashtable*/
static void hashtable_release(void)
{
    struct bn_hashtable *pos = NULL;
    struct hlist_node *n = NULL;
    int bucket;

    for (bucket = 0; bucket < (1U << DEFAULT_HASHTABLE_LENGTH); ++bucket) {
        hlist_for_each_entry_safe(pos, n, &htable[bucket], node)
        {
            bn_free(pos->bn_object);
            hlist_del(&pos->node);
            kfree(pos);
        }
    }
}

參考 The __init and __exit Macros其中提到以下內容

The __exit macro causes the omission of the function when the module is built into the kernel, and like __exit, has no effect for loadable modules. Again, if you consider when the cleanup function runs, this makes complete sense;

於是便把 hashtable_release 放在 __exit 內部

static void __exit exit_fib_dev(void)
{
    // mutex_destroy(&fib_mutex);
    device_destroy(fib_class, fib_dev);
    class_destroy(fib_class);
    cdev_del(fib_cdev);
    unregister_chrdev_region(fib_dev, 1);
+   hashtable_release();
}

引入 workqueue,將運算要求分派給個別 CPU 核,並確保降低非必要的同步處理成本

研究中
Concurrency Managed Workqueue(cmwq)〉提到,cmwq 是 wq 的改進,專注以下目標

  • Maintain compatibility with the original workqueue API
  • Use per-CPU unified worker pools shared by all wq to provide flexible level of concurrency on demand without wasting a lot of resource.
  • Automatically regulate worker pool and level of concurrency