owned this note
owned this note
Published
Linked with GitHub
# Linux 核心專題: fibdrv 改進
> 執行人: p96114175
> [GitHub](https://github.com/p96114175/fibdrv)
> [專題解說錄影](https://youtu.be/YVkQSCX_Bzw)
:::success
:question: 提問清單
* ?
:::
## 任務簡述
擴充 [fibdrv](https://hackmd.io/@sysprog/linux2023-fibdrv),強化其並行處理能力,預計達成:
* 有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷
* 藉由 hashtable 或 cache 一類的機制,儲存已計算的 Fibonacci 數
* 引入 workqueue,將運算要求分派給個別 CPU 核,並確保降低非必要的同步處理成本
* 修訂 fibdrv 和應用程式之間的 API,使其適合用於同步處理
## TODO: 儲存已計算的 Fibonacci 數
1. 考慮到大數運算的特性,當以 key-value 形式保存時,不是儲存單純的整數值,而是指向特定結構的指標,於是當 fibdrv 嘗試釋放佔用的記憶體空間時,應有對應的操作
2. 考慮到 fast doubling 和 Fibonacci 數的特性,不用保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,儘量降低記憶體開銷
3. 應當善用 Linux 核心的 hashtable 或相關的資料結構
## 引入 workqueue,確保並行處理的效益
1. 學習 [ktcp](https://hackmd.io/@sysprog/linux2023-ktcp),引入 kthread 和 CMWQ 到 fibdrv,確保 Fibonacci 數的運算可發揮硬體能力
2. 確保並行處理的效益,不僅要確認結果正確,還要讓並行的 fibdrv 得以更有效的運算
3. 思考如何運用 Linux 核心的 RCU (或其他 lock-free 機制) 有效釋放並行處理過程中的記憶體資源
### 有效運算 Fibonacci 數 (至少能算到第一百萬個) 並降低記憶體開銷
閱讀[大數運算](https://hackmd.io/@sysprog/linux2023-fibdrv/%2F%40sysprog%2Flinux2023-fibdrv-d#%E5%88%9D%E6%AD%A5%E6%94%AF%E6%8F%B4%E5%A4%A7%E6%95%B8%E9%81%8B%E7%AE%97) 協助計算出 Fibonacci 第一百萬個
以下為欲支援運算的函式,部分參考[教材](https://hackmd.io/@sysprog/linux2023-fibdrv/%2F%40sysprog%2Flinux2023-fibdrv-d#%E5%88%9D%E6%AD%A5%E6%94%AF%E6%8F%B4%E5%A4%A7%E6%95%B8%E9%81%8B%E7%AE%97)
#### bn_clz
```c
/* count leading zeros of src*/
static int bn_clz(const bn *src)
{
int cnt = 0;
for (int i = src->size - 1; i >= 0; i--) {
if (src->number[i]) {
// prevent undefined behavior when src = 0
cnt += __builtin_clz(src->number[i]);
return cnt;
} else {
cnt += 32;
}
}
return cnt;
}
```
#### bn_to_string
該函式用於將 bn 轉換為十進制字串表示
用 `(8 * sizeof(int) * src.size) / 3 + 2 + src.sign` 算出需要的字元存放空間,再由 `kmalloc()` 分配這段空間
使用 `memset()` 對字元陣列 s 所有元素初始化為 `0`,最後一個元素設置為 `\0` 表示字串的結束
```c
/*
* output bn to decimal string
* Note: the returned string should be freed with kfree()
*/
char *bn_to_string(bn src)
{
// log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
size_t len = (8 * sizeof(int) * src.size) / 3 + 2 + src.sign;
char *s = kmalloc(len, GFP_KERNEL);
char *p = s;
memset(s, '0', len - 1);
s[len - 1] = '\0';
for (int i = src.size - 1; i >= 0; i--) {
for (unsigned int d = 1U << 31; d; d >>= 1) {
/* binary -> decimal string */
int carry = !!(d & src.number[i]);
for (int j = len - 2; j >= 0; j--) {
s[j] += s[j] - '0' + carry; // double it
carry = (s[j] > '9');
if (carry)
s[j] -= 10;
}
}
}
// skip leading zero
while (p[0] == '0' && p[1] != '\0') {
p++;
}
if (src.sign)
*(--p) = '-';
memmove(s, p, strlen(p) + 1);
return s;
}
```
在 `fibdrv.c` 新增以下內容
```c
#define MAX_LENGTH 1000000
...
/* calc n-th Fibonacci number and save into dest */
void bn_fib(bn *dest, unsigned int n)
{
bn_resize(dest, 1);
if (n < 2) { //Fib(0) = 0, Fib(1) = 1
dest->number[0] = n;
return;
}
bn *a = bn_alloc(1);
bn *b = bn_alloc(1);
dest->number[0] = 1;
for (unsigned int i = 1; i < n; i++) {
bn_swap(b, dest);
bn_add(a, b, dest);
bn_swap(a, b);
}
bn_free(a);
bn_free(b);
}
void bn_fib_fdoubling(bn *dest, unsigned int n)
{
bn_resize(dest, 1);
if (n < 2) { //Fib(0) = 0, Fib(1) = 1
dest->number[0] = n;
return;
}
bn *f1 = dest; /* F(k) */
bn *f2 = bn_alloc(1); /* F(k+1) */
f1->number[0] = 0;
f2->number[0] = 1;
bn *k1 = bn_alloc(1);
bn *k2 = bn_alloc(1);
for (unsigned int i = 1U << 31; i; i >>= 1) {
/* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
bn_cpy(k1, f2);
bn_lshift(k1, 1);
bn_sub(k1, f1, k1);
bn_mult(k1, f1, k1);
/* F(2k+1) = F(k)^2 + F(k+1)^2 */
bn_mult(f1, f1, f1);
bn_mult(f2, f2, f2);
bn_cpy(k2, f1);
bn_add(k2, f2, k2);
if (n & i) {
bn_cpy(f1, k2);
bn_cpy(f2, k1);
bn_add(f2, k2, f2);
} else {
bn_cpy(f1, k1);
bn_cpy(f2, k2);
}
}
bn_free(f2);
bn_free(k1);
bn_free(k2);
}
...
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn *fib = bn_alloc(1);
bn_fib_doubling(fib, *offset);
char *s = bn_to_string(*fib);
copy_to_user(buf, s, strlen(s) + 1);
bn_free(fib);
kfree(s);
return 0;
}
```
建立名為 `million.c` 的檔案,負責接收回傳回來的資料
```c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <inttypes.h>
#define FIB_DEV "/dev/fibonacci"
int main()
{
// char buf[1000000];
char buf[1000000];
// char write_buf[] = "testing writing";
int offset = 1000000;
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = offset; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
read(fd, buf, sizeof(buf));
// char *p = bn_to_string(buf, len);
printf("reading from " FIB_DEV
" at ofset %d, returned the sequence "
"%s.\n",
i, buf);
}
// free(buf);
close(fd);
return 0;
}
```
執行以下命令
```shell
$ gcc million.c -o million
$ sudo perf stat -e cache-misses,cache-references,instructions,cycles ./million
```
得到以下結果
```
reading from /dev/fibonacci at ofset 10000, returned the sequence 33644764876431783266621612005107543310302148460680063906564769974680081442166662368155595513633734025582065332680836159373734790483865268263040892463056431887354544369559827491606602099884183933864652731300088830269235......970975069382648706613264507665074611512677522748621598642530711298441182622661057163515069260029861704945425047491378115154139941550671256271197133252763631939606902895650288268608362241082050562430701794976171121233066073310059947366875.
*** stack smashing detected ***: terminated
./million: 中止
Performance counter stats for './million':
381,200 cpu_core/cache-misses/
<not counted> cpu_atom/cache-misses/ (0.00%)
3,151,919 cpu_core/cache-references/
<not counted> cpu_atom/cache-references/ (0.00%)
762,443,945,177 cpu_core/instructions/
<not counted> cpu_atom/instructions/ (0.00%)
278,306,077,550 cpu_core/cycles/
<not counted> cpu_atom/cycles/ (0.00%)
57.240695802 seconds time elapsed
0.011964000 seconds user
56.988321000 seconds sys
```
隨後用 perf 進行 `f(0)~f(10000)` 分析,初步分析出 `bn_to_string` 這個函式有問題,佔總執行時間 `98.87%` , 而 `bn_fib_fdoubling ` 僅占 `1.07%` ,裏頭有 $57.24s \times 98.87% = 56.593188s$ 在執行 `bn_to_string`
```
reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921353026792789567010512765782716356.....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.
Performance counter stats for './million':
30,774 cpu_core/cache-misses/
<not counted> cpu_atom/cache-misses/ (0.00%)
119,111 cpu_core/cache-references/
<not counted> cpu_atom/cache-references/ (0.00%)
22,703,034,346 cpu_core/instructions/
<not counted> cpu_atom/instructions/ (0.00%)
8,320,050,649 cpu_core/cycles/
<not counted> cpu_atom/cycles/ (0.00%)
1.690718310 seconds time elapsed
0.000000000 seconds user
1.685669000 seconds sys
```
```
reading from /dev/fibonacci at ofset 1000000, returned the sequence 19532821287077577316320149475962563324435429965918733969534051945716252578870156947666419876341501461288795243352202360846255109120195602337440154381151966361569199621256428943033701138278006380027674115279274666698655783793188228320612714975832303348548934895725992307229129019282092643316275217308614600179125820426996599360209593392020051848620284024473431398113674187202038684801753185386211128........03373870955680756568226835379339839824880227237703197854614809323023472557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.
Performance counter stats for './million':
240,028 cpu_core/cache-misses/
<not counted> cpu_atom/cache-misses/ (0.00%)
3,482,714 cpu_core/cache-references/
<not counted> cpu_atom/cache-references/ (0.00%)
2,269,015,989,781 cpu_core/instructions/
<not counted> cpu_atom/instructions/ (0.00%)
832,563,412,079 cpu_core/cycles/
<not counted> cpu_atom/cycles/ (0.00%)
167.505017193 seconds time elapsed
0.000000000 seconds user
167.471991000 seconds sys
```
上圖 f(100000)、f(1000000) 的結果和 [WolframAlpha](https://www.wolframalpha.com/input?i=fib%281000000%29) 的結果一致
f(100000)如下
![](https://hackmd.io/_uploads/HkMd2-Jwn.png)
f(1000000)如下
![](https://hackmd.io/_uploads/SJjOo-Jwh.png)
:::warning
可將 `bn_to_string` 搬到使用者層級來處理,讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示。
:notes: jserv
:::
:::info
好的老師,我試試看
:::
```
Samples: 235K of event 'cpu_core/cycles/', Event count (approx.): 278841269506
Children Self Command Shared Object Symbol
+ 99.95% 0.00% million [kernel.kallsyms] [k] __x64_sys_read ◆
+ 99.95% 0.00% million [kernel.kallsyms] [k] ksys_read ▒
+ 99.95% 0.00% million [kernel.kallsyms] [k] vfs_read ▒
- 99.95% 0.00% million [fibdrv] [k] fib_read ▒
- fib_read ▒
98.87% bn_to_string ▒
+ 1.07% bn_fib_fdoubling ▒
98.87% 98.84% million [fibdrv] [k] bn_to_string ▒
+ 1.07% 0.00% million [fibdrv] [k] bn_fib_fdoubling ▒
1.04% 1.04% million [fibdrv] [k] bn_mult ▒ 0.04% 0.00% million libc-2.31.so [.] __printf (inlined) ▒
0.04% 0.00% million libc-2.31.so [.] __vfprintf_internal ▒
0.04% 0.00% million libc-2.31.so [.] _IO_new_file_xsputn (in▒
0.04% 0.00% million [kernel.kallsyms] [k] asm_sysvec_apic_timer_i▒
0.03% 0.00% million [kernel.kallsyms] [k] sysvec_apic_timer_inter▒
0.03% 0.00% million [kernel.kallsyms] [k] __sysvec_apic_timer_int▒
0.03% 0.00% million [kernel.kallsyms] [k] hrtimer_interrupt ▒
0.03% 0.00% million [unknown] [k] 0x3733353432373336 ▒
```
將 `bn_to_string` 搬到使用者層級來處理,讓核心模組聚焦在 Fibonacci 求值和相關大數運算/數值表示
#### fibdrv.c
以下為調整過後的版本
```c
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn *fib = bn_alloc(1);
bn_fib_fdoubling(fib, *offset);
int len = fib->size;
copy_to_user(buf, fib->number, sizeof(unsigned int)*len);
bn_free(fib);
return len;
}
```
#### million.c
以下為調整過後的版本
```c
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
// #include "BigNumber.h"
char *bn_to_string(void *str, size_t size)
{
// log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
unsigned int *number = (unsigned int *) str;
size_t len = (8 * sizeof(int) * size) / 3 + 2;
char *s = malloc(len);
char *p = s;
memset(s, '0', len - 1);
s[len - 1] = '\0';
for (int i = size - 1; i >= 0; i--) {
for (unsigned int d = 1U << 31; d; d >>= 1) {
/* binary -> decimal string */
int carry = !!(d & number[i]);
for (int j = len - 2; j >= 0; j--) {
s[j] += s[j] - '0' + carry; // double it
carry = (s[j] > '9');
if (carry)
s[j] -= 10;
}
}
}
// skip leading zero
while (p[0] == '0' && p[1] != '\0') {
p++;
}
// if (src.sign)
// *(--p) = '-';
memmove(s, p, strlen(p) + 1);
return s;
}
int main()
{
char buf[100000];
// char write_buf[] = "testing writing";
int offset = 100000;
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = offset; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
int len = read(fd, buf, sizeof(buf));
char *p = bn_to_string(buf, len);
printf("reading from " FIB_DEV
" at ofset %d, returned the sequence "
"%s.\n",
i, p);
}
close(fd);
return 0;
}
```
在`million.c` 我發現有重複計算的情況,所以我直接將變數 `i` 調至我們要的目標值,便可減少重複計算的缺點。
執行以下命令:
```shell
$ perf stat -e cache-misses,cache-references,instructions,cycles ./million
```
得到以下結果,可以發現確實改善重複計算的問題
```
reading from /dev/fibonacci at ofset 100000, returned the sequence 25974069347221724166155034021275915414880485386517696584724770703952534543511273686265556772836716744754637587223074432111638399473875091030965697382188304493052287638531334921....21635660895603514383883939018953166274355609970015699780289236362349895374653428746875.
Performance counter stats for './million':
40,088 cpu_core/cache-misses/
<not counted> cpu_atom/cache-misses/ (0.00%)
215,685 cpu_core/cache-references/
<not counted> cpu_atom/cache-references/ (0.00%)
62,423,779,622 cpu_core/instructions/
<not counted> cpu_atom/instructions/ (0.00%)
24,461,981,416 cpu_core/cycles/
<not counted> cpu_atom/cycles/ (0.00%)
4.914407094 seconds time elapsed
4.873887000 seconds user
0.035984000 seconds sys
```
```
reading from /dev/fibonacci at ofset 1000000, returned the sequence 1953282128707757731632014947596256332443542996591873396953405194571625257887015694766641987634150146128879524335220236084625510912019560....72557966211738929885417307414847072116640441570575360458225614322429985978068323969654385552378378141386675079286837205802043347225419033684684301719893411568996526838242546875.
Performance counter stats for './million':
923,247 cpu_core/cache-misses/ (100.00%)
134,057,082 cpu_atom/cache-misses/ (0.00%)
13,412,663 cpu_core/cache-references/ (100.00%)
1,201,151,457 cpu_atom/cache-references/ (0.00%)
6,239,775,530,849 cpu_core/instructions/ (100.00%)
3,294,260,877,504 cpu_atom/instructions/ (0.00%)
2,464,834,820,774 cpu_core/cycles/ (100.00%)
1,845,415,329,159 cpu_atom/cycles/ (0.00%)
488.835328958 seconds time elapsed
487.016463000 seconds user
1.775938000 seconds sys
```
將 `bn_to_string` 移至 user level 後,其中 $f(100000)$ 及 $f(1000000)$ 的運算結果和 WolframAlpha 的結果一致,達成其中一個目標計算出第一百萬個的 fibonacci
執行以下命令:
```shell
$ perf report
```
得到以下結果,但是 `bn_to_string` 的問題仍需要改進
```diff
Samples: 20K of event 'cpu_core/cycles/', Event count (approx.): 24424653112
Children Self Command Shared Object Symbol
+ 99.98% 0.00% million [unknown] [k] 0xffffffffffffffff ◆
+ 99.98% 0.00% million million [.] main ▒
- 99.61% 99.59% million million [.] bn_to_string ▒
99.59% 0xffffffffffffffff ▒
main ▒
bn_to_string ▒
0.38% 0.00% million [kernel.kallsyms] [k] 0xffffffff94000099 ▒
0.38% 0.00% million [kernel.kallsyms] [k] 0xffffffff93f666d9 ▒
0.37% 0.00% million libc-2.31.so [.] __GI___read (inlined) ▒
0.37% 0.00% million [kernel.kallsyms] [k] 0xffffffff93585d6a ▒
0.37% 0.00% million [kernel.kallsyms] [k] 0xffffffff93585cc7 ▒
0.37% 0.00% million [kernel.kallsyms] [k] 0xffffffff935837bd ▒
0.37% 0.00% million [fibdrv] [k] 0xffffffffc08cfee0 ▒
0.14% 0.00% million [fibdrv] [k] 0xffffffffc08cfe1f ▒
0.12% 0.00% million [fibdrv] [k] 0xffffffffc08cfe03 ▒
0.11% 0.00% million [fibdrv] [k] 0xffffffffc08cfe11 ▒
0.07% 0.07% million [fibdrv] [k] 0x0000000000000a31 ▒
0.07% 0.00% million [fibdrv] [k] 0xffffffffc08cfa34 ▒
0.05% 0.05% million [fibdrv] [k] 0x0000000000000a29 ▒
0.05% 0.00% million [fibdrv] [k] 0xffffffffc08cfa2b
```
### 針對 `bn_fib_fdoubling` 進行改良
原先 `bn_cpy` 採取從 `src` 記憶體區域複製至 `dest` 記憶體區域。
[memcpy](https://man7.org/linux/man-pages/man3/memcpy.3.html)
> The memcpy() function copies n bytes from memory area src to
memory area dest. The memory areas must not overlap. Use
memmove(3) if the memory areas do overlap.
後來採用 `bn_swap` 達成交換 `bn` 位置,如下
```c
void bn_swap(bn *a, bn *b)
{
bn tmp = *a;
*a = *b;
*b = tmp;
}
```
改良前
```c
void bn_fib_fdoubling(bn *dest, unsigned int n)
{
...
for (unsigned int i = 1U << 31; i; i >>= 1) {
/* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
bn_cpy(k1, f2);
bn_lshift(k1, 1, k1);
bn_sub(k1, f1, k1);
bn_mult(k1, f1, k1);
/* F(2k+1) = F(k)^2 + F(k+1)^2 */
bn_mult(f1, f1, f1);
bn_mult(f2, f2, f2);
bn_cpy(k2, f1);
bn_add(k2, f2, k2);
if (n & i) {
bn_cpy(f1, k2);
bn_cpy(f2, k1);
bn_add(f2, k2, f2);
} else {
bn_cpy(f1, k1);
bn_cpy(f2, k2);
}
}
...
}
```
改良後
```c
void bn_fib_fdoubling(bn *dest, unsigned int n)
{
...
for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
/* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
/* F(2k+1) = F(k)^2 + F(k+1)^2 */
bn_lshift(f2, 1, k1);// k1 = 2 * F(k+1)
bn_sub(k1, f1, k1); // k1 = 2 * F(k+1) – F(k)
bn_mult(k1, f1, k2); // k2 = k1 * f1 = F(2k)
bn_mult(f1, f1, k1); // k1 = F(k)^2
bn_swap(f1, k2); // f1 <-> k2, f1 = F(2k) now
bn_mult(f2, f2, k2); // k2 = F(k+1)^2
bn_add(k1, k2, f2); // f2 = f1^2 + f2^2 = F(2k+1) now
if (n & i) {
bn_swap(f1, f2); // f1 = F(2k+1)
bn_add(f1, f2, f2); // f2 = F(2k+2)
}
}
...
}
```
改良前
```
reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.
Performance counter stats for './million':
16,024 cpu_core/cache-misses/ (0.00%)
43,849 cpu_core/cache-references/ (0.00%)
8,404,954 cpu_core/instructions/ (0.00%)
4,012,031 cpu_core/cycles/ (0.00%)
0.005958262 seconds time elapsed
0.005661000 seconds user
0.000000000 seconds sys
```
改良後
```
reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.
Performance counter stats for './million':
27,210 cpu_core/cache-misses/ (0.00%)
42,862 cpu_core/cache-references/ (0.00%)
8,325,720 cpu_core/instructions/ (0.00%)
4,075,598 cpu_core/cycles/ (0.00%)
0.005514773 seconds time elapsed
0.005042000 seconds user
0.000000000 seconds sys
```
* 以時間來看,改進了 0.000443489 seconds,約為 7 %,表示 `swap` 這個策略比 `memcpy()` 來的好
* instructions 也從原先 8,404,954 下降至 8,325,720,約為 0.9%
### 使用 `Q-Matrix` 進行改良
學習 [Q-Matrix](https://hackmd.io/@sysprog/linux2023-fibdrv-d#%E6%94%B9%E5%96%84%E6%96%B9%E6%A1%88-2-%E9%81%8B%E7%94%A8-Q-Matrix) 如何使用
參考公式如下
\begin{split}
F(2k-1) &= F(k)^2+F(k-1)^2 \\
F(2k) &= F(k)[2F(k-1) + F(k)]
\end{split}
得到以下測試結果
```
reading from /dev/fibonacci at ofset 1000, returned the sequence 43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875.
Performance counter stats for './million':
9,620 cpu_core/cache-misses/ (0.00%)
48,474 cpu_core/cache-references/ (0.00%)
8,335,893 cpu_core/instructions/ (0.00%)
4,094,925 cpu_core/cycles/ (0.00%)
0.005139802 seconds time elapsed
0.004427000 seconds user
0.000000000 seconds sys
```
* 時間從 0.005514773 seconds 降至 0.005139802 seconds, 約為 6%
### Implement Hash Table to save Fibonacci.
參考 [Linux 核心的 hash table 實作](https://hackmd.io/@sysprog/linux-hashtable) 來設計 Hash table
:::warning
儘量使用 Linux 核心提供的標頭檔和 API。
:notes: jserv
:::
:::success
目前已使用老師說的標頭檔和API了
:notes: huaxin
:::
## 資料結構
```c
struct hlist_node {
struct hlist_node *next, **pprev;
};
struct hlist_head {
struct hlist_node *first;
};
typedef struct {
int bits;
struct hlist_head *ht;
} map_t;
struct hash_key {
int key;
void *data;
struct hlist_node node;
};
```
宣告出 `pprev` (指標的指標) 並不指向上個節點,而是指向上個節點的 next 指標,當要刪除的節點是首個節點,可通過 `*pprev = next` 直接修改指標的指向。詳細內容請看[Linux 核心的 hash table 實作](https://hackmd.io/@sysprog/linux-hashtable)
為保存我們需要的數值,定義出以下 `struct`
```c
struct bn_hashtable {
unsigned int id;
struct _bn *bn_object;
struct hlist_node node;
};
```
示意圖如下:
```graphviz
digraph G {
rankdir=LR;
node[shape=record];
map [label="hlist_head.first |<ht0> |<ht1> |<ht2> |<ht3> |<ht4> |<ht5> |<ht7> |<ht8> "];
node[shape=none]
null1 [label=NULL]
null2 [label=NULL]
subgraph cluster_A {
label="id"
subgraph cluster_bn {
style=filled;
color=orange;
node [shape=record]
bn1 [label="{number|size}"]
label=bn
}
subgraph cluster_1 {
style=filled;
color=lightgrey;
node [shape=record];
hn1 [label="hlist_node | {<prev>pprev | <next>next}"];
label="hash_key 1"
}
}
subgraph cluster_3 {
style=filled;
color=lightgrey;
node [shape=record];
hn3 [label="hlist_node | {<prev>pprev | <next>next}"];
label="hash_key 3"
}
map:ht1 -> hn1
hn1:next -> null1
// hn2:next -> null1
// hn2:prev:s -> hn1:next:s
map:ht5 -> hn3
hn3:next -> null2
hn1:prev:s -> map:ht1
hn3:prev:s -> map:ht5
}
```
目前想先從 `n = 100`做起,所以先定義出 128 個 buckets
```c
#define DEFAULT_HASHTABLE_LENGTH 7
// define a hash table with 2^7(=128) buckets
DEFINE_HASHTABLE(htable, DEFAULT_HASHTABLE_LENGTH);
// => struct hlist_head htable[128] = { [0 ... 127] = HLIST_HEAD_INIT };
```
目前考慮到 fast doubling 和 Fibonacci 數的特性,不用保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,想先保存 第 N 和第 2N 個數值
參考 [Matrix Difference Equation for Fibonacci Sequence](https://chunminchang.github.io/blog/post/matrix-difference-equation-for-fibonacci-sequence),把 Fibonacci Q-Matrix 公式重新推導一次,已理解以下 Fast Doubling 的由來
\begin{split}
{F_{2n+1}} &= {F_{n+1}}^2+{F_{n}}^2 \\
F_{2n} &= F_n(F_{n+1} + F_{n-1}) \\
&= F_n(F_{n+1} + (F_{n+1} - F_n)) \\
&= F_n( 2F_{n+1} - F_n)
\end{split}
參考 [Calculating Fibonacci Numbers by Fast Doubling](https://chunminchang.github.io/blog/post/calculating-fibonacci-numbers-by-fast-doubling)
學習下方方法,可以幫助我們找到目標值,如下
\begin{align*}
\begin{pmatrix} a
= F_0 \\ b = F_1 \end{pmatrix} & \xrightarrow{2n+1, 2n+2}
\begin{pmatrix} F_1 \\ F_2 \end{pmatrix} \xrightarrow{2n, 2n+1}
\begin{pmatrix} F_2 \\ F_3 \end{pmatrix} \xrightarrow{2n+1, 2n+2}
\begin{pmatrix} F_5 \\ F_6 \end{pmatrix} \xrightarrow{2n, 2n+1}
\begin{pmatrix} F_{10} \\ F_{11} \end{pmatrix}
\end{align*}
舉 f(100) 為例子,說明 `bn_fib_doubling` 如何運算出 count, n = 100,經`i = 1U << (31 - __builtin_clz(n))`,會先將 100 轉為 1100100(binary),再執行 `__builtin_clz(1100100)`,可以得到 24,結果 i 可得 1000000。
[Other Builtins](https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html)
> Built-in Function: int __builtin_clz (unsigned int x)
Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
經過 `(n & i) ? (count << 1) + 1 : count << 1` 可得到每一次 for loop 的 count 值,依序為
0000001 = (1100100 & 1000000) ? (0000000 << 1) + 1 : 0000000 << 1
0000011 = (1100100 & 0100000) ? (0000001 << 1) + 1 : 0000001 << 1
0000110 = (1100100 & 0010000) ? (0000011 << 1) + 1 : 0000011 << 1
0001100 = (1100100 & 0001000) ? (0000110 << 1) + 1 : 0000110 << 1
0011001 = (1100100 & 0000100) ? (0001100 << 1) + 1 : 0001100 << 1
0110010 = (1100100 & 0000010) ? (0011001 << 1) + 1 : 0011001 << 1
1100100 = (1100100 & 0000001) ? (0110010 << 1) + 1 : 0110010 << 1
算出 count 之後,將用於 hashtable 中,儲存我們每一階段計算好的 fibonacci ,有f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100)
```c
void bn_fib_doubling(bn *dest, unsigned int n)
{
bn_resize(dest, 1);
if (n < 2) { // Fib(0) = 0, Fib(1) = 1
dest->number[0] = n;
return;
}
bn *f1 = dest; /* F(k) */
bn *f2 = bn_alloc(1); /* F(k+1) */
f1->number[0] = 0;
f2->number[0] = 1;
bn *k1 = bn_alloc(1);
bn *k2 = bn_alloc(1);
// Follow the rule as below
// f(0) -----> f(1) -----> f(2) -----> f(5) -----> f(10) ......
// 2n+1 2n 2n+1 2n
unsigned int count = 0;
struct bn_hashtable *hash_t =
kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
hash_t->id = count;
bn *tmp = bn_alloc(1);
bn_cpy(tmp, dest);
hash_t->bn_object = tmp;
hash_add(htable, &hash_t->node, hash_t->id);
// Take for example f(100) = 1100100 , initial i is 1100100, following i as
// 1000000, 0100000, 0000000, 0000000, 0000100, 0000000, 0000000
for (unsigned int i = 1U << (31 - __builtin_clz(n)); i; i >>= 1) {
count = (n & i) ? (count << 1) + 1 : count << 1;
/* F(2k) = F(k) * [ 2 * F(k+1) – F(k) ] */
/* F(2k+1) = F(k)^2 + F(k+1)^2 */
bn_lshift(f2, 1, k1); // k1 = 2 * F(k+1)
bn_sub(k1, f1, k1); // k1 = 2 * F(k+1) – F(k)
bn_mult(k1, f1, k2); // k2 = k1 * f1 = F(2k)
bn_mult(f1, f1, k1); // k1 = F(k)^2
bn_swap(f1, k2); // f1 <-> k2, f1 = F(2k) now
bn_mult(f2, f2, k2); // k2 = F(k+1)^2
bn_add(k1, k2, f2); // f2 = f1^2 + f2^2 = F(2k+1) now
// Below description is for "if (n & i)".
// if i is 1000000, then do 1100100 & 1000000. Finally, I would get
// true. if i is 0100000, then do 1100100 & 0100000. Finally, I would
// get true. if i is 0000000, then do 1100100 & 0000000. Finally, I
// would get false. if i is 0000000, then do 1100100 & 0000000. Finally,
// I would get false. if i is 0000100, then do 1100100 & 0000100.
// Finally, I would get true. if i is 1000000, then do 1100100 &
// 1000000. Finally, I would get false.
if (n & i) { // odd
bn_swap(f1, f2); // f1 = F(2k+1)
bn_add(f1, f2, f2); // f2 = F(2k+2)
}
struct bn_hashtable *hash_t_loop =
kmalloc(sizeof(struct bn_hashtable), GFP_KERNEL);
hash_t_loop->id = count;
bn *tmp_loop = bn_alloc(1);
bn_cpy(tmp_loop, dest);
hash_t_loop->bn_object = tmp_loop;
hash_add(htable, &hash_t_loop->node, hash_t_loop->id);
}
bn_free(f2);
bn_free(k1);
bn_free(k2);
}
```
當我們要嘗試驗證我們儲存的資料是否正確,我們需要先確認好儲存位置,分別為f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100),再針對key 設為 0,1,3,6,12,25,50,100
```c
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
int len = 0;
bn *fib = bn_alloc(1);
bn_fib_doubling(fib, *offset);
unsigned int key = 50; // 以f(100)為例,這裡會設定 key 為 0, 1, 3, 6, 12, 25,50,100 ,驗證是否儲存正確
struct bn_hashtable *hash_t_;
hash_for_each_possible(htable, hash_t_, node, key) {
if(hash_t_->id == key) {
len = hash_t_->bn_object->size;
copy_to_user(buf, hash_t_->bn_object->number, sizeof(unsigned int)*len);
printk(KERN_INFO "get target %d \n", key);
bn_free(fib);
return len;
}
}
kfree(hash_t_);
bn_free(fib);
return len;
}
```
可得到以下結果,確定 f(0), f(1), f(3), f(6), f(12), f(25), f(50), f(100) 在 hashtable 中儲存是正確的
```
reading from /dev/fibonacci at ofset 100, returned the sequence 0.
reading from /dev/fibonacci at ofset 100, returned the sequence 1.
reading from /dev/fibonacci at ofset 100, returned the sequence 2.
reading from /dev/fibonacci at ofset 100, returned the sequence 8.
reading from /dev/fibonacci at ofset 100, returned the sequence 144.
reading from /dev/fibonacci at ofset 100, returned the sequence 75025.
reading from /dev/fibonacci at ofset 100, returned the sequence 12586269025.
reading from /dev/fibonacci at ofset 100, returned the sequence 354224848179261915075.
```
:::danger
文字訊息「不要」用圖片展現,尊重視覺障礙者閱讀的權益。
:notes: jserv
:::
上圖結果和 [WolframAlpha](https://www.wolframalpha.com/input?i=fib%281000000%29) 一致
也可以透過 `dmeg` 檢查我們的是否成功儲存我們的數值,命令如下
```shell
dmesg |grep target
```
```
[187815.683518] get target 1
[187822.400863] get target 3
[187829.297017] get target 6
[187836.337402] get target 12
[187843.613414] get target 25
[187851.664775] get target 50
[187857.644399] get target 100
```
我們考慮 fast doubling 和 Fibonacci 數的特性,所以不保存連續數值,而是關注第 N 個和第 2N 個 Fibonacci 數的關聯,儲存第 N 個和第 2N 個 Fibonacci 數,降低記憶體開銷。
:::warning
注意用語,參見:
* [資訊科技詞彙翻譯](https://hackmd.io/@sysprog/it-vocabulary)
* [詞彙對照表](https://hackmd.io/@l10n-tw/glossaries)
:::
### 當 fibdrv 嘗試釋放佔用的記憶體空間時,應有對應的操作
針對我們的 hashtable 逐一釋放記憶體
```c
/*release hashtable*/
static void hashtable_release(void)
{
struct bn_hashtable *pos = NULL;
struct hlist_node *n = NULL;
int bucket;
for (bucket = 0; bucket < (1U << DEFAULT_HASHTABLE_LENGTH); ++bucket) {
hlist_for_each_entry_safe(pos, n, &htable[bucket], node)
{
bn_free(pos->bn_object);
hlist_del(&pos->node);
kfree(pos);
}
}
}
```
參考 [The `__init` and `__exit` Macros](https://tldp.org/LDP/lkmpg/2.4/html/x281.htm)其中提到以下內容
> The `__exit` macro causes the omission of the function when the module is built into the kernel, and like `__exit`, has no effect for loadable modules. Again, `if you consider when the cleanup function runs, this makes complete sense;`
於是便把 `hashtable_release` 放在 `__exit` 內部
```diff
static void __exit exit_fib_dev(void)
{
// mutex_destroy(&fib_mutex);
device_destroy(fib_class, fib_dev);
class_destroy(fib_class);
cdev_del(fib_cdev);
unregister_chrdev_region(fib_dev, 1);
+ hashtable_release();
}
```
### 引入 workqueue,將運算要求分派給個別 CPU 核,並確保降低非必要的同步處理成本
研究中.....
〈[Concurrency Managed Workqueue(cmwq)](https://www.kernel.org/doc/html/latest/core-api/workqueue.html#concurrency-managed-workqueue-cmwq)〉提到,cmwq 是 wq 的改進,專注以下目標
* Maintain compatibility with the original workqueue API
* Use per-CPU unified worker pools shared by all wq to provide flexible level of concurrency on demand without wasting a lot of resource.
* Automatically regulate worker pool and level of concurrency