# 2022q1 Homework3 (fibdrv) contributed by < `Kevin-Shih` > ## 作業環境 **硬體** - CPU: Intel® Core™ i5-7300HQ CPU @ 2.50GHz × 4 :::spoiler lscpu ``` $ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 39 bits physical, 48 bits virtual CPU(s): 4 On-line CPU(s) list: 0-3 Thread(s) per core: 1 Core(s) per socket: 4 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 158 Model name: Intel(R) Core(TM) i5-7300HQ CPU @ 2.50GHz Stepping: 9 CPU MHz: 2500.000 CPU max MHz: 3500.0000 CPU min MHz: 800.0000 BogoMIPS: 4999.90 Virtualization: VT-x L1d cache: 128 KiB L1i cache: 128 KiB L2 cache: 1 MiB L3 cache: 6 MiB NUMA node0 CPU(s): 0-3 ``` ::: - RAM: 4GiB DDR4 @ 2133MHz x 2 (Dual channel) :::spoiler dmidecode -t memory ``` $ sudo dmidecode -t memory # dmidecode 3.2 Handle 0x003F, DMI type 17, 40 bytes Memory Device Array Handle: 0x003E Error Information Handle: Not Provided Total Width: 64 bits Data Width: 64 bits Size: 4096 MB Form Factor: SODIMM Set: None Locator: ChannelA-DIMM0 Bank Locator: BANK 0 Type: DDR4 Type Detail: Synchronous Speed: 2133 MT/s Manufacturer: 859B Part Number: CT4G4SFS8213.C8FBR2 Rank: 1 Configured Memory Speed: 2133 MT/s Handle 0x0040, DMI type 17, 40 bytes Memory Device Array Handle: 0x003E Error Information Handle: Not Provided Total Width: 64 bits Data Width: 64 bits Size: 4096 MB Form Factor: SODIMM Set: None Locator: ChannelB-DIMM0 Bank Locator: BANK 2 Type: DDR4 Type Detail: Synchronous Speed: 2133 MT/s Manufacturer: SK Hynix Part Number: HMA451S6AFR8N-TF Rank: 1 Configured Memory Speed: 2133 MT/s ``` ::: **作業系統** - OS: Ubuntu 20.04.4 LTS (Focal Fossa) - Linux kernel: 5.13.0-35-generic ``` $ uname -a Linux Shih-MSI 5.13.0-35-generic #40~20.04.1-Ubuntu SMP Mon Mar 7 09:18:32 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux ``` **編譯器** ``` $ gcc --version gcc (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0 ``` ## 修正 C99 VLA is not allowed in Linux kernel 依照 Fibonacci 數列的定義 $f(i)=f(i-1)+f(i-2)$ 可以發現其實只須 3 個變數即可迭代計算出 $f(k)$ 修正後的 `fib_sequence` ```c static long long fib_sequence(long long k) { if (k < 2) return k; long long f_k = -1, f_k1 = 1, f_k2 = 0; for (int i = 2; i <= k; i++) { f_k = f_k1 + f_k2; // 計算 f(i) f_k2 = f_k1; // f_k2 = f(i - 1) = f((i + 1) - 2) f_k1 = f_k; // f_k1 = f(i) = f((i + 1) - 1) } return f_k; } ``` ## 建立一個盡可能減少干擾效能分析因素的環境 - 將 `cpu0` 隔離僅供本次要測試的程式使用 ``` GRUB_CMDLINE_LINUX = “isolcpus=0” ``` > 後續以 `taskset` 指定欲測試的程式使用 `cpu0` - 禁用 ASLR 關閉這項安全功能,避免效能衝擊。 - 關閉 turbo mode 關閉 intel cpu 的 turbo mode,避免 turbo state 時使用 pl2 及 turbo frequency 的效能與回歸 pl1 及 base frequency 後的效能不同,影響後續分析。 - 設定 `cpu0` 的 scaling_governor 為 performance 因先前隔離 `cpu0` 作為這次運行效能測試的核心,故僅將 `cpu0` 調整為 performance 簡單寫了個 shell script 方便後面一次執行 ```bash # perf_test_mode.sh # disable ASLR echo 0 > /proc/sys/kernel/randomize_va_space echo "cat /proc/sys/kernel/randomize_va_space" cat /proc/sys/kernel/randomize_va_space # turn off turbo mode echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo echo "" echo "cat /sys/devices/system/cpu/intel_pstate/no_turbo" cat /sys/devices/system/cpu/intel_pstate/no_turbo # turn cpu0 to perf mode echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor echo "" echo "cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor # show isolated cpu core echo "" echo "cat /sys/devices/system/cpu/isolated" cat /sys/devices/system/cpu/isolated # expect cpu0 is isolated ``` reboot 後執行該 shell script 的結果 ``` $ sudo bash perf_test_mode.sh cat /proc/sys/kernel/randomize_va_space 0 cat /sys/devices/system/cpu/intel_pstate/no_turbo 1 cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor performance cat /sys/devices/system/cpu/isolated 0 ``` ## kernel 及 user space 的效能分析 在修正了 VLA 的問題後,先試著量測最基本的算法在計算時的效能。 測試的方式是挪用原先空置的 `write()` 當 `client` 以帶參數的方式呼叫後,會以測試時間的模式執行程式並以其中的 `size` 參數作為選擇以哪種方式計算 Fibonacci 數列。 在 kernel space 使用 ktime api 量測,user space 則使用 `clock_gettime()` , kernel to user 則是透過 user space 的耗時減去 `write()` 回傳的 kernel space 耗時得出。 **kernel 的時間量測** ```c static ssize_t fib_write(struct file *file, const char *buf, size_t size, loff_t *offset) { switch (size) { case 0: // defalut (time measure) kt = ktime_get(); fib_sequence(*offset); kt = ktime_sub(ktime_get(), kt); return (ssize_t) ktime_to_ns(kt); case 1: // bignum (time measure) kt = ktime_get(); fib_sequence_bignum(*offset, NULL); kt = ktime_sub(ktime_get(), kt); return (ssize_t) ktime_to_ns(kt); case 2: // fast doubling (time measure) kt = ktime_get(); fib_sequence_fdoubling(*offset); kt = ktime_sub(ktime_get(), kt); return (ssize_t) ktime_to_ns(kt); case 3: // fast doubling2 (time measure) default: // make check return 1; } } ``` **user space 及 kernel to user 的時間量測** ```c if (argc == 2) { for (int i = 0; i <= offset; i++) { // calculate to 100th fibonacci for (int j = 0; j < 100; j++) { // calculate 100 times struct timespec t1, t2; lseek(fd, i, SEEK_SET); clock_gettime(CLOCK_MONOTONIC, &t1); sz = write(fd, write_buf, atoi(argv[1])); clock_gettime(CLOCK_MONOTONIC, &t2); long user_time; if ((t2.tv_nsec-t1.tv_nsec)<0) user_time = 1000000000 + t2.tv_nsec - t1.tv_nsec; else user_time = t2.tv_nsec - t1.tv_nsec; printf("%d %lld %ld %lld\n", i, sz, user_time, user_time - sz); } } return 0; } ``` 以 python script 濾除極端值 ```python def outlier_filter(datas, threshold = 1.96): # 1.96 sigma => 95% confidence datas = np.array(datas) z = np.abs((datas - datas.mean()) / datas.std()) return datas[z < threshold] def data_processing(data_set, n): catgories = data_set[0].shape[0] samples = data_set[0].shape[1] final = np.zeros((catgories, samples)) for c in range(catgories): for s in range(samples): final[c][s] = \ outlier_filter([data_set[i][c][s] for i in range(n)]).mean() return final ``` 參照作業要求中 [用統計手法去除極端值 ](https://hackmd.io/@sysprog/linux2022-fibdrv#%E7%94%A8%E7%B5%B1%E8%A8%88%E6%89%8B%E6%B3%95%E5%8E%BB%E9%99%A4%E6%A5%B5%E7%AB%AF%E5%80%BC) 中的手法,在假設所得的 100 筆 n^th^ fibonacci 的資料為常態分佈,以 Z-value 刪去大於等於 1.96 sigma (95% 信心區間) 的觀測量後,取平均值作為第 n^th^ fibonacci 計算時間。 :::spoiler gnuplot ```gnuplot # for kernel time set title "Execution Time" set xlabel "n_th Fibonacci" set ylabel "time (ns)" set terminal png font " Times_New_Roman,12 " set output "perf.png" set yrange [20:70] set xrange [0:100] set xtics 0, 10, 100 set key left plot \ "time_filtered.txt" using 1 with linespoints linewidth 2 title "kernel", \ # for user space and kernel to user time set title "Execution Time" set xlabel "n_th Fibonacci" set ylabel "time (ns)" set terminal png font " Times_New_Roman,12 " set output "perf_user.png" set yrange [700:800] set xrange [0:100] set xtics 0, 10, 100 set key left plot \ "time_filtered.txt" using 2 with linespoints linewidth 2 title "user", \ "time_filtered.txt" using 3 with linespoints linewidth 2 title "k to u", \ ``` ::: **僅修正 VLA (v0)** ``` sudo taskset -c 0 ./client 0 > time.txt ``` **kernel space** ![](https://i.imgur.com/XmEtELS.png) **user space and kernel to user** ![](https://i.imgur.com/ErsYajQ.png) 由於 kernel space 與 user space 的耗時相差過大,只好分別畫成兩張圖方便檢視,儘管 kernel to user (以下簡稱 k to u)的時間似乎非常大,但在重複測試多次後仍然出現相同情形,且 k to u 的時間大致持平十分合理,因此結果應是正確的。 另外由於最初的實做是使用迭代的方式計算,因此隨著計算的項數增加執行時間大致成線性增加。 ## 更快的計算 fast doubling 參考作業要求中的 fast doubling 作法 $$ \begin{split} F(2k) &= F(k)[2F(k+1) - F(k)] \\ F(2k+1) &= F(k+1)^2+F(k)^2 \end{split} $$ 從 `MSB - 1` 開始(跳過 f(0) ) 這個位置可以用 `__builtin_clzll` 計算 - 遇到 0 ,執行 fast doubling 後繼續下個 bit - 遇到 1 ,執行 fast doubling 後再算出 f(2k + 2) = f(2K + 1) + f(2K) 並繼續 ```c static long long fib_sequence_fdoubling(long long k) { if(k < 2) return k; uint64_t msb = 62 - __builtin_clzll(k); // msb - 1 since skip k = 0 uint64_t f0 = 1, f1 = 1; // k = 1 for (uint64_t mask = 1 << msb, t1 = 0, t2 = 0; mask ;mask >>= 1){ t1 = f0 * (2 * f1 - f0); // f(2k) t2 = f1 * f1 + f0 * f0; // f(2k + 1) f0 = t1; f1 = t2; // k *= 2 if (k & mask){ t1 = f0 + f1; // k++ f0 = f1; f1 = t1; } } return f0; } ``` **效能分析** ![](https://i.imgur.com/pZ2nolv.png) 相較於原先 iterative 的實做 fast doubling 無疑快上許多,時間複雜度從 O(n) 增加轉為 O(log~2~(n))。 ### 進一步修改 fast doubling 由於原先的 fast doubling 儘管省略了 f(0) 這一項,也使用了 `clz` 來取得 MSB 位置,但在運算時 f(2k) 時使用了 `2 * f1` 可以被 shift 操作取代以節省時間,且在算 f(2k + 2) 時多了不少無意義的賦值,因此做出了些許修正再重新測試。 ```c static long long fib_sequence_fdoubling2(long long k) { if(k < 2) return k; uint64_t msb = 62 - __builtin_clzll(k); // msb - 1 since skip k = 0 uint64_t f0 = 1, f1 = 1; // k = 1 for (uint64_t mask = 1 << msb, t1 = 0, t2 = 0; mask ;mask >>= 1){ t1 = f0 * ((f1 << 1) - f0); // f(2k) t2 = f1 * f1 + f0 * f0; // f(2k + 1) if (k & mask){ f0 = t2; f1 = t1 + t2; // k = 2k + 1 } else{ f0 = t1; f1 = t2; // k = 2k } } return f0; } ``` **效能分析** ![](https://i.imgur.com/QnNw9sY.png) 可以看到這些改變略為降低計算 Fabonacci sequence 的 cost。 ## 實做 big num 以計算 f(93) 以後的值 為了可以計算 f(93) 以後的值採用下列 `bigN` 結構 ```c typedef struct bigN { uint64_t val; struct list_head link; } bigN; ``` 因為實在是沒看懂作業要求中的 **數字字串及SSO** 的部份,因此參考 [freshLiver](https://hackmd.io/@freshLiver/2022q1-hw3-fibdrv#%E6%9B%B4%E7%B2%BE%E7%A2%BA%E7%9A%84-Fibonacci-%E9%81%8B%E7%AE%97%E5%AF%A6%E4%BD%9C) 的作法,使用 kernel 中的 linked list 將 64 bit uint 串起來作為 big num 的結構,以 linked list 串起後在將可以計算更後面的 Fabonacci sequence,目前測試可以正確計算至 f(2000) 共 418 位數。 在加入了 `bigN` 這個結構後也需要實做對應的加法 ```c static struct list_head *bigN_add(struct list_head *bigN1, struct list_head *bigN2) { struct list_head **n1 = &bigN1->next, **n2 = &bigN2->next; for (int carrybit = 0;; n1 = &(*n1)->next, n2 = &(*n2)->next) { if (*n2 == bigN2) { if (*n1 == bigN1) { if (carrybit) new_bignode(bigN2, 1); // add a node to @bigN2 with val = 1 break; } new_bignode(bigN2, 0); // add a node to @bigN2 with val = 0 } carrybit = add_64(list_entry(*n1, bigN, link)->val, &list_entry(*n2, bigN, link)->val, carrybit); } return bigN2; } ``` 將 2 個 list 相加,其中 `bigN1` 應為 $f(i-1)$ ,`bigN2` 應為 $f(i-2)$,算出的結果直接放入 `bigN2`。 ```c #define MAX_64 1000000000000000000UL // 10^18 < 2^63 static int add_64(uint64_t a, uint64_t *b, int carrybit) { if (a + *b + carrybit >= MAX_64) { *b = a + *b + carrybit - MAX_64; return 1; // return carrybit = 1 } *b += a + carrybit; return 0; // return carrybit = 0 } ``` 將一個 `bigN` node 的上限定在 10^18^ 方便後續 copy_to_user 時串接多個 node,且 10^18^ < 2^63^ 因此 a, b 可以直接相加而不 overflow 方便加法實做,但會造成空間的浪費。 ```c static char *bigN2string(struct list_head *list_head) { int fklen = (nodecount & 1ull) + (nodecount >> 1); char *result = (char *) kzalloc(fklen * 18 + 1, GFP_KERNEL); if (!result) return NULL; struct list_head *pos; list_for_each_prev(pos, list_head) { if (strlen(result) == 0) snprintf(result, 19, "%lld", list_entry(pos, bigN, link)->val); else snprintf(result + strlen(result), 19, "%018lld", list_entry(pos, bigN, link)->val); } return result; } ``` 因 MSB 存放在最後一個 node,而 LSB 在第一個 node,只要倒著走訪 list 中的節點並串接到要回傳的 `result` 字串即可最後的答案。 ```c static long long fib_sequence_bignum(long long k, char *buf) { if (k < 2) { char result[2]; snprintf(result, 2, "%lld", k); long long n = copy_to_user(buf, result, 2); return n; } struct list_head *fk1 = (struct list_head *) kmalloc( sizeof(struct list_head), GFP_KERNEL), *fk2 = (struct list_head *) kmalloc( sizeof(struct list_head), GFP_KERNEL), *temp; INIT_LIST_HEAD(fk1); INIT_LIST_HEAD(fk2); new_bignode(fk1, 1); new_bignode(fk2, 0); for (int i = 2; i <= k; i++) { temp = bigN_add(fk1, fk2); fk2 = fk1; fk1 = temp; } char *result = bigN2string(fk1); long long n = copy_to_user(buf, result, strlen(result)); kfree(result); freebigN(fk1); freebigN(fk2); return n; } ``` 先一樣採用單純的遞迴計算,可以正確的計算出 93 項以後的數 ``` at offset 90, returned the sequence 2880067194370816120. at offset 91, returned the sequence 4660046610375530309. at offset 92, returned the sequence 7540113804746346429. at offset 93, returned the sequence 12200160415121876738. at offset 94, returned the sequence 19740274219868223167. at offset 95, returned the sequence 31940434634990099905. at offset 96, returned the sequence 51680708854858323072. at offset 97, returned the sequence 83621143489848422977. at offset 98, returned the sequence 135301852344706746049. at offset 99, returned the sequence 218922995834555169026. at offset 100, returned the sequence 354224848179261915075. ``` 採用與先前相同的方法進行效能量測。 ![](https://i.imgur.com/iQWrsUw.png) 這次計算的相當慢,估計是先前 `bigN` 的結構及加法設計上浪費了 `uint64_t` 不少"範圍"(上限被定在 10^18^),且每次 kmalloc 一個新的 node 都是很大的 cost。 ``` |--73.77%--fib_sequence_bignum | | | |--48.24%--__kmalloc | | kmalloc_order_trace | | kmalloc_order ``` 在查看 perf 所紀錄的 caller graph 不意外的發現 kmalloc 佔用了近一半的時間,接下來該考慮改用不會浪費那多間的大數結構並希望能減少 kmalloc 造成的 cost。 ### 進一步改進 bignum 版本 改用數字字串來實做 bignum ```c static void bigN_add2(char *bigN1, char *bigN2, char *buf) { int i, carry = 0, sum, len1 = strlen(bigN1), len2 = strlen(bigN2); for (i = 0; i < len2; i++) { sum = bigN1[i] + bigN2[i] + carry - ('0' << 1); buf[i] = '0' + sum % 10; carry = sum / 10; } for (i = len2; i < len1; i++) { sum = bigN1[i] - '0' + carry; buf[i] = '0' + sum % 10; carry = sum / 10; } if (carry) buf[i] = '0' + carry; } // v4 bigN2 iterative static long long fib_sequence_bignum2(long long k, char *buf) { if (k < 2) { char result[2]; snprintf(result, 2, "%lld", k); long long n = copy_to_user(buf, result, 2); return n; } char *fk1 = (char *)kmalloc(1024, GFP_KERNEL), *fk2 = (char *)kmalloc(1024, GFP_KERNEL), *temp = (char *)kmalloc(1024, GFP_KERNEL); fk1[0] = '1', fk2[0] = '1'; // k = 2, k = 1 for (int i = 3; i <= k; i++) { bigN_add2(fk1, fk2, temp); strncpy(fk2, fk1, strlen(fk1)); strncpy(fk1, temp, strlen(temp)); } strrev(fk1); long long n = copy_to_user(buf, fk1, strlen(fk1)); return n; } ``` 先以 iterative 的方式實做修改過的 bignum,數字字串的部份採用固定 1024Byte 的 char array,數字依倒序儲存方便進行相加,並依作業要求中的介紹實做數字字串的相加,最後在將字串反轉回正序並 `copy_to_user` 即可。 將 `expected.txt` 改為 [The first 500 Fibonacci numbers](https://blog.abelotech.com/posts/first-500-fibonacci-numbers/#the-first-500-fibonacci-numbers) 中的答案後,將 `client.c` 的輸出格式調整好,並將 `MAX_LENGTH` 設為 500 後,以 `make check` 檢查實做的正確性。 省略部份輸出: ``` $ sudo make check make -C /lib/modules/5.13.0-35-generic/build M=/home/user/Senior/linux2022/fibdrv modules sudo ./client > out make unload make[1]: 進入目錄「/home/user/Senior/linux2022/fibdrv」 sudo rmmod fibdrv || true >/dev/null make[1]: 離開目錄「/home/user/Senior/linux2022/fibdrv」 Passed [-] ``` 接著進行效能分析: > 比較的是兩者的 kernel time ![](https://i.imgur.com/aaiVZnN.png) 可以發現在前 200 項相較前一個實做無疑快上不少,然而受使用 `strncpy()` 的拖累在後續項次數字變長後,反而比原先的實做還慢。 ### 再次改進 bignum 版本 ```c static char **bigN_add3(char **bigN1, char **bigN2) { int i, carry = 0, sum, len1 = strlen(*bigN1), len2 = strlen(*bigN2); for (i = 0; i < len2; i++) { sum = (*bigN1)[i] + (*bigN2)[i] + carry - ('0' << 1); (*bigN2)[i] = '0' + sum % 10; carry = sum / 10; } for (i = len2; i < len1; i++) { sum = (*bigN1)[i] - '0' + carry; (*bigN2)[i] = '0' + sum % 10; carry = sum / 10; } if (carry) (*bigN2)[i] = '0' + carry; return bigN2; } // v5 bigN3 iterative static long long fib_sequence_bignum3(long long k, char *buf) { if (k < 2) { char result[2]; snprintf(result, 2, "%lld", k); long long n = copy_to_user(buf, result, 2); return n; } char *buf1 = (char *) kmalloc(1024, GFP_KERNEL), *buf2 = (char *) kmalloc(1024, GFP_KERNEL); char **fk1 = &buf1, **fk2 = &buf2, **temp; buf1[0] = '1', buf2[0] = '1'; // k = 2, k = 1 for (int i = 3; i <= k; i++) { temp = bigN_add3(fk1, fk2); fk2 = fk1; fk1 = temp; } strrev(*fk1); long long n = copy_to_user(buf, *fk1, strlen(*fk1)); return n; } ``` 前一個版本受 `strncpy` 的拖累效能不佳,這個版本透過使用 indirect pointer 省去複製字串的消耗,也節省先前需要一個 temporary buffer 存放 `ADD` 運算的結果的空間。 同樣使用 make check 檢查前 500 項的結果 ``` $ sudo make check ... // 省略 Passed [-] ``` 效能分析 > 比較的是兩者的 kernel time ![](https://i.imgur.com/BZXLCwn.png) 儘管還是完全比不上 fast doubling 但比起前兩次實做,在擺脫耗時的 `strncpy` 後快上不少。 :::warning TODO 為 big number 實做 乘法 等運算,以實做 big number 版本的 fast doubling :::