# 2022q1 Homework3 (fibdrv) contributed by < [Wallmountain](https://github.com/Wallmountain) > ## 實驗環境 ``` Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 39 bits physical, 48 bits virtual CPU(s): 12 On-line CPU(s) list: 0-11 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 158 Model name: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz Stepping: 10 CPU MHz: 1358.240 CPU max MHz: 4100.0000 CPU min MHz: 800.0000 BogoMIPS: 4399.99 Virtualization: VT-x L1d cache: 192 KiB L1i cache: 192 KiB L2 cache: 1.5 MiB L3 cache: 9 MiB NUMA node0 CPU(s): 0-11 ``` ## 前期準備 - 檢查 Linux 核心版本 ```shell $ uname -r 5.11.0-27-generic ``` - 安裝 linux-headers 套件 ```shell $ sudo apt install linux-headers-`uname -r` Reading package lists... Done Building dependency tree Reading state information... Done linux-headers-5.11.0-27-generic is already the newest version (5.11.0-27.29~20.04.1). 0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded. ``` - 確認 linux-headers 套件已正確安裝於開發環境 ```shell $ dpkg -L linux-headers-5.11.0-27-generic | grep "/lib/modules" /lib/modules /lib/modules/5.11.0-27-generic /lib/modules/5.11.0-27-generic/build ``` - 檢驗目前的使用者身份 ```shell $ whoami wallmountain $ sudo whoami root ``` - 編譯並測試 ```shell $ make check make -C /lib/modules/5.11.0-27-generic/build M=/home/wallmountain/fibdrv modules make[1]: Entering directory '/usr/src/linux-headers-5.11.0-27-generic' make[1]: Leaving directory '/usr/src/linux-headers-5.11.0-27-generic' make unload make[1]: Entering directory '/home/wallmountain/fibdrv' sudo rmmod fibdrv || true >/dev/null rmmod: ERROR: Module fibdrv is not currently loaded make[1]: Leaving directory '/home/wallmountain/fibdrv' make load make[1]: Entering directory '/home/wallmountain/fibdrv' sudo insmod fibdrv.ko make[1]: Leaving directory '/home/wallmountain/fibdrv' sudo ./client > out make unload make[1]: Entering directory '/home/wallmountain/fibdrv' sudo rmmod fibdrv || true >/dev/null make[1]: Leaving directory '/home/wallmountain/fibdrv' Passed [-] f(93) fail input: 7540113804746346429 expected: 12200160415121876738 ``` - 觀察產生的 fibdrv.ko 核心模組 ```shell $ modinfo fibdrv.ko filename: /home/wallmountain/fibdrv/fibdrv.ko version: 0.1 description: Fibonacci engine driver author: National Cheng Kung University, Taiwan license: Dual MIT/GPL srcversion: B8BF0BDBE87ACE60588858C depends: retpoline: Y name: fibdrv vermagic: 5.11.0-27-generic SMP mod_unload modversions ``` ## Linux 效能分析 ### 查看行程的 CPU affinity ```shell $ taskset -p 1 pid 1's current affinity mask: fff ``` 代表該行程可以在第 0 到第 11 個 CPU 核心中執行 ### 將行程固定在特定的 CPU 中執行 ```shell $ taskset -cp 3 1 pid 1's current affinity list: 0-11 pid 1's new affinity list: 3 ``` ### 排除干擾效能分析的因素 - 抑制 address space layout randomization (ASLR) ```shell $ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space" ``` - 針對 Intel 處理器,關閉 turbo mode ```shell $ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo" ``` ## 量測時間前置準備 ### kernel 量測時間 在 kernel 中 使用 [ktime 相關的 API](https://www.kernel.org/doc/html/latest/core-api/timekeeping.html) 量測時間, 並使用作業講解的方式, 在 read 中量測時間,參考 [laneser](https://hackmd.io/@laneser/fibdrv) 用了 array of function pointer, 可以避免一直修改要呼叫的函式 ```cpp static long long (*fib_func[])(long long) = {fib_sequence, fib_fast_doubly_sequence}; static long long fib_time_proxy(long long k, size_t func_count) { kt = ktime_get(); long long result = (fib_func[func_count])(k); kt = ktime_sub(ktime_get(), kt); return result; } static ssize_t fib_read(struct file *file, char *buf, size_t size, loff_t *offset) { return (ssize_t) fib_time_proxy(*offset, size); } ``` 並使用 write 回傳量測到的時間 ```cpp static ssize_t fib_write(struct file *file, const char *buf, size_t size, loff_t *offset) { return ktime_to_ns(kt); } ``` ### user space 量測時間 使用 ```clock_gettime``` 取得時間 ```cpp struct timespec t1, t2; clock_gettime(CLOCK_MONOTONIC, &t1); read(fd, write_buf, func); write(fd, write_buf, func); clock_gettime(CLOCK_MONOTONIC, &t2); t[n] = (long long) (t2.tv_sec * 1e9 + t2.tv_nsec) - (t1.tv_sec * 1e9 + t1.tv_nsec); ``` ### 用統計手法去除極端值 參考 [KYG-yaya573142 的報告](https://hackmd.io/@KYWeng/rkGdultSU), 實作對 kernel fib 函式的時間量測和處理 :::spoiler ```cpp #include <fcntl.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <unistd.h> #define FIB_DEV "/dev/fibonacci" #define sample_size 1000 int main(int argc, char **argv) { if (argc < 2) return 0; char write_buf[] = "testing writing"; int offset = 100; int func = atoi(argv[1]); int fd = open(FIB_DEV, O_RDWR); if (fd < 0) { perror("Failed to open character device"); exit(1); } /* for each F(i), measure sample_size times of data and * remove outliers based on the 95% confidence level */ for (int i = 0; i <= offset; i++) { lseek(fd, i, SEEK_SET); double t[sample_size] = {0}; double mean = 0.0, sd = 0.0, result = 0.0; int count = 0; for (int n = 0; n < sample_size; n++) { /* sampling */ /* get the runtime in kernel space here */ read(fd, write_buf, func); t[n] = (double) write(fd, write_buf, func); mean += t[n]; /* sum */ } mean /= sample_size; /* mean */ for (int n = 0; n < sample_size; n++) { sd += (t[n] - mean) * (t[n] - mean); } sd = sqrt(sd / (sample_size - 1)); /* standard deviation */ for (int n = 0; n < sample_size; n++) { /* remove outliers */ if (t[n] <= (mean + 2 * sd) && t[n] >= (mean - 2 * sd)) { result += t[n]; count++; } } result /= count; printf("%d %.5lf samples: %d\n", i, result, count); } close(fd); return 0; } ``` ::: 其中利用 main 的傳入值選擇要使用的 kernel function 對應前面所用的 array of function pointer ```shell $ taskset -c $CPUID./kernel_fib 0 > kernel_fib.txt ``` ## 實作費氏數列 ### fib_sequence 首先, 對尚未修改的 fib_sequence 函式作時間的量測 ```cpp static long long fib_sequence(long long k) { /* FIXME: use clz/ctz and fast algorithms to speed up */ long long f[k + 2]; f[0] = 0; f[1] = 1; for (int i = 2; i <= k; i++) { f[i] = f[i - 1] + f[i - 2]; } return f[k]; } ``` ![](https://i.imgur.com/iGKizLl.png) - 並比較 kernel 和 user space 的花費時間 - 可以看到處理 system call 花費蠻多時間 ![](https://i.imgur.com/cpwsiX6.png) ### fib_fast_doubling 使用 [fast_doubling](https://chunminchang.github.io/blog/post/calculating-fibonacci-numbers-by-fast-doubling) 的手法, 進行加速 而簡化過後的算法為 - $F(2k)=F(k)[2F(k+1)−F(k)]$ - $F(2k+1)=F(k+1)^2 + F(k)^2$ ```cpp static long long fib_fast_doubling_sequence(long long k) { unsigned int h = 0; for (unsigned int i = k; i; ++h, i >>= 1) ; long long f[2]; f[0] = 0; f[1] = 1; for (long long mask = 1 << (h - 1); mask; mask >>= 1) { long long a = f[0] * (2 * f[1] - f[0]); long long b = f[0] * f[0] + f[1] * f[1]; if (mask & k) { f[0] = b; f[1] = a + b; } else { f[0] = a; f[1] = b; } } return f[0]; } ``` 從下圖, 可看見明顯的加速, 原因也可從兩種算法的時間複雜度簡單得知 - fib_sequence 時間複雜度 : $O(n)$ - fast doubling 時間複雜度 : $O(log n)$ ![](https://i.imgur.com/BlRcaGy.png) #### 使用 clz 進行簡化 :::spoiler ```cpp static long long fib_fast_doubling_sequence_clz(long long k) { long long f[2]; f[0] = 0; f[1] = 1; for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) { long long a = f[0] * (2 * f[1] - f[0]); long long b = f[0] * f[0] + f[1] * f[1]; if (mask & k) { f[0] = b; f[1] = a + b; } else { f[0] = a; f[1] = b; } } return f[0]; } ``` ::: - 先去除數字 MSB 起算的開頭 0 位元 - 利用 clz 減去要去計算數字 MSB 起算的開頭 0 位元 原本的算法時間複雜度為 $O(logn)$ ```cpp unsigned int h = 0; for (unsigned int i = k; i; ++h, i >>= 1) ; for (long long mask = 1 << (h - 1); mask; mask >>= 1) { } ``` clz 的算法可達到常數時間, 且能讓程式碼變得更簡潔 ```cpp for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) { } ``` - 思考為何加速的幅度不大 ![](https://i.imgur.com/NKUBl0G.png) :::warning TODO: 查閱上述 CPU 指令的 latency,見 [Instruction tables](https://www.agner.org/optimize/instruction_tables.pdf) :notes: jserv ::: ## 減少乘法運算的成本 ```cpp static long long long_mul(long long a, long long b) { long long ans = 0; while (b != 0) { long long t = -b & b; int r = __builtin_ctzll(b); ans += a << r; b ^= t; } return ans; } static long long fib_fast_doubling_sequence_clz_pr(long long k) { long long f[2]; f[0] = 0; f[1] = 1; for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) { long long a = long_mul(f[0], (f[1] << 1) - f[0]); long long b = long_mul(f[0], f[0]) + long_mul(f[1], f[1]); if (mask & k) { f[0] = b; f[1] = a + b; } else { f[0] = a; f[1] = b; } } return f[0]; } ``` ![](https://i.imgur.com/UOHF0xt.png) ## 實作 big num ### 資料結構 - 使用字串來儲存大數 ```cpp #define bnum_size unsigned int #define bnum_digit unsigned char typedef struct bnum { bnum_digit *ptr; bnum_size size; bnum_size capacity; } bnum; ``` 實作方式是根據 [```Binary-coded decimal```](https://en.wikipedia.org/wiki/Binary-coded_decimal) ,利用任何單一十進制數字都能用四位元二進制數值表示,因此在結構中選擇使用 ```unsigned char``` , ```char``` 大小為8個位元,可以擺進兩個十進制數字,這個方式也稱為 packed BCD。 使用 ```unsigned``` 是因為在運算時的方便,可以直接進行加減,在一開始的實作沒有考慮到 ```char``` 的加減也有正負值的問題,因此在計算時總是出現預料外的結果,於是最後才決定使用 ```unsigned char```。 使用 ```capacity``` 紀錄目前的結構體已經跟系統要了多大的空間,可以避免因為不知道當前有多少空間,而一直多要空間,造成的空間和時間上的浪費。 缺點 :-1: : - 跟一般二進制編碼比起來,多浪費約20%的空間 - 進行加減法時,因為還是使用二進制的進位模式,所以一旦進位,就得補6,故必須多花費時間去判斷是否有進位 ### 運算函式 #### BNUM_ALLOC 在其他的運算函式中,都會使用,從 ```capacity``` 判斷這個要存入運算數值的結構體是否已經要了足夠大小的空間並把數值清0 一開始,就先判斷運算後至少需要多少空間,一次性呼叫,減少動態配置記憶體的次數,也能減少大數運算的時間 ```cpp #define bnum_ptr_resize(x, n) \ do { \ if (n > x->capacity) { \ x->size = 0; \ x->capacity = n; \ x->ptr = krealloc(x->ptr, n + 1, GFP_KERNEL); \ } \ memset(x->ptr, '\0', x->capacity); \ } while (0) #define bnum_ptr_new(x, n) \ do { \ x->capacity = n; \ x->ptr = kmalloc(n + 1, GFP_KERNEL); \ } while (0) #define BNUM_ALLOC(x, n) \ do { \ bnum *__x = (x); \ bnum_size __n = (n); \ __n = (__n >> 1) + (__n & 1); \ if (__x->ptr) \ bnum_ptr_resize(__x, __n); \ else \ bnum_ptr_new(__x, __n); \ } while (0) ``` #### bnum_lshift 考慮到如果若是進位必須要補6,先將左移一的結果直接存入 ```dest``` ,再從 ```a``` 中判斷是否進位,以十進制的角度,若是當前位數的數字不小於5,則左移一時會進位。 ```cpp void bnum_lshift(bnum *dest, bnum *a) { BNUM_ALLOC(dest, a->size + 1); bnum_digit *u = dest->ptr, *v = a->ptr; const unsigned size = (a->size + 1) >> 1; for (int i = 0; i < size; ++i) { *u += *v << 1; *u += ((*v & MASK_L) >= 5U) ? 6U : 0; if ((*v++ >> 4) >= 5U) { *u += 96U; *(++u) = 1; } else u++; } bnum_cal_size(dest, a->size); } ``` > 待完成 : 可以左移任意位數 #### bnum_add 在加法中,主要分成三個部分 1. 因為 a, b 兩個的 size 不一定相同,因此第一步驟為將兩數相加直到長度較小的數結束,並回傳是否最後還有進位 2. 若還有進位,將接下來的函式將剩餘 size 較大的數值繼續和進位作加法直到沒有進位為止 3. 最後就是把剩餘 size 較大的數值複製進儲存運算結果的結構體中 ```cpp void bnum_add(bnum *total, const bnum *a, const bnum *b) { bnum_size n = digit_max(a->size, b->size); BNUM_ALLOC(total, n + 1); digit_add(total, a->ptr, (a->size + 1) >> 1, b->ptr, (b->size + 1) >> 1); bnum_cal_size(total, n); } void digit_add(bnum *total, const bnum_digit *a, bnum_size asize, const bnum_digit *b, bnum_size bsize) { if (asize > bsize) { bnum_size i = bsize; int carry = digit_add_c(total->ptr, a, b, bsize); if (carry) i += digit_addi(&total->ptr[bsize], &a[bsize]); digit_cpy(&total->ptr[i], &a[i], asize - i + 1); } else { bnum_size i = asize; int carry = digit_add_c(total->ptr, a, b, asize); if (carry) i += digit_addi(&total->ptr[asize], &a[asize]); digit_cpy(&total->ptr[i], &a[i], bsize - i + 1); } } int digit_add_c(bnum_digit *dest, const bnum_digit *a, const bnum_digit *b, bnum_size size) { int carry = 0; for (bnum_size i = 0; i < size; i++) { unsigned int u = a[i] + b[i] + 0x66U + carry; dest[i] = a[i] + b[i] + carry; if ((u ^ a[i] ^ b[i]) & 0x10) dest[i] += 6U; if ((carry = !!(u & 0x100))) dest[i] += 0x60U; } return carry; } bnum_size digit_addi(bnum_digit *dest, const bnum_digit *a) { int carry = 1; bnum_size i = 0; while (carry) { unsigned int u = a[i] + 0x66U + carry; dest[i] = a[i] + carry; if ((u ^ a[i]) & 0x10) dest[i] += 6U; if ((carry = !!(u & 0x100))) dest[i] += 0x60U; i++; } return i; } void digit_cpy(bnum_digit *dest, const bnum_digit *a, bnum_size size) { while (size-- != 0) { dest[size] = a[size]; } } ``` #### bnum_sub 減法的部分如果被減數的當前位數的數字小於減數,則需要借位,而借位給存在 ```char``` 較大的四個位元的數字直接借10,也就是 ```0xA0``` ,而在最後四個位元的數字,因為在運算時,會直接進行兩數減法,故要判斷是否有借位,減去借位時多給的6 ```cpp void bnum_sub(bnum *total, const bnum *a, const bnum *b) { int cmp = bnum_cmp(a, b); if (cmp > 0) { BNUM_ALLOC(total, a->size); digit_sub(total->ptr, a->ptr, (a->size + 1) >> 1, b->ptr, (b->size + 1) >> 1); bnum_size size = ((a->size + 1) >> 1) - 1; do { if (total->ptr[size]) { size = (size << 1) + 1 + !!(total->ptr[size] & 0xF0); break; } } while (size--); total->size = size; } else { BNUM_ALLOC(total, 1); total->size = 1; } } void digit_sub(bnum_digit *total, const bnum_digit *a, bnum_size asize, const bnum_digit *b, bnum_size bsize) { bnum_size size = bsize; int carry = 0; for (bnum_size i = 0; i < size; i++) { if (a[i] < b[i] + carry) { total[i] = 0xA0U - b[i] + a[i] - carry; carry = 1; } else { total[i] = a[i] - b[i] - carry; carry = 0; } if ((total[i] & MASK_L) > (a[i] & MASK_L)) total[i] -= 6U; } while (carry) { if (!a[size]) { total[size] = 0x99U; } else { total[size] = a[size] - carry; if ((total[size] ^ a[size]) & 0x10U) total[size] -= 6U; carry = 0; } ++size; } for (; size < asize; ++size) total[size] = a[size]; } ``` #### bnum_mul 將數字儲存的方式先改成一般的二進制,直接進行乘法,再轉換成 Binary-coded decimal > 缺點: 多次呼叫除法和取餘數,不是一個好的算法,看能否改進成沒有除法的實作方式 ```cpp int digit_ctoi(bnum_digit a) { return (a & MASK_L) + 10 * ((a & MASK_R) >> 4); } void bnum_mul(bnum *total, const bnum *a, const bnum *b) { BNUM_ALLOC(total, a->size + b->size); bnum_size asize = ((a->size + 1) >> 1) + 1; for (bnum_size i = 0, size = (b->size + 1) >> 1; i < size; ++i) { if (b->ptr[i]) digit_mul_add(&total->ptr[i], a->ptr, asize, digit_ctoi(b->ptr[i])); } bnum_cal_size(total, a->size + b->size - 1); } void digit_mul_add(bnum_digit *dest, const bnum_digit *a, bnum_size asize, int b) { int carry = 0; for (bnum_size i = 0; i < asize; ++i) { carry += digit_ctoi(a[i]) * b + digit_ctoi(dest[i]); dest[i] = carry % 10; carry /= 10; dest[i] += carry % 10 << 4; carry /= 10; } } ``` ### 效能分析 跟教授提供的 [bignum](https://github.com/sysprog21/bignum) 作比較,因為效能遠遠不及,因此先分析 ref 的實作 ![](https://i.imgur.com/gjd9Vjg.png) ### 分析 bignum 實作