2022q1 Homework3 (fibdrv)

# 2022q1 Homework3 (fibdrv) contributed by < [tommy2234](https://github.com/tommy2234/fibdrv) > 題目連結 [fibdrv](https://hackmd.io/@sysprog/linux2022-fibdrv#K04-fibdrv) ## 針對效能分析的環境準備 ### 將用來測試的 cpu 隔離 - 修改 /etc/default/grub 中的開機參數，在最後加上 ``` GRUB_CMDLINE_LINUX="isolcpus=7" ``` - 重新開機後確認 cpu affinity ```shell taskset -p 1 ``` 可以看到 mask 從 0xff 變成 0x7f，cpu 7 已經被隔離 `pid 1's current affinity mask: 7f` ### 排除干擾因素 - 抑制 [address space layout randomization](https://en.wikipedia.org/wiki/Address_space_layout_randomization) (ASLR) ```shell $ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space" ``` - 設定 scaling_governor 為 performance。 ``` for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor do echo performance > ${i} done ``` - 針對 Intel 處理器，關閉 turbo mode ``` $ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo" ``` --- ## 研讀費氏數列首先程式一編譯就出現這個 warning `warning: ISO C90 forbids variable length array ‘f’ [-Wvla]` 查看一下模組內容： ```c /* FIXME: C99 variable-length array (VLA) is not allowed in Linux kernel. */ ``` 原來 [Linux kernel 不支援 VLA](https://www.phoronix.com/scan.php?page=news_item&px=Linux-Kills-The-VLA) ，原因似乎是為了減少 overhead ，還有對 LLVM compiler 有更好的相容性。 ### First try 改成長度固定為二的陣列，因為 fib 在計算在只會同時用到前兩項的數值，因此我們只需要兩個區域，就像把兩個水杯的水倒來到去一樣將數值累加就好。 ```c static long long fib_sequence(long long k) { long long f[] = {0, 1}; for (int i = 2; i <= k; i++) { if(i & 1) f[1] += f[0]; else f[0] += f[1]; } return k & 1 ? f[1] : f[0]; } ``` ### Fast Doubling 根據以下式子 $$ \begin{align} F(2k) = F(k)[2F(k+1) - F(k)]\\ F(2k+1) = F(k+1)^2+F(k)^2 \end{align} $$ 我們可以重複利用以前計算過的數值以減少運算量 ```c static long long fib_sequence(long long n) { if (n == 0) return 0; else if (n <= 2) return 1; long long int k = n >> 1; if (n % 2) { return fib_sequence(k) * fib_sequence(k) + fib_sequence(k + 1) * fib_sequence(k + 1); } else { return fib_sequence(k) * (2 * fib_sequence(k + 1) - fib_sequence(k)); } } ``` 但是此方法是屬於 top down 的方式，遞迴的過程中，有些數值被重複計算了，以下圖來看 F3 被算了兩次。當 n 值很大時，會浪費很多運算資源。 ![](https://i.imgur.com/mnC55Ib.png) 因此我們可以用 bottom up 的方式重新改寫。 1. 首先找到 k 去除 leading 0's 剩下幾個 bit （可使用硬體加速指令 clz) 2. 從最高位的 bit 開始往下計算假設 n 為目前涵蓋到的 bits ，k = floor(n / 2) = n >> 1 - n 是奇數： F(n) = F(2k + 1), F(n + 1) = F(2k + 2) = F(2k) + F(2k+1) - n 是偶數： F(n) = F(2k), F(n + 1) = F(2k+1) Example : 1100 | n | F(n) | F(n+1) | | -------- | --------- |:--------------- | | ==1==100 | F(2k + 1) | F(2k) + F(2k+1) | | ==11==00 | F(2k + 1) | F(2k) + F(2k+1) | | ==110==0 | F(2k) | F(2k + 1) | | ==1100== | F(2k) | F(2k + 1) | 改寫後的程式碼參考 [fast doubling](https://chunminchang.github.io/blog/post/calculating-fibonacci-numbers-by-fast-doubling) ```c static long long fib_sequence(long long n) { int h = 64 - __builtin_clzll(n); long long a = 0; // F(n) = 0 long long b = 1; // F(n + 1) = 1 for (long long mask = 1 << (h - 1); mask; mask >>= 1) { // k = floor((n >> j) / 2) // then a = F(k), b = F(k + 1) now. long long c = a * (2 * b - a); // F(2k) = F(k) * [ 2 * F(k + 1) – F(k) ] long long d = a * a + b * b; // F(2k + 1) = F(k)^2 + F(k + 1)^2 if (mask & n) { a = d; b = c + d; } else { a = c; b = d; } } return a; } ``` --- ### 效能測試使用 write 將參數 size 傳入 kernel space ，決定將要測試哪種實作。使用 ktime() 測量在 kernel space 所花費時間。 ```c static ssize_t fib_write(struct file *file, const char *buf, size_t size, loff_t *offset) { if (size > 2) return 1; ktime_t kt = ktime_get(); fibs[size](*offset); kt = ktime_sub(ktime_get(), kt); return (ssize_t)(ktime_to_ns(kt)); } ``` 原始版 fib_sequence ![](https://i.imgur.com/fz6w27G.png) 加上 fast doubling 做比較 ![](https://i.imgur.com/yRDpRly.png) ### 計算 system call 所花費時間在 user_space 使用 clock_gettime() 計算花費時間，再和 ktime() 從 write() 回傳的時間相減，即可得到 system call 花費的時間。 ```c struct timespec tt1, tt2; for (int i = 0; i <= offset; i++) { lseek(fd, i, SEEK_SET); clock_gettime(CLOCK_REALTIME, &tt1); sz = write(fd, write_buf, atoi(argv[1])); clock_gettime(CLOCK_REALTIME, &tt2); long user_time = tt2.tv_nsec - tt1.tv_nsec; long diff = user_time - sz; printf("%d %lld %ld, %ld\n", i, sz, user_time, diff); } ``` ![](https://i.imgur.com/1iPd4TM.png) ### 用統計手法去除極端值使用老師示範的 python script ```python def outlier_filter(datas, threshold = 2): datas = np.array(datas) z = np.abs((datas - datas.mean()) / datas.std()) return datas[z < threshold] ``` 處理計算每個 Fibonacci number 的時間，最後返回處理好資料 (去除 outlier 再取平均) ```python def data_processing(data_set, n): catgories = data_set[0].shape[0] samples = data_set[0].shape[1] final = np.zeros((catgories, samples)) for c in range(catgories): for s in range(samples): final[c][s] = \ outlier_filter([data_set[i][c][s] for i in range(n)]).mean() return final ``` 得到平滑的直線！ ![](https://i.imgur.com/mTQD60q.png) --- ## 計算 F93 (包含) 之後的 Fibonacci 數 ### 使用 unsigned integer 組成的 Linked list 參考 [freshLiver](https://hackmd.io/@freshLiver/2022q1-hw3-fibdrv#%E6%9B%B4%E7%B2%BE%E7%A2%BA%E7%9A%84-Fibonacci-%E9%81%8B%E7%AE%97%E5%AF%A6%E4%BD%9C) 的實作方式，覺得使用 linked list 來實作真是個好方法，不但免去了使用 char array 時重新分配時複製記憶體時的 overhead ，走訪起來也更方便。首先自定義一個 struct，還有每個 node 的上界，和最大的位數上限。將上界定在 10^18 ，超過時進位到下一個 node ，如此可以保證每個 node 不會 overflow。 ```c #define digits 18 // max digits per big_num node #define bound 1000000000000000000ULL typedef struct big_num { unsigned long long num; struct list_head head; } big_num; ``` 進位的位置在 list 末端時需要新增節點。 ```c void new_node(struct list_head *list, unsigned long long num) { big_num *temp = kmalloc(sizeof(big_num), GFP_KERNEL); temp->num = num; list_add_tail(&temp->head, list); len++; } ``` 相加兩個 list 的函式。想法是當較小的數距離上界的空間不足以加入另一個數時就需要進位，參考老師的 `addBigN` 函式。 ```c void add_big_num(struct list_head *small, struct list_head *large) // large add to small { struct list_head **ptr1 = &small->next, **ptr2 = &large->next; for (int carry = 0;; ptr1 = &(*ptr1)->next, ptr2 = &(*ptr2)->next) { if (*ptr1 == small && *ptr2 == large) { if (carry) new_node(small, 1); break; } else if (*ptr1 == small && *ptr2 != large) new_node(small, 0); big_num *sn = list_entry(*ptr1, big_num, head); big_num *ln = list_entry(*ptr2, big_num, head); if (ln->num + carry >= bound - sn->num) { sn->num = sn->num + ln->num + carry - bound; carry = 1; } else { sn->num = sn->num + ln->num + carry; carry = 0; } } } ``` 計算完成後將 list 的內容轉成字串 ```c char *list_to_string(struct list_head *list) { char *s = kmalloc(digits * len + 1, GFP_KERNEL); big_num *node; list_reverse(list); list_for_each_entry (node, list, head) { char buf[digits + 1]; snprintf(buf, digits + 1, "%018llu", node->num); strncat(s, buf, digits); } return s; } ``` Fib 實作的 linked list 版本，其實就是將一開始的 fib_sequence 函式中的陣列元素改為 `struct *list_head` ```c struct list_head *fib_big(long long k) { struct list_head *f[2]; len = 1; for (int i = 0; i < 2; i++) { f[i] = kmalloc(sizeof(struct list_head), GFP_KERNEL); INIT_LIST_HEAD(f[i]); new_node(f[i], i); } for (int i = 2; i <= k; i++) { add_big_num(f[(i & 1)], f[((i - 1) & 1)]); } return f[(k & 1)]; } ``` 最後透過 copy_to_user 將字串複製到 user space 傳入的 buffer。 ```c static ssize_t fib_read(struct file *file, char *buf, size_t size, loff_t *offset) { struct list_head *n = fib_big(*offset); char *s = list_to_string(n); char *temp = s; while (*temp == '0' && *(temp + 1) != '\0') temp++; unsigned long ret = copy_to_user(buf, temp, strlen(temp) + 1); kfree(s); return ret; } ``` ### 效能測試可以看到計算到 n 為 88 時因為 kmalloc 而突然竄起的時間，可見動態分配記憶體的 cost 非常高。 ![](https://i.imgur.com/uXXfbM7.png) --- ## 觀察 Linux 核心模組中的 mutex Fibdrv 中的 mutex 使用在 open 和 release 的部份，也就是一次只有一個 thread 能夠打開 Character Device 並進行讀寫的動作。 ```c static int fib_open(struct inode *inode, struct file *file) { if (!mutex_trylock(&fib_mutex)) { printk(KERN_ALERT "fibdrv is in use"); return -EBUSY; } return 0; } static int fib_release(struct inode *inode, struct file *file) { mutex_unlock(&fib_mutex); return 0; } ``` 於是嘗試將 critical section 移除，看看會發生什麼事 ```c static int fib_open(struct inode *inode, struct file *file) { /* if (!mutex_trylock(&fib_mutex)) { printk(KERN_ALERT "fibdrv is in use"); return -EBUSY; } */ return 0; } static int fib_release(struct inode *inode, struct file *file) { //mutex_unlock(&fib_mutex); return 0; } ``` 撰寫簡單的 multithread 程式來測試。 Thread 1 和 thread 2 都同時打開 Device ，讀取 fib(0) ~ fib(49) ，並且為了故意營造 race condition 而使用延遲讓兩個 thread 交替執行。 ```c static int fd; void *job(void* arg){ char buf[100]; int id = *((int *) arg); for (int i = 0; i <= 50; i++) { lseek(fd, i, SEEK_SET); sleep(id); read(fd, buf, 0); printf("thread %d read fib(%d): %s\n", id, i, buf); } } int main(){ fd = open("/dev/fibonacci", O_RDWR); if (fd < 0) { perror("Failed to open character device"); exit(1); } pthread_t test[2]; int pid1 = 1; pthread_create(&test[0], NULL, job, (void *), &pid1); int pid2 = 2; pthread_create(&test[1], NULL, job, (void *), &pid2); for(int i = 0; i < 2; i++) pthread_join(test[i], NULL); close(fd); return 0; } ``` 前幾項就發生錯誤了，thread 2 讀取 fib(2) 的值為 2。 ``` thread 1 read fib(0): 0 thread 2 read fib(0): 1 thread 1 read fib(1): 1 thread 1 read fib(2): 1 thread 2 read fib(1): 2 thread 1 read fib(3): 1 thread 1 read fib(4): 3 thread 2 read fib(2): 5 thread 1 read fib(5): 2 thread 1 read fib(6): 8 ``` 原因是兩個 thread 都透過 lseek 不斷修改 file offset ，當兩個 thread 都同時打開 device ，其中一個 thread 剛設定好的 offest 在透過 read() 傳入 kernel space 讀取之前就被另一個 thread 改掉了，從而回傳錯誤的結果。因此我們才需要在 device 被 open() 時將 mutex 上鎖，而 release() 時將 mutex 解鎖，保持同時只有一個 thread 能打開裝置。 --- ## 縮減大數運算的成本研讀 [KYG-yaya573142 的報告](https://hackmd.io/@KYWeng/rkGdultSU)，我發現效能躍進最大的部份是在引入 [memory pool](https://hackmd.io/BZjwYw1FQ1O0NPY5kIS7VQ?view#%E6%94%B9%E5%96%84%E6%96%B9%E6%A1%88-3---%E5%BC%95%E5%85%A5-memory-pool-%E7%9A%84%E6%A6%82%E5%BF%B5) 的概念時，也就是想辦法減少動態分配記憶體的次數。在我做的[效能測試](https://hackmd.io/cxmcJtWDTK-ujDQK3fiNZg?both#%E6%95%88%E8%83%BD%E6%B8%AC%E8%A9%A61)也可以看到 overhead 突然飆升的點就是在進位到下一個 node 時呼叫 kmalloc，可見最大的效能瓶頸的確和記憶體的分配息息相關。 --- ## `lsmod` 中的 Used by 欄位實作使用 strace 觀察 system call 的過程。 ```shell strace -o trace lsmod && grep fibdrv trace ``` 輸出之中有一行 `openat(AT_FDCWD, "/sys/module/fibdrv/refcnt", O_RDONLY|O_CLOEXEC) = 3` 看來 lsmod 是直接讀取 `/sys/module/fibdrv/refcnt` 這個檔案中的紀錄。 ### 閱讀 [Driver Basics](https://www.kernel.org/doc/html/v5.8/driver-api/basics.html#reference-counting) 實際上 reference count 是紀錄在 `refcount_struct` 結構體。並且透過 `refcount_add`, `refcount_set` 等等的 api 來設置。 ```c struct refcount_struct { atomic_t refs; }; ``` --- ## 使用 char array 實做針對大數的 add sub mul 操作首先自定義一個大數結構體 ```c typedef struct big_num2 { unsigned char *num; size_t len; // strlen } bn; ``` ### Addition ```c bn *bn_add(bn *na, bn *nb) { /* * Make sure the string length of 'a' is always greater than * the one of 'b'. */ char *a = na->num, *b = nb->num; size_t len_a = na->len, len_b = nb->len; size_t len = _max(len_a, len_b); bn *buf = bn_alloc(len + 2); int i, carry = 0; for (i = 0; i < len; i++) { int n1 = (i < len_a) ? (a[i] - '0') : 0; int n2 = (i < len_b) ? (b[i] - '0') : 0; int sum = n1 + n2 + carry; buf->num[i] = '0' + sum % 10; carry = sum / 10; } if (carry) buf->num[i++] = '0' + carry; buf->num[i] = 0; buf->len = i; return buf; } ``` ### Subtraction ```c // a must > b bn *bn_sub(bn *na, bn *nb) { /* * Make sure the string length of 'a' is always greater than * the one of 'b'. */ char *a = na->num, *b = nb->num; size_t len_a = na->len, len_b = nb->len; if (len_a < len_b) { _swap(a, b); _swap(len_a, len_b); } bn *buf = bn_alloc(len_a + 1); int i, carry = 0, borrow = 10; for (i = 0; i < len_a; i++) { int n1 = (i < len_a) ? (a[i] - '0') : 0; int n2 = (i < len_b) ? (b[i] - '0') : 0; carry = n1 - n2 - carry; if (carry < 0) { buf->num[i] = carry + borrow + '0'; carry = 1; } else { buf->num[i] = carry + '0'; carry = 0; } } buf->len = len_a; return buf; } ``` ### Multiplication ```c // s += num start from s[offset] void bn_mul_add(bn *n, int offset, int num) { char *s = n->num; int carry = 0; for (int i = offset;; i++) { int sum = num % 10 + (s[i] ? s[i] - '0' : 0) + carry; s[i] = '0' + sum % 10; carry = sum / 10; num /= 10; if (!num && !carry) { // done n->len = i + 1; return; } } } // O(M * N) long multiplication bn *bn_mul(bn *na, bn *nb) { size_t len_a = na->len, len_b = nb->len; char *a = na->num, *b = nb->num; bn *buf = bn_alloc(len_a + len_b + 1); int prod, n1, n2; for (int i = 0; i < len_a; i++) { for (int j = 0; j < len_b; j++) { n1 = a[i] - '0'; n2 = b[j] - '0'; prod = n1 * n2; bn_mul_add(buf, i + j, prod); } } return buf; } ``` ### Fast doubling ```c bn *fast_doubling(long long n) { int msb = 64 - __builtin_clzll(n); bn *a = bn_alloc(2); bn *b = bn_alloc(2); a->num[0] = '0'; // F(n) = 0 b->num[0] = '1'; // F(n + 1) = 1 bn *n2 = bn_alloc(2); n2->num[0] = '2'; for (long long mask = 1 << (msb - 1); mask; mask >>= 1) { // k = floor((n >> j) / 2) // then a = F(k), b = F(k + 1) now. // F(2k) = F(k) * [ 2 * F(k + 1) – F(k) ] bn *tmp1 = bn_mul(n2, b), *tmp2 = bn_sub(tmp1, a); bn *c = bn_mul(a, tmp2); freebn(tmp1); freebn(tmp2); // F(2k + 1) = F(k)^2 + F(k + 1)^2 tmp1 = bn_mul(a, a); tmp2 = bn_mul(b, b); bn *d = bn_add(tmp1, tmp2); freebn(tmp1); freebn(tmp2); // char *c = bn_mul(a, bn_sub(bn_mul("2", b), a)); // char *d = bn_add(bn_mul(a, a), bn_mul(b, b)); /* * n >> j is odd: n >> j = 2k + 1 * F(n >> j) = F(2k + 1) * F(n >> j + 1) = F(2k + 2) = F(2k) + F(2k + 1) * * n >> j is even: n >> j = 2k * F(n >> j) = F(2k) * F(n >> j + 1) = F(2k + 1) */ freebn(a); freebn(b); if (mask & n) { a = d; b = bn_add(c, d); freebn(c); } else { a = c; b = d; } } freebn(b); return a; } ``` --- ## 整數編碼一個 character 佔用一個 byte ，但是表達一個 0～9 的整數只需要 4 bits ，所以我打算使用 BCD encoding 減少 copy to user 的 cost。如此一來需要傳送的字串大小變成原本的一半。 ```c bn *bcd_encode(bn *ans) { size_t sz = ans->len / 2 + (ans->len & 1) + 1; bn *bin = bn_alloc(sz); for (int i = 0; i < bin->len; i++) { int offset = i * 2; bin->num[i] = (ans->num[offset] - '0') | ((ans->num[offset + 1] - '0') << 4); } return bin; } ``` 在 client 端進行解碼 ```c char *bin_to_str(unsigned char *bin, size_t sz) { int len = 2 * sz; char *str = malloc(len + 1); int i; for (i = 0; i < len / 2; i++) { char mask_low = 0x0F; str[i * 2] = (bin[i] & mask_low) + '0'; str[i * 2 + 1] = (bin[i] >> 4) + '0'; } str[i * 2] = 0; if (str[i * 2 - 1] == '0') str[i * 2 - 1] = 0; return str; } ```