2020q1 Homework2 (fibdrv)

# 2020q1 Homework2 (fibdrv) Contributed by < [AdrianHuang](https://github.com/AdrianHuang/fibdrv) > ###### tags: `linux2020` ## 測試環境 ```shell $ cat /etc/os-release NAME="Ubuntu" VERSION="18.04.4 LTS (Bionic Beaver)" $ cat /proc/version Linux version 5.3.0-42-generic (buildd@lcy01-amd64-019) (gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #34~18.04.1-Ubuntu SMP Fri Feb 28 13:42:26 UTC 2020 $ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 72 On-line CPU(s) list: 0-71 Thread(s) per core: 2 Core(s) per socket: 18 Socket(s): 2 NUMA node(s): 2 Vendor ID: GenuineIntel CPU family: 6 Model: 79 Model name: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz Stepping: 1 CPU MHz: 2102.602 CPU max MHz: 2100.0000 CPU min MHz: 1200.0000 BogoMIPS: 4199.77 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 46080K NUMA node0 CPU(s): 0-17,36-53 NUMA node1 CPU(s): 18-35,54-71 $ free -h total used free shared buff/cache available Mem: 125G 1.8G 119G 19M 4.0G 122G Swap: 8.0G 0B 8.0G $ dmesg | grep "Command line" [ 0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.3.0-46-generic root=UUID= dafd4d3f-6b71-4d52-8f95-e387d58b7f5b ro isolcpus=0 maybe-ubiquity --> 確認 CPU0 不被核心排程器調度使用 ``` ## 計算 F~93~ (包含) 之後的 Fibonacci 數 - 使用數字字串並套用 quiz2 SSO (Small String Optimization) 因 F~93~ 之後的運算會發生 overflow，導致無法正確地計算結果。可以使用底下方法計算 big number: * 使用 GCC __int128 型態，或者自行定義的結構: 因考量到輸出時，只能輸出 16 進制 (10 進制輸出比較麻煩，需額外程式碼做轉換)，所以不採用此方式 (已看到其他同學說明: 輸出 16 進制後，再透過 python 轉換成 10 進制)。 ```cpp struct BigN { unsigned long long lower, upper; }; ``` * 使用數字字串做運算，並套用 [quiz2 SSO (Small String Optimization)](https://hackmd.io/@sysprog/linux2020-quiz2)。[實作程式碼](https://github.com/AdrianHuang/fibdrv/tree/big_number)。底下為數字字串加法實作，細節如下: * 確保 `a` 字串長度大於 `b` 字串 * 將 `a` 與 `b` 字串反轉 * 逐字對數字字元做加法運算 * 將得出字串反轉，即得出最終結果 ```cpp static void string_number_add(xs *a, xs *b, xs *out) { char *data_a, *data_b; size_t size_a, size_b; int i, carry = 0; int sum; /* * Make sure the string length of 'a' is always greater than * the one of 'b'. */ if (xs_size(a) < xs_size(b)) __swap((void *) &a, (void *) &b, sizeof(void *)); data_a = xs_data(a); data_b = xs_data(b); size_a = xs_size(a); size_b = xs_size(b); reverse_str(data_a, size_a); reverse_str(data_b, size_b); char buf[size_a + 2]; for (i = 0; i < size_b; i++) { sum = (data_a[i] - '0') + (data_b[i] - '0') + carry; buf[i] = '0' + sum % 10; carry = sum / 10; } for (i = size_b; i < size_a; i++) { sum = (data_a[i] - '0') + carry; buf[i] = '0' + sum % 10; carry = sum / 10; } if (carry) buf[i++] = '0' + carry; buf[i] = 0; reverse_str(buf, i); /* Restore the original string */ reverse_str(data_a, size_a); reverse_str(data_b, size_b); if (out) *out = *xs_tmp(buf); } ``` 確認計算到F~500~ [(The first 500 Fibonacci numbers )](https://blog.abelotech.com/posts/first-500-fibonacci-numbers/) 也是正確的，結果如下: ```shell $ sudo ./client ... Reading from /dev/fibonacci at offset 499, returned the sequence 86168291600238450732788312165664788095941068326060883324529903470149056115823592713458328176574447204501. Reading from /dev/fibonacci at offset 500, returned the sequence 139423224561697880139724382870407283950070256587697307264108962948325571622863290691557658876222521294125. ... ``` ## 測量不同 Fibonacci 演算法所需執行時間如作業提示，有三種不同演算法: * f~n~ = f~n~ ~-~ ~1~ + f~n~ ~-~ ~2~: 程式碼如下 ```cpp static long long fib_sequence_orig(long long k) { /* FIXME: use clz/ctz and fast algorithms to speed up */ long long f[k + 2]; f[0] = 0; f[1] = 1; for (int i = 2; i <= k; i++) { f[i] = f[i - 1] + f[i - 2]; } return f[k]; } ``` * fast doubling ```cpp static long long fib_sequence_fast_dobuling(long long k) { long long a = 0, b = 1; if (!k) return 0; for (int i = 31; i >= 0; i--) { long long t1, t2; t1 = a * (b * 2 - a); t2 = b * b + a * a; a = t1; b = t2; if (k & (1ULL << i)) { t1 = a + b; a = b; b = t1; } } return a; } ``` * fast doubling with clz: 考慮到硬體加速，可使用 clz/fls 計算出 `k` 值的真正有效位數 (去除 leading 0s)，程式碼如下: ```cpp= static long long fib_sequence_fast_dobuling_clz(long long k) { long long a = 0, b = 1; if (!k) return 0; for (int i = ilog2(k); i >= 0; i--) { long long t1, t2; t1 = a * (b * 2 - a); t2 = b * b + a * a; a = t1; b = t2; if (k & (1ULL << i)) { t1 = a + b; a = b; b = t1; } } return a; } ``` 使用 /sys 介面 (/sys/kernel/fibonacci/fib_algo) 切換不同演算法，以便快速量測數據。下圖為其量測結果。由此可知，使用 clz/fls 硬體加速可提升效能。執行底下腳本，可產生對應圖檔。 ```shell $ taskset 1 sudo ./profiling/fib_algo/fib-algo.sh ``` ![](https://i.imgur.com/MV4yOlh.png) ## 複習 C 語言數值系統和 bitwise operation，思考 Fibonacci 數快速計算演算法的實作中如何減少乘法運算的成本使用 clz/fls、bitwise 操作與加法，實作整數乘法 (不需乘法操作)。其對應程式碼如下所示: ```cpp static inline long long multiply(long long a, long long b) { long long res = 0; /* * 1. Find the MSB (Most Significant Bit) position of 'b'. * 2. Accumulate the result with (1 << MSB position). * 3. Clear the MSB position. * * FIXME: Implement the negative number multiplication. */ while (b) { int msb_pos; msb_pos = ilog2(b); res += (a << msb_pos); /* Clear MSB */ b &= ~(1ULL << msb_pos); } return res; } static long long fib_sequence_fast_doubling_clz_no_multiply(long long k) { long long a = 0, b = 1; if (!k) return 0; for (int i = ilog2(k); i >= 0; i--) { long long t1, t2; t1 = multiply(a, (b << 1) - a); t2 = multiply(b, b) + multiply(a, a); a = t1; b = t2; if (k & (1ULL << i)) { t1 = a + b; a = b; b = t1; } } return a; } ``` 量測結果如下。該結果顯示，使用該方法 (fast doubling)，反而需要更長的執行時間。執行底下腳本，可產生對應圖檔。 ```shell $ taskset 1 sudo ./profiling/fib_algo_without_mulitply/fib-algo.sh ``` ![](https://i.imgur.com/XzDEtpm.png) 使用 objdump 觀察 fib_sequence_fast_doubling_clz() 對應的乘法指令 `imul`，對照 Intel Software Optimization Reference Manual 文件: [Intel® Xeon® Scalable Processor Throughput and Latency](https://software.intel.com/en-us/download/intel-xeon-scalable-processor-throughput-and-latency)，該文件說明 `imul rax rcx` 只需花費 3 個 CPU clock cycles (在 Intel Xeon 處理器，乘法指令只需花費 3-8 CPU clock cycles)。`multiply()` 程式碼 `b` 的位元有效位越多的話，則會花更多的時間運算。所以，使用 clz/fls、bitwise 操作與加法實作整數乘法 (不需乘法操作)，反而需要花費更多的運算。 ```shell 0000000000000320 <fib_sequence_fast_doubling_clz>: 320: e8 00 00 00 00 callq 325 <fib_sequence_fast_doubling_clz+0x5> 325: 31 c0 xor %eax,%eax 327: 48 85 ff test %rdi,%rdi 32a: 74 49 je 375 <fib_sequence_fast_doubling_clz+0x55> 32c: 55 push %rbp 32d: b9 ff ff ff ff mov $0xffffffff,%ecx 332: 48 0f bd cf bsr %rdi,%rcx 336: 85 c9 test %ecx,%ecx 338: 48 89 e5 mov %rsp,%rbp 33b: 78 36 js 373 <fib_sequence_fast_doubling_clz+0x53> 33d: ba 01 00 00 00 mov $0x1,%edx 342: 49 89 c0 mov %rax,%r8 345: 48 8d 34 12 lea (%rdx,%rdx,1),%rsi 349: 4c 0f af c0 imul %rax,%r8 34d: 48 0f af d2 imul %rdx,%rdx 351: 48 29 c6 sub %rax,%rsi 354: 48 0f af c6 imul %rsi,%rax 358: 4c 01 c2 add %r8,%rdx 35b: 48 0f a3 cf bt %rcx,%rdi ``` ![](https://i.imgur.com/aalWVRj.jpg) > $\Uparrow$ **imul 指令所需 CPU clock cycles** ## 使用 gnuplot 製圖，分析 Fibonacci 數列在核心計算和傳遞到 userspace 的時間開銷下圖為 kernel space 執行 Fibonacci 時間開銷、userspace <-> kernel space 時間開銷與 user space 執行總開銷。執行底下腳本，即可產生下圖 (檔名: profiling/user_kern/fib_user_kernel.png)。 ```shell $ taskset 1 sudo ./profiling/user_kern/client.sh ``` ![](https://i.imgur.com/F5ok7Qh.png)