2021q1 Homework3 (fibdrv)

--- ###### tags: `sysprog2021q1` --- # 2021q1 Homework3 (fibdrv) contributed by < `93i7xo2` > > Source: [J06: fibdrv](https://hackmd.io/@sysprog/linux2021-fibdrv) ## 實驗環境 - Intel i5-6200U, L1/L2/L3: 64KB/512KB/3MB - Qemu 4.2.1 - Kernel: 5.4.0-72-generic ## bignum 的 Fast doubling 技巧追蹤 fibdrv 分支 - [bignum](https://github.com/sysprog21/bignum)。其中最複雜的邏輯應該是將數個位元組轉換至不同進制的字串輸出 `apm_get_str`，有了這個函式，數值就能以位元組而非字串形式儲存。 ### `fibonacci.c` 要了解 `fibonacci.c` 的 Fast-doubling 實作方式，從一開始得到的關係式開始： $$ \begin{split} F(2k) &= F(k)[2F(k+1) - F(k)] \\ F(2k+1) &= F(k+1)^2+F(k)^2 \end{split} $$ 根據 Fibonacci 數列原始定義 `F(0) = 0, F(1) = 1, F(2) = 1`，對應的虛擬碼如下: ```= Fast_Fib(n) a = 0; b = 1; // m = 0 for i = (number of binary digit in n) to 1 t1 = a*(2*b - a); t2 = b^2 + a^2; a = t1; b = t2; // m *= 2 if (current binary digit == 1) t1 = a + b; // m++ a = b; b = t1; return a; ``` 可分成 4-6、7-9 行兩部份，前者作用為 $F(n)\rightarrow F(2n)$，後者為 $F(n)\rightarrow F(n+1)$。第一次迴圈由於一開始 `a = 0; b = 1;` 的關係，第 4-5 行沒有作用，結果會是 `a = 1; b = 1;`，最後返回 `a`，相當於從 $F_1$ 開始進行 Fast-doubling。而 `bignum/fibonacci.c` 實作不同於上方虛擬碼，少了減法，好處是擴展成大數運算只需要實作加法部份。如果從整理前的關係式開始推導： $$ \begin{split} \begin{bmatrix} F_{n+1} \\ F_{n} \end{bmatrix} &= \begin{bmatrix} 1 & 1 \\ 1 & 0 \end{bmatrix}^n \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \\ &= \begin{bmatrix} F_{n+1} & F_{n} \\ F_{n} & F_{n-1} \end{bmatrix} \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \end{split} $$ $$ \begin{split} \begin{bmatrix} F_{m+n+1} \\ F_{m+n} \end{bmatrix} &= \begin{bmatrix} 1 & 1 \\ 1 & 0 \end{bmatrix}^{m+n} \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \\ &= \begin{bmatrix} F_{m+1} & F_{m} \\ F_{m} & F_{m-1} \end{bmatrix} \begin{bmatrix} F_{n+1} & F_{n} \\ F_{n} & F_{n-1} \end{bmatrix} \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \end{split} $$ 以 $m=n, n=n-1$ 代入 $$ \begin{split} \begin{bmatrix} F_{2n} \\ F_{2n-1} \end{bmatrix} &= \begin{bmatrix} 1 & 1 \\ 1 & 0 \end{bmatrix}^{2n-1} \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \\ &= \begin{bmatrix} F_{n+1} & F_{n} \\ F_{n} & 
F_{n-1} \end{bmatrix} \begin{bmatrix} F_{n} & F_{n-1} \\ F_{n-1} & F_{n-2} \end{bmatrix} \begin{bmatrix} F_{1} \\ F_{0} \end{bmatrix} \\ &= \begin{bmatrix} F_{n}(F_{n+1}+F_{n-1}) \\ F_{n}^2 + F_{n-1}^2 \end{bmatrix} \\ &= \begin{bmatrix} F_{n}(2F_{n-1}+F_{n}) \\ F_{n}^2 + F_{n-1}^2 \end{bmatrix} \end{split} $$ 可得關係式： $$ \begin{split} F(2n) &= F(n)[2F(n-1) + F(n)] \\ F(2n-1) &= F(n)^2+F(n-1)^2 \end{split} $$ 對應的實作如下： ```cpp=28 bn *a1 = fib; /* Use output param fib as a1 */ bn_t a0, tmp, a; bn_init_u32(a0, 0); /* a0 = 0 */ bn_set_u32(a1, 1); /* a1 = 1 */ bn_init(tmp); /* tmp = 0 */ bn_init(a); /* Start at second-highest bit set. */ for (uint64_t k = ((uint64_t) 1) << (62 - __builtin_clzll(n)); k; k >>= 1) { /* Both ways use two squares, two adds, one multipy and one shift. */ bn_lshift(a0, 1, a); /* a03 = a0 * 2 */ bn_add(a, a1, a); /* ... + a1 */ bn_sqr(a1, tmp); /* tmp = a1^2 */ bn_sqr(a0, a0); /* a0 = a0 * a0 */ bn_add(a0, tmp, a0); /* ... + a1 * a1 */ bn_mul(a1, a, a1); /* a1 = a1 * a */ if (k & n) { bn_swap(a1, a0); /* a1 <-> a0 */ bn_add(a0, a1, a1); /* a1 += a0 */ } } /* Now a1 (alias of output parameter fib) = F[n] */ ``` 由於一開始的返回值 `a1` 已是 $F_1$，迴圈數相較於先前的虛擬碼少一。 ```diff - uint64_t k = ((uint64_t) 1) << (63 - __builtin_clzll(n)) + uint64_t k = ((uint64_t) 1) << (62 - __builtin_clzll(n)) ``` 接著實作在 `fibdrv.c` ```cpp static long long fib_sequence_fast_doubling(long long k) { if (unlikely(k <= 2)) { return k ? 
1LL : 0LL; } long long f0 = 0, f1 = 1, tmp; for (uint64_t n = ((uint64_t) 1) << (62 - __builtin_clzll(k)); n; n >>= 1) { tmp = f1 * f1 + f0 * f0; f1 = f1 * ((f0 << 1) + f1); f0 = tmp; if (k & n) { __swap(f0, f1); f1 += f0; } } return f1; } ``` 運作時，透過 sysfs interface 切換 fib_sequence 對應的實作 (參考 [bakudr18](https://hackmd.io/@MR-9Qa9wQLWglSAUyd6UhQ/HJ363p_Qd) `fib_exec` 的作法) ```cpp static ssize_t m_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int ret, input; ret = kstrtoint(buf, 10, &input); if (ret < 0) return ret; switch (input) { case 0: fib_sequence = &fib_sequence_original; break; case 1: fib_sequence = &fib_sequence_fast_doubling; break; } return count; } static struct kobj_attribute kmode_attribute = __ATTR(mode, 0664, m_show, m_store); ``` ## 量測方法為了測量 user space 及 kernel space 所花費的時間，分別於： 1. `client.c` 加入 `clock_gettime` 量測在 user space 經過的時間 ```cpp clock_gettime(CLOCK_ID, &start); sz = read(fd, buf, 1); clock_gettime(CLOCK_ID, &end); ``` 2. kernel space 以 `ktime` 紀錄計算時間，並新增 `/sys/kernel/fibdrv/time` 用以回傳時間 ```cpp /* fibdrv.c */ kt = ktime_get(); fibnum = fib_sequence(*offset); kt = ktime_sub(ktime_get(), kt); static ssize_t k_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return scnprintf(buf, PAGE_SIZE, "%lld\n", ktime_to_ns(kt)); } ``` > [snprintf(3)](https://linux.die.net/man/3/snprintf) > If the output was truncated due to this limit then the return value is the number of characters (excluding the terminating null byte) which would have been written to the final string if enough space had been available. Thus, a return value of size or more means that the output was truncated. 
[KYWeng](https://hackmd.io/@KYWeng/rkGdultSU) 建議使用 `scnprintf` 取代 `snprintf`。簡單的說，`snprintf` 返回值不一定是實際寫入的字元數，而是預期寫入的字元數，在不超過 buffer 大小情況下沒問題，在靠近邊界使用容易造成誤用，即使在 [btrfs](https://lore.kernel.org/linux-btrfs/s5hfte2r6gc.wl-tiwai@suse.de/) commit 還是可見這樣的討論。 ```cpp /* client.c */ #define FIB_KTIME "/sys/kernel/fibdrv/time" long long get_ktime(){ long long t = -1; FILE *kptr = fopen(FIB_KTIME, "r"); fscanf(kptr, "%lld", &t); fclose(kptr); return t; } ... long long utime = (double)(end.tv_sec - start.tv_sec) * ONE_SEC + (end.tv_nsec - start.tv_nsec); long long ktime = get_ktime(); fprintf(fptr, "%d %lld %lld\n", i, utime, ktime); ``` 用 utime - ktime 就能得到 kernel to user 和 user to kernel 的時間。 3. `ktime` 也是透過 sysfs interface 存取 ```cpp /* * The "time" file where total number of CPU-nanoseconds used by * "fib_sequence()" is read from. */ static ssize_t k_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return scnprintf(buf, PAGE_SIZE, "%lld\n", ktime_to_ns(kt)); } ``` 測試兩種實作效能差異，實驗數據已使用下方統計方法去除極值，但不排除其他干擾效能分析的因素。能看到 fast doubling 效能上的優勢，在計算越後面的項差異越明顯。 ![](https://i.imgur.com/8dZyZor.png) ## 統計方法若對母體平均數 $\mu$ 進行區間估計。在母體標準差 $\sigma$ 已知的情況下，樣本平均 $\bar X\sim N(\mu,\dfrac{\sigma^2}{n})$，以區間形式來估計母體平均數 $\mu$，特定信心水準的信賴區間為： - $\bar X\pm z\dfrac{\sigma}{\sqrt{n}}$ 而在 $\sigma$ 未知的情況下用樣本標準差 $s$ 代替，此時應該改用 t-distribution，也就是： - $\bar X\pm t\dfrac{s}{\sqrt{n}}$ 若以 95% 信賴區間作為去除離群值的方法，來求樣本的 [trimmed mean](https://en.wikipedia.org/wiki/Truncated_mean)，參考 [4ce43c4](https://github.com/colinyoyo26/fibdrv/commit/4ce43c4e7ee0b80c4ce9e6dcd995bbfdf239206c) 改寫，原本保留 $\pm 2s$ 內數據的作法應該改為下方： ```python runs=1000 def filter(df): ret = np.zeros(df.shape[1]) t = stats.t(df=(runs-1)).ppf((0.025, 0.975)) upper_bound = df.mean(0) + t[1]*df.std(0)/(runs**0.5) lower_bound = df.mean(0) + t[0]*df.std(0)/(runs**0.5) for column in df.columns: s = df[column] s = s[s <= upper_bound[column]] s = s[s >= lower_bound[column]] ret[column] = s.mean() return ret ``` > 目前尚未找到使用 C.I.
作為 trimmed mean 的實例實際做圖，細看 fibonacci 序列較小的數值會發現有些斷點，這是因為數值小運算快，量測精度不夠的情況下即使測量次數夠多，數據不會呈現理想的常態分佈，例如 `[20, 20, 20, ..., 21, 21]`，去除離群值後只剩下空陣列無法取得平均，解法是實作大數運算後拉大 fibonacci 序列範圍，縮小圖上的斷點。 - [測試程式碼](https://github.com/93i7xo2/fibdrv/commit/69c438af71bc14d504dff78f019944a519b0ae14) ```bash ~$ make test2 ``` - 實驗結果 ![](https://i.imgur.com/Pnj5F8S.png) ## 排除干擾效能分析的因素排除干擾的方法參照 [KYG-yaya573142](https://hackmd.io/@KYWeng/rkGdultSU#%E6%BA%96%E5%82%99-ktype-%E6%89%80%E9%9C%80%E6%AC%84%E4%BD%8D-default_attrs) 1. 關閉 ASLR 2. 關閉 Intel Turbo Boost / AMD Turbo Core (非必要) 3. 將程式固定在特定 CPU 上 (e.g. `isolcpus=7`) 4. 調整 `scaling_governor` 固定運作頻率整合上述功能到 Makefile 裡： ```bash CPUID := $(shell nproc --all --ignore 1) ISOLATED_CPU := $(shell cat /sys/devices/system/cpu/isolated) ORIG_ASLR := $(shell cat /proc/sys/kernel/randomize_va_space) ORIG_GOV := $(shell cat /sys/devices/system/cpu/cpu$(CPUID)/cpufreq/scaling_governor) INTEL_BOOST_EXISTS := $(shell [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ] && echo 1 || echo 0 ) BOOST_EXISTS := $(shell [ -e /sys/devices/system/cpu/cpufreq/boost ] && echo 1 || echo 0 ) ifeq ($(INTEL_BOOST_EXISTS), 1) ORIG_TURBO := $(shell cat /sys/devices/system/cpu/intel_pstate/no_turbo) else ifeq ($(BOOST_EXISTS), 1) ORIG_TURBO := $(shell cat /sys/devices/system/cpu/cpufreq/boost) endif ... test2: all ifneq ($(CPUID), $(ISOLATED_CPU)) @echo "Isolated core must be the last of all cores." 
@exit 1 endif $(MAKE) unload $(MAKE) load sudo bash -c "echo 0 > /proc/sys/kernel/randomize_va_space" sudo bash -c "echo performance > /sys/devices/system/cpu/cpu$(CPUID)/cpufreq/scaling_governor" ifeq ($(INTEL_BOOST_EXISTS), 1) sudo bash -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo" else ifeq ($(BOOST_EXISTS), 1) sudo bash -c "echo 0 > /sys/devices/system/cpu/cpufreq/boost" endif @python3 scripts/driver.py sudo bash -c "echo $(ORIG_ASLR) > /proc/sys/kernel/randomize_va_space" sudo bash -c "echo $(ORIG_GOV) > /sys/devices/system/cpu/cpu$(CPUID)/cpufreq/scaling_governor" ifeq ($(INTEL_BOOST_EXISTS), 1) sudo bash -c "echo $(ORIG_TURBO) > /sys/devices/system/cpu/intel_pstate/no_turbo" else ifeq ($(BOOST_EXISTS), 1) sudo bash -c "echo $(ORIG_TURBO) > /sys/devices/system/cpu/cpufreq/boost" endif $(MAKE) unload ``` 並在 `scripts/driver.py` 內以統計方法去除極值。 ## cpython 大數運算 Python 是最廣為人知有實作大數運算的工具，使用了多種方法來增進效能，以下就 Python 的實作 [cpython 3.11](https://github.com/python/cpython) (commit `a0bd9e`) 的 [longobject.c](https://github.com/python/cpython/blob/main/Objects/longobject.c) 進行加乘法運算的 trace。預期目標: - 和 bignum 進行效能比較 - 整合進 fibdrv。 ### VSCode settings 讓 python 產生必須的 header file ```bash git clone https://github.com/python/cpython cd cpython ./configure ``` 執行完上述指令產生 `pyconfig.h`，如果 IntelliSense 沒有識別到該檔，參考 [IntelliSense for cross-compiling](https://code.visualstudio.com/docs/cpp/configure-intellisense-crosscompilation) 設定`c_cpp_properties.json`內的`includePath` ```json { "configurations": [ { "name": "Linux", "includePath": [ "${workspaceFolder}/**", "/usr/src/linux-5.4.0/include", "/usr/src/linux-5.4.0/arch/x86/include" ], "defines": [ "__GNUC__", "__KERNEL__" ], "compilerPath": "/usr/bin/gcc", "cStandard": "c99", "intelliSenseMode": "gcc-x64" } ], "version": 4 } ``` ### Data structure ```cpp struct _longobject { PyObject_VAR_HEAD digit ob_digit[1]; }; ``` - `PyObject_VAR_HEAD`: 儲存 reference count、ob_size 的結構 - `ob_size`: `ob_digit[]` 長度，`<0` 表示 `ob_digit` 的值為負數。 - 
`ob_digit`: 用以儲存無號整數，保留前面 1 位用做緩衝用於運算，若 `digit` 型態為 `uint16_t` 那實際上只有 15 bits 可用。長度為 0 的陣列是 GNU C extension - [Arrays of Length Zero](https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html) 的用法，作為 variable-length object 的 header，在 C90 不支援的情況下以長度為 1 的陣列存在，雖然結構上的大小有變，但使用上並無異。 > In the absence of the zero-length array extension, in ISO C90 the contents array in the example above would typically be declared to have a single element. Unlike a zero-length array which only contributes to the size of the enclosing structure for the purposes of alignment, a one-element array always occupies at least as much space as a single object of the type. **Although using one-element arrays this way is discouraged, GCC handles accesses to trailing one-element array members analogously to zero-length arrays.** 為了避開長度為 1 的陣列造成 `sizeof` 無法取得預期的結構大小，使用 `__builtin_offsetof` ```cpp #define offsetof(t, d) __builtin_offsetof(t, d) result = PyObject_Malloc(offsetof(PyLongObject, ob_digit) + size*sizeof(digit)); ``` ### Karatsuba Multiplication (`k_mul`) cpython 以 [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm) 作為乘法運算的基礎。轉貼 wiki 部份原理：一開始假設 $x, y$ 為 $n$ 位以 $B$ 為基底的數字，$m, n\in \mathbb{N}$ 且 $m<n$，可改寫如下： $\begin{align} x&=x_1B^m+x_0\\ y&=y_1B^m+y_0\\ where\\ &x_0,y_0 \lt B^m \end{align}$ 接著乘積 $xy$ 可表示為 $\begin{align} xy &= z_2B^{2m}+z_1B^{m}+z_0\\ where\\ z_2&=x_1y_1\\ z_1&=x_1y_0+x_0y_1\\ z_0&=x_0y_0 \end{align}$ 以上共 4 次乘法運算，Karatsuba 發現透過將 $z_1$ 改寫成 - $z_1 = (x_1+x_0)(y_1+y_0)-z_2-z_0\ (1)$ 可將乘法運算降低至3次。但 $(x_1+x_0)$ 或是 $(y_1+y_0)$ 都有大於或等於 $B^m$ 的可能性，計算乘積前得先用 $m+1$ 位儲存，於是繼續改寫 - $z_1 = (x_0-x_1)(y_1-y_0)+z_2+z_0\ (2)$ 這樣的好處是 $(x_0-x_1)$ 或是 $(y_1-y_0)$ 均落在 $(-B^m,B^m)$，我們可將 sign 分離出來個別處理，等同先做兩個 $m$ 位的正數乘法再加上 sign，最重要的是結合成 $z_1$ 只剩加法。但 cpython 採取的是式 (1)，採用 $2m$ 位儲存結果，並忽略各項：$(x_1+x_0)(y_1+y_0)$、$z_2$、$z_0$ 加減造成的 overflow/underflow，理由很簡單，加減運算就像在輪盤上轉圈，反正最後出來一定是正確的值。 --- 乘法實作分為 - `k_mul`：使用 Karatsuba，視情況退化成 `x_mul`，也是最先呼叫的乘法函式。 - `x_mul`：小學乘法。上述的 $x, y$ 在實作對應到 $a, b$，位數較小的一方為 $a$。在進行
Karatsuba 計算前有一小段程式用來判斷是否退化成 `x_mul` ```cpp #define KARATSUBA_CUTOFF 70 #define KARATSUBA_SQUARE_CUTOFF KARATSUBA_CUTOFF<<1 ``` ```cpp=3291 /* Use gradeschool math when either number is too small. */ i = a == b ? KARATSUBA_SQUARE_CUTOFF : KARATSUBA_CUTOFF; if (asize <= i) { if (asize == 0) return (PyLongObject *)PyLong_FromLong(0); else return x_mul(a, b); } if (2 * asize <= bsize) return k_lopsided_mul(a, b); ``` 在 `a=b` 或是 `a!=b` 的情況下，`a` 的 `ob_size` 必須分別大於 140/70 才會進行 Karatsuba。假設 `digit` 為 `uint16_t`，`ob_size=71`，符合條件的數最小為 $2^{70*15}=10^{316.08}$，**所以實際上用到的大多是 `x_mul`**。之後就是一系列相關的函式 - `k_lopsided_mul`: 平衡版的 Karatsuba - `kmul_split`: 指定切割位置，將 $a,b$ 拆成低位$(al, bl)$和高位$(ah, bh)$兩數 - `v_iadd`/`v_isub`: 輸入兩數，將另一數相加/相減至另一數上 - `long_normalize`: 以線性從 MSD (Most significant digit) 掃描至 LSD，調整 `ob_size`，但不釋放多餘記憶體，`ob_size` 會大於實際上所需通常是新物件預留給計算結果足大的空間所致。 #### `k_lopsided_mul` 當 a、b 兩數大小差距懸殊，Karatsuba 切割出來的 `ah = 0`，這樣的情況下進行計算無法得到實質上的效益。於是平衡版的 Karatsuba - `k_lopsided_mul` 因應而生，作法是將以較小的那方大小作為單位大小切割另一數，逐一將兩數以 Karatsuba 計算並累加。 ```cpp=3306 if (2 * asize <= bsize) return k_lopsided_mul(a, b); ``` ``` b = 00 | 01 | 02 | 03 = 0x00010203 a = 0C = 0x0C a * b = k_mul(03,0C) + k_mul(02,0C) << size(a) + ... 
``` #### `v_iadd` & `v_isub` 由於 `digit` 設計上保留最前面的 MSB 作為 carry/borrow 的暫存，sign 在 size，倘若計算兩正數，則不必判斷兩數大小間接判斷是否 overflow/underflow，而是將最前面的 1 bit 直接傳遞下去，省去分支。`v_iadd`、`v_isub` 則是採用上述作法，實作兩正數的一般小學加減法計算。以 `v_isub` 發生借位時為例，假設 `digit` 宣告為 8 bit，`x, y` 為 digit 型態，儲存數字最大為 7 bit，計算 `x - y`。發生借位時可將補上的 $2^8$ 視為兩個 $2^7$，一個用來計算，一個代表 borrow 發生。 ``` x 0 0000000 y - 0 1111111 ------------- 1 0000000 <- borrow flag 1 0000000 - 0 1111111 ------------- 1 0000001 ``` 同理， `v_iadd` 發生進位時為，MSB 為 1。 ``` x 0 0000001 y + 0 1111111 ------------- 1 0000000 <- carry flag 0 0000000 ``` 實作如下 ```cpp=1477 static digit v_isub(digit *x, Py_ssize_t m, digit *y, Py_ssize_t n) { Py_ssize_t i; digit borrow = 0; assert(m >= n); for (i = 0; i < n; ++i) { borrow = x[i] - y[i] - borrow; x[i] = borrow & PyLong_MASK; borrow >>= PyLong_SHIFT; borrow &= 1; /* keep only 1 sign bit */ } for (; borrow && i < m; ++i) { borrow = x[i] - borrow; x[i] = borrow & PyLong_MASK; borrow >>= PyLong_SHIFT; borrow &= 1; } return borrow; } ``` 類似的函式還有 `x_add`，差別如下 - `x_add`: 兩數相加，返回儲存結果的新物件 - `v_iadd/v_isub`: 將另一數相加/相減至另一數上，不產生新物件，返回 carry/borrow ### Grade-School Multiplication (`x_mul`) - 當 `a!=b` 時，執行傳統的乘法運算，將 `b` 逐一乘上 `a` 的每一 `digit` 累加至 `z` ``` b[2] b[1] b[0] x a[1] a[0] -------------- = (b[2] b[1] b[0]) * a[0] + (b[2] b[1] b[0]) * a[1] << sizeof(digit) ``` ```cpp=3188 /* a is not the same as b -- gradeschool int mult */ for (i = 0; i < size_a; ++i) { twodigits carry = 0; twodigits f = a->ob_digit[i]; digit *pz = z->ob_digit + i; digit *pb = b->ob_digit; digit *pbend = b->ob_digit + size_b; SIGCHECK({ Py_DECREF(z); return NULL; }); while (pb < pbend) { carry += *pz + *pb++ * f; *pz++ = (digit)(carry & PyLong_MASK); carry >>= PyLong_SHIFT; assert(carry <= PyLong_MASK); } if (carry) *pz += (digit)(carry & PyLong_MASK); assert((carry >> PyLong_SHIFT) == 0); } ``` - 當 `a==b` 時，採取 [Multiprecision Squaring](https://doi.org/10.1007/0-387-23483-7_268)。原理是透過觀察 `a * a` 有相同 partial sum，且在奇數欄有次方項出現，從而減少計算次數，最終減少約一半的乘法。 ``` a3 a2 a1 a0 x 
a3 a2 a1 a0 ---------------------------------- a3a0 a2a0 a1a0 a0^2 a3a1 a2a1 a1^2 a0a1 a3a2 a2^2 a1a2 a0a2 a3^2 a2a3 a1a3 a0a3 | | | | └---------└---------└---------└---- odd column ``` - `a` (n-digit)，所需乘法 - 改良前 $n^2$ - 改良後 $\dfrac {n^2+n}{2}$ ### 和 bignum 進行效能比較不了解 cpython 是如何管理記憶體管理的情況下，先以 `malloc` 創立 `bn` 物件，實測結果顯示效能劣於 bignum。 - [測試程式碼](https://github.com/93i7xo2/bignum) ```bash make test ``` ![](https://i.imgur.com/jBqGsi9.png) ### 檢驗正確性模仿 `x_add` 進位的方式，實作一個將以 10 除後的餘數傳遞至前一個 `digit` 的 10 進制轉換函式，每個 10 進位數值獨自佔用一個 `digit`。理論上可顯示至 $2^{64}-1\approx 1.8\cdot 10^{19}$ 位。 $\begin{split} a&=(n_{1,k}2^{15k}+...+n_{1,2}2^{30}+n_{1,1}2^{15}+n_{1,0}2^{0})10+q_0\\ &=((n_{2,k}2^{15k}+...+n_{2,2}2^{30}+n_{2,1}2^{15}+n_{2,0}2^{0})+q_1)10+q_0\\ &=(q_mq_{m-1}...q_2q_1q_0)_{10} \end{split}$ ```cpp static bn *bn_to_dec(bn *a) { /* The maximum number stored in 'a' is 2^(maxbits)-1 ,so we * need (maxbits * log(2) / log(10) + 1) digits to present * each decimal number. */ bn_size maxbits = Bn_ABS(Bn_SIZE(a)) * ((sizeof(digit) << 3) - 1); bn *str = bn_new((bn_size)(maxbits * 0.3010 + 1)); memset(str->bn_digit, 0, sizeof(digit) * Bn_ABS(Bn_SIZE(str))); bn_size i, z = 0; twodigits carry; while (z < Bn_SIZE(str)) { for (i = Bn_SIZE(a), carry = 0; i > 0; --i) { carry = carry << Bn_SHIFT | a->bn_digit[i - 1]; a->bn_digit[i - 1] = (digit)(carry / 10); carry %= 10; } str->bn_digit[z++] = carry; } return bn_normalize(str); } void bn_print_dec(bn *a) { bn* dec = bn_to_dec(a); bn_size i = 0, n = Bn_ABS(Bn_SIZE(dec)); char *str = (char *) malloc(sizeof(char) * (n + 1)); while (n > 0) { str[i++] = (dec->bn_digit[(n--) - 1] & Bn_MASK) | 0x30; } str[i] = 0; printf("%s\n", str); free(str); Bn_DECREF(dec); } ``` ## `fibdrv` 大數運算將 cpython 整合進 fibdrv。 1. 由於 `assert()` 和 `BUG_ON()` 功能相反，`assert()` 是條件不成立才 core dump，`BUG_ON` 是條件成立就 kernel panic，因此裡面的條件要改寫成相反的。 2. 
編譯時發現無法處理浮點運算，依據 [I09 - kcalc](https://hackmd.io/@sysprog/2020-kcalc) 的指示改用定點數： ``` error: SSE register return with SSE disabled 499 | bn *str = bn_new((bn_size)(maxbits * 0.3010 + 1)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` ```diff= bn_size maxbits = Bn_ABS(Bn_SIZE(a)) * ((sizeof(digit) << 3) - 1); - bn *str = bn_new((bn_size)(maxbits * 0.3010 + 1)); + #define uint128_t __uint128_t + uint128_t new_size = ((uint128_t) maxbits) * 3010; + new_size /= 10000; + new_size += 1; + #undef uint128_t + bn *str = bn_new(new_size); ``` 使用 `__uint128_t` 處理兩個 64 位元整數的運算，但無法編譯： ``` ERROR: "__udivti3" [/home/ubuntu/fibdrv/fibdrv.ko] undefined! ``` 原因是 kernel 並不會用到 [libgcc](https://gcc.gnu.org/onlinedocs/gccint/Libgcc.html#Libgcc)，所以要改寫防止編譯器呼叫到 libgcc 裡的函式。於是將 0.3010 拆成 2 個數先除後乘： ```cpp /* 0.3010 = 2^9 / 1701 * maxbits = p*1701 + q, q<1701 */ bn_size p = maxbits / 1701, q = maxbits % 1701; bn_size new_size = (p << 9) + (q << 9) / 1701; new_size += 1; ``` 3. 由於[sysfs](https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt) 分配的 buffer 大小只有 PAGE_SIZE，無法顯示 `PAGE_SIZE` (4096) 位以後的數字 > sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the method. 最終的實作放在 [fibdrv](https://github.com/93i7xo2/fibdrv)，fibonacci number 可透過以下方式讀取，計算很快，慢的是轉換成 10 進位顯示： 1. 透過 sysfs 介面存取 ```bash make load sudo sh -c "echo 10000 >/sys/kernel/fibdrv/fib" cat /sys/kernel/fibdrv/fib make unload ``` 2. 參考 `client.c` 使用 `read()` 讀取字串，`sz` 是字串大小 ```cpp /* fibdrv_mod.c */ kt = ktime_get(); fibnum = fib_sequence(*offset); kt = ktime_sub(ktime_get(), kt); dec = bn_to_dec(fibnum); str = bn_to_str(dec); remains = copy_to_user(buf, str, len = strlen(str) + 1); bfree(str); Bn_DECREF(dec); return (ssize_t) remains ? 
-EFAULT : len - 1; ``` ```cpp /* client.c */ lseek(fd, i, SEEK_SET); sz = read(fd, buf, 1); buf[sz] = 0; printf("Reading from " FIB_DEV " at offset %d, returned the sequence " "%s.\n", i, buf); ``` 修改測試文件 `expected.txt` 使其涵蓋到更大的數字 ``` Reading from /dev/fibonacci at offset 93, returned the sequence 12200160415121876738. Reading from /dev/fibonacci at offset 94, returned the sequence 19740274219868223167. Reading from /dev/fibonacci at offset 95, returned the sequence 31940434634990099905. Reading from /dev/fibonacci at offset 96, returned the sequence 51680708854858323072. Reading from /dev/fibonacci at offset 97, returned the sequence 83621143489848422977. Reading from /dev/fibonacci at offset 98, returned the sequence 135301852344706746049. Reading from /dev/fibonacci at offset 99, returned the sequence 218922995834555169026. Reading from /dev/fibonacci at offset 100, returned the sequence 354224848179261915075. Reading from /dev/fibonacci at offset 100, returned the sequence 354224848179261915075. Reading from /dev/fibonacci at offset 99, returned the sequence 218922995834555169026. Reading from /dev/fibonacci at offset 98, returned the sequence 135301852344706746049. Reading from /dev/fibonacci at offset 97, returned the sequence 83621143489848422977. Reading from /dev/fibonacci at offset 96, returned the sequence 51680708854858323072. Reading from /dev/fibonacci at offset 95, returned the sequence 31940434634990099905. Reading from /dev/fibonacci at offset 94, returned the sequence 19740274219868223167. Reading from /dev/fibonacci at offset 93, returned the sequence 12200160415121876738. ``` ```bash ~$ make check Passed [-] ``` ## Reference - [How to fix linker warning: “function undefined” in kernel space?](https://stackoverflow.com/questions/55999728/how-to-fix-linker-warning-function-undefined-in-kernel-space)