2022q1 Homework3 (fibdrv)

contributed by <scottxxxabc>

作業說明
 GitHub

開發環境

$ cat /proc/version
Linux version 5.13.0-30-generic (buildd@lcy02-amd64-003) (gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #33~20.04.1-Ubuntu SMP Mon Feb 7 14:25:10 UTC 2022

$ lscpu
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   39 bits physical, 48 bits virtual
CPU(s):                          8
On-line CPU(s) list:             0-7
Thread(s) per core:              2
Core(s) per socket:              4
Socket(s):                       1
NUMA node(s):                    1
Vendor ID:                       GenuineIntel
CPU family:                      6
Model:                           140
Model name:                      11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
Stepping:                        1
CPU MHz:                         900.260
CPU max MHz:                     4800.0000
CPU min MHz:                     400.0000
BogoMIPS:                        5990.40
Virtualization:                  VT-x
L1d cache:                       192 KiB
L1i cache:                       128 KiB
L2 cache:                        5 MiB
L3 cache:                        12 MiB
NUMA node0 CPU(s):               0-7
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Not affected
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled 
                                 via prctl and seccomp
Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __use
                                 r pointer sanitization
Vulnerability Spectre v2:        Mitigation; Enhanced IBRS, IBPB conditional, R
                                 SB filling
Vulnerability Srbds:             Not affected
Vulnerability Tsx async abort:   Not affected
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mt
                                 rr pge mca cmov pat pse36 clflush dts acpi mmx
                                  fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb
                                  rdtscp lm constant_tsc art arch_perfmon pebs 
                                 bts rep_good nopl xtopology nonstop_tsc cpuid 
                                 aperfmperf tsc_known_freq pni pclmulqdq dtes64
                                  monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma
                                  cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movb
                                 e popcnt tsc_deadline_timer aes xsave avx f16c
                                  rdrand lahf_lm abm 3dnowprefetch cpuid_fault 
                                 epb cat_l2 invpcid_single cdp_l2 ssbd ibrs ibp
                                 b stibp ibrs_enhanced tpr_shadow vnmi flexprio
                                 rity ept vpid ept_ad fsgsbase tsc_adjust bmi1 
                                 avx2 smep bmi2 erms invpcid rdt_a avx512f avx5
                                 12dq rdseed adx smap avx512ifma clflushopt clw
                                 b intel_pt avx512cd sha_ni avx512bw avx512vl x
                                 saveopt xsavec xgetbv1 xsaves split_lock_detec
                                 t dtherm ida arat pln pts hwp hwp_notify hwp_a
                                 ct_window hwp_epp hwp_pkg_req avx512vbmi umip 
                                 pku ospke avx512_vbmi2 gfni vaes vpclmulqdq av
                                 x512_vnni avx512_bitalg tme avx512_vpopcntdq r
                                 dpid movdiri movdir64b fsrm avx512_vp2intersec
                                 t md_clear flush_l1d arch_capabilities

fibdrv

引入big number

為了計算 f(93) 之後的數字，在 bn.h 中定義新資料結構 bn :

typedef struct bignum {
    uint32_t num[MAX_LENGTH_BN];
    unsigned int length;
} bn;

將數字的最低 32 bit 存放在 num[0]，第 33 ~ 64 bit 存放在 num[1]，以此類推。這個資料結構可視為一個 4294967295 進位的數。

使用 unsigned 32 bit integer 陣列來儲存數字，在做加法以及乘法時，可以使用 64 bit 的運算來省去處理 overflow 的麻煩，但比起直接使用 64 bit long long，加法以及乘法需要的次數也更多。

big number add

void bn_add(bn *a, bn *b, bn *result)
{
    unsigned int max = a->length >= b->length ? a->length : b->length;
    uint32_t buf[MAX_LENGTH_BN] = {0};
    int i = 0;
    uint32_t carry = 0;
    uint64_t sum;
    while (i < max) {
        sum = (uint64_t) a->num[i] + b->num[i] + carry;
        carry = sum >> 32;
        buf[i] = (uint32_t) sum;
        i++;
    }

    if (carry && i < MAX_LENGTH_BN)
        buf[i++] = carry;

    result->length = i;
    for (i = 0; i < MAX_LENGTH_BN; i++)
        result->num[i] = buf[i];
}

先將數字轉為 int64，從較低位元開始相加，此時較低的 32 位元為相加結果，較高的 32 位元為會成為下一次加法的進位。

big number sub

void bn_sub(bn *a, bn *b, bn *result)
{
    int max = a->length >= b->length ? a->length : b->length;
    uint32_t buf[MAX_LENGTH_BN] = {0};
    int i = 0;
    uint32_t carry = 0;
    while (i < max) {
        if (a->num[i] < b->num[i] + carry) {
            buf[i] = UINT_MAX - b->num[i] + a->num[i] + 1 - carry;
            carry = 1;
        } else {
            buf[i] = a->num[i] - b->num[i] - carry;
            carry = 0;
        }
        i++;
    }

    if (carry) {
        bn_sub(b, a, result);
        return;
    }

    result->length = MAX_LENGTH_BN;
    for (i = MAX_LENGTH_BN - 1; i >= 0; i--)
        result->num[i] = buf[i];
    for (i = result->length - 1; i > 0; i--)
        if (result->num[i] == 0)
            result->length--;
        else
            break;
}

做減法時須考慮 unsigned integer overflow 的問題。先用 UINT_MAX 減去 b->num[i]，再加上 a->num[i]，因為 UINT_MAX 是

2^{32} - 1

，最後再加上 1。

big number mul

void bn_mul(bn *a, bn *b, bn *result)
{
    int i = 0;
    bn *sum = kmalloc(sizeof(bn), GFP_KERNEL);
    bn_init(sum);
    bn *tmp = kmalloc(sizeof(bn), GFP_KERNEL);
    bn_init(tmp);


    while (i < b->length) {
        bn_mul_single(a, b->num[i], tmp);
        bn_grow(tmp, i);
        bn_add(sum, tmp, sum);
        i++;
    }

    for (int j = 0; j < MAX_LENGTH_BN; j++)
        result->num[j] = sum->num[j];
    result->length = sum->length;
    kfree(tmp);
    kfree(sum);
}

void bn_grow(bn *a, const unsigned int n)
{
    if (!a)
        return;
    if (n == 0 || a->length + n > MAX_LENGTH_BN)
        return;
    int i = MAX_LENGTH_BN - n - 1;
    for (; i >= 0; i--)
        a->num[i + n] = a->num[i];
    for (i = 0; i < n; i++)
        a->num[i] = 0;
    a->length += n;
}

void bn_mul_single(bn *a, const uint32_t b, bn *result)
{
    uint32_t buf[MAX_LENGTH_BN] = {0};
    int i = 0;
    uint32_t carry = 0;
    uint64_t sum;
    while (i < a->length) {
        sum = (uint64_t) a->num[i] * (uint64_t) b + carry;
        carry = sum >> 32;
        buf[i] = (uint32_t) sum;
        i++;
    }
    if (carry && (i < MAX_LENGTH_BN))
        buf[i++] = carry;

    for (int j = 0; j < MAX_LENGTH_BN; j++)
        result->num[j] = buf[j];
    result->length = i;
}

使用到輔助函式 bn_mul_single 以及 bn_grow。

bn_mul_single 將 bn 乘以一個 32 bit integer。
bn_grow 將 bn->num 陣列元素往後移動 i 個，相當於乘以 i 次
$2^{32}$ 。
採用直式乘法的想法，可以將 bn 視為
$2^{32}$ 進位的數字，在做乘法時
- 先呼叫 bn_mul_single() 計算 b->num[i] 乘上 a->num。
- 呼叫 bn_grow() 將計算結果乘以 i 次
  $2^{32}$ 。
- 將結果加至 sum 。
- 重複執行直到 bn->num 陣列結束。
big number tostr

char *bn_tostr(bn *a)
{
    char buf[8 * sizeof(uint32_t) * MAX_LENGTH_BN / 3 + 2] = {0};
    int str_size = 8 * sizeof(uint32_t) * MAX_LENGTH_BN / 3 + 2;
    buf[str_size - 1] = '\0';
    int i = 32 * MAX_LENGTH_BN - 1;
    // i/32 = i >> 5, i%32 = i & 0x1F
    while (i >= 0) {
        int carry;
        if (i >> 5 > a->length)
            carry = 0;
        else
            carry = (a->num[i >> 5] & ((unsigned int) 1 << (i & 0x1F))) >>
                    (i & 0x1F);

        for (int j = str_size - 2; j >= 0; j--) {
            buf[j] += buf[j] - '0' + carry;
            carry = (buf[j] > '9');
            if (carry)
                buf[j] -= 10;
        }
        i--;
    }
    i = 0;
    while (i < str_size - 2 && (buf[i] == '0' || buf[i] == '\0'))
        i++;
    char *tmp = kmalloc((str_size - i) * sizeof(char), GFP_KERNEL);
    memcpy(tmp, buf + i, (str_size - i) * sizeof(char));

    return tmp;
}

fast doubling

根據 fast doubling 以及作業要求中提供的虛擬碼

Fast_Fib(n)
    a = 0; b = 1;       // m = 0
    for i = (number of binary digit in n) to 1
        t1 = a*(2*b - a);
        t2 = b^2 + a^2;
        a = t1; b = t2; // m *= 2
        if (current binary digit == 1)
            t1 = a + b; // m++
            a = b; b = t1;
    return a;

省略 F(0)，直接從 F(1) 開始;
clz: 先去除數字 MSB 起算的開頭的 0 ，因此用 clz 計算有多少 leading 0s。
- 遇到 0 → n 為偶數，進行 fast doubling，也就是求 F(2n) 和 F(2n+1)
- 遇到 1 → n 為奇數，進行 fast doubling，先求 F(2n) 和 F(2n+1)，再求 F(2n+2)。利用 F(2n+1)，F(2n+2) 往下運算。

Fast doubling 的時間複雜度

f (2 n) = f (n) * (2 * f (n + 1) - f (n))

f (2 n + 1) = f (n)^{2} + f (n + 1)^{2}

因為每一次都會由 F(n) 以及 F(n+1) 計算出 F(2n) 和 F(2n+1)，整個 for 迴圈需要進行

⌈ l o g_{2} n ⌉

次，因此時間複雜度為

O (l o g n)

。

在上述 fast doubling 程式碼中使用我寫的 bn 結構以及函示 :

static int fib_sequence_fd(long long k, char __user *buf)
{
    bn *a = kmalloc(sizeof(bn), GFP_KERNEL),
       *b = kmalloc(sizeof(bn), GFP_KERNEL);
    bn_init(a);
    bn_init(b);
    a->num[0] = 0;
    b->num[0] = 1;
    b->length = 1;

    int h = 64 - (__builtin_clzll(k));
    for (int mask = 1 << (h - 1); mask; mask >>= 1) {
        bn tmp1, tmp2, tmp3;
        bn_init(&tmp1);
        bn_init(&tmp2);
        bn_init(&tmp3);
        bn_ls(b, 1, &tmp1);
        bn_sub(&tmp1, a, &tmp2);
        bn_mul(&tmp2, a, &tmp1);

        bn_mul(a, a, &tmp2);
        *a = tmp1;
        bn_mul(b, b, &tmp3);
        bn_add(&tmp2, &tmp3, &tmp1);
        *b = tmp1;
        if (mask & k) {
            bn_add(a, b, &tmp1);
            *a = *b;
            *b = tmp1;
        }
    }
    
    char *str = bn_tostr(a);
    int n = strlen(str);
    if (copy_to_user(buf, str, strlen(str) + 1))
        return -EFAULT;
    
    kfree(a);
    kfree(b);
    kfree(str);
    return 1;
}

測量 Kernel 時間

參考 hrtimer 以及 IBM 文章，使用 ktime 相關 api 來測量時間。我利用未使用到的 write() 方法來讀取結果，將 fib_read() 以及 fib_write() 修改如下：

static ssize_t time1;
static ssize_t fib_read(struct file *file,
                        char __user *buf,
                        size_t size,
                        loff_t *offset){
...
    ktime_t kt1 = ktime_get();
    result = fib_sequence(*offset);
    kt1 = ktime_sub(ktime_get(), kt1);
...
    time1 = ktime_to_ns(kt1);
...
}

static ssize_t fib_write(struct file *file,
                         const char *buf,
                         size_t size,
                         loff_t *offset)
{
    switch (*offset) {
    case 1:
        return time1;
    case 2:
        return time2;
    case 3:
        return time3;
    default:
        return current->pid;
    }
}

並在 Makefile 加入程式碼，讓使用者可以用 make plot 指令執行十次 ./client 並取平均，接著用 gnuplot 繪圖。

plot: all
	$(MAKE) unload
	$(MAKE) load
	for i in 1 2 3 4 5 6 7 8 9 10; do \
		sudo ./client > out; \
		sudo chown $$(whoami) time/time; \
		mv time/time ${PREFIX}$$i${POSTFIX}; \
	done
	$(MAKE) unload
	@./scripts/time.py
	gnuplot scripts/time.gp
	eog time.png

橫軸為第 x 個費氏數列數， y 軸為時間。

計算到第 1000 個費波納契數的結果如下，實際上不使用 bn 的方法只能夠計算到 fn(93) 而已。

可以發現到純加法的花費的時間在 fn(300) 以前低於 fast doubling，大致上呈線性增長，而 fast doubling 則是

l o g

曲線，

O (l o g n)

將行程固定在特定的 CPU 中執行

在開機時使用 boot loader 所提供的自訂開機參數功能，開機時浸入 GRUB 手動加入 isolcpus=cpu_id 參數

isolcpus=0

要查看指定行程的處理器親和性，可使用 taskset 加上 -p 參數再加上該行程的 PID（process ID）：

$ taskset -p 1
pid 1's current affinity mask: fe

fe 轉成二進位的格式會是 11111110，最右邊的 0 代表該行程不允許在 CPU0 執行。

$ cat /sys/devices/system/cpu/isolated
0

使用 htop 檢查 CPU 使用情況，也可以發現 CPU0 的用量為 0 。

排除干擾效能分析的因素

抑制 address space layout randomization (ASLR)

$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"

設定 scaling_governor 為 performance。準備以下 shell script，檔名為 performance.sh:

for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
do
    echo performance > ${i}
done

之後再用 $ sudo sh performance.sh 執行

針對 Intel 處理器，關閉 turbo mode

$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"

在 client.c 中使用 write 取得 fibdrv 的 pid ，並用 sched_setaffinity將 kernel module 限定在 CPU0。

lseek(fd, 0, SEEK_SET);
long long pid = write(fd, buf, 0);
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(0, &cpuset);
if (sched_setaffinity(pid, sizeof(cpuset), &cpuset)) {
    exit(1);
}

#define _GNU_SOURCE             /* See feature_test_macros(7) */
#include <sched.h>
sched_setaffinity(pid, sizeof(cpuset), &cpuset)

在排除干擾效能分析之後再次繪圖

可以發現線變得更平滑，但花費的時間也小幅增加。

圖中的凸起

從上圖的可以發現獻上會有異常凸起的點，經過老師課堂上的提示，推測有可能是初次載入 page 時的 page fault 花費的額外時間造成。

mlockall()

mlockall() 會防止 page 被 swap out ，直到 munlockall() 被呼叫為止。輸入的 flag 參數 MCL_CURRENT 會讓現在已在記憶體中的 page 不會被 swap out，MCL_FUTURE 則會讓未來載入的所有 page 不會被 swap out。

在 client.c 中呼叫mlockall(MCL_CURRENT | MCL_FUTURE) ，並且事先計算一次 fib(n) ，讓足夠的 page 被載入並不被置換掉 :

#include <sys/mman.h>
...
    if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
        perror("mlockall");
        return 1;
    }

    for (int i = 0; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, 1);
    }
...

接著再次繪圖，發現線上異常的突起變少了。

減少 copy to user 傳送的資料量

計算 leading zeros

以

f i b (100) = 354224848179261915075 =

100110011001111011011011101101010011111000101100101001011111111000011_{2}

為例，先對

f i b (100)

取2為底的對數

\log_{2} (354224848179261915075) = 68.26322731561805

無條件進位後是69，要表達到

2^{69} - 1

總共需要 69 個 bit，每個 int 是 32 bit，共需要 3 個 int。
leading zeros 數量為 96-69 = 27個

可以用 bitwise 操作最高位的 int 來計算leading zeros。

int clz (int x){
    if (x == 0) return 32;
    int n = 0;
    if ((x & 0xFFFF0000) == 0) {n += 16; x <<= 16;}
    if ((x & 0xFF000000) == 0) {n +=  8; x <<=  8;}
    if ((x & 0xF0000000) == 0) {n +=  4; x <<=  4;}
    if ((x & 0xC0000000) == 0) {n +=  2; x <<=  2;}
    if ((x & 0x80000000) == 0) n +=  1;
    return n;
}

這個程式碼以二分搜尋法找出leading zeros，做法如下:

檢查最高 16 bit 是否全為零，全為零的話將 x 的首16個零移除(x 左移16 bit)
檢查

以 fib(100) 為例，最高位int儲存的是 0x00000013

0x00000013 & 0xFFFF0000 == 0 ， n = 16 ， x = 0x00130000
0x00130000 & 0xFF000000 == 0 ， n = 24 ， x = 0x13000000
0x13000000 & 0xF0000000 != 0
0x13000000 & 0xC0000000 == 0 ， n = 26 ， x = 0x4C000000
0x4C000000 & 0x80000000 == 0 ， n = 27

與之前的計算一致。

計算以2為底數的對數

以 32 減去 clz(x) 可獲得 x 所使用到的 bit 數量。
令

n = 32 - c l z (x)

使用這些 bit 可表達的範圍為

2^{n - 1}

2^{n} - 1

，可得

n = 32 - c l z (x) = ⌈ l o g_{2} x ⌉

且

n - 1 = 31 - c l z (x) = ⌊ l o g_{2} x ⌋

。

fibdrv 計算完後，會透過 cop_to_user() 將結果傳回 user。計算leading zeros，並將 int32 細分為 4 個 byte，呼叫cop_to_user() 時不傳送全為 0 的 byte。

static int my_copy_to_user(const bn *bignum, char __user *buf)
{
    int i = MAX_LENGTH_BN - 1;
    for (; bignum->num[i] == 0; i--)
        ;
    int lzbyte = __builtin_clz(bignum->num[i]) >> 3;

    size_t size = sizeof(int32_t) * (i + 1) - lzbyte;
    copy_to_user(buf, bignum->num, size);

    return size;
}

使用 gcc 內建函式 __builtin_clz 來計算 leading zero bit 數量， >> 3 除以 8 並捨棄餘數換算成 leading zero byte。

我的電腦使用 little-endian 儲存資料，因此非零的 byte 會被存在較低的位址。以 fib(100) 為例，需要 3 個 int，最高位int是 0x00000013，其中我只想要前 2 個 int 以及第 3 個 int 的 0x13 被 cop_to_user() 傳送。

將 cop_to_user() 的 size 指定為 2 個 int 加上 1 byte ，讓 leading zero byte 不用被傳送。

將複製的 byte 數量作為 read 的回傳值傳回 user。

..
sz = my_copy_to_user(a, buf);
...
return sz;

在 client.c 中將 bn 初始化後用 memcpy 複製資料。

    ...
    sz = read(fd, buf, 1);
        
    bn a;
    bn_init(&a);

    memcpy(a.num, buf, sz);
    int j = MAX_LENGTH_BN;
    for (; a.num[j] == 0; j--)
        ;
    a.length = j + 1;
    ...

Mutex 的影響

fibdrv 在 fib_open() 使用 mutex_trylock 上鎖，在 fib_release() 使用 mutex_unlock 解鎖。
撰寫 client_mutex_test.c 來測試多執行緒下 Mutex 對 fibdrv 的影響。

void* child1() {
    long long sz;

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        close(fd);
        pthread_exit(NULL);
    }

    char buf[1];
    for (int i = 0; i <= 92; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, 1);
        printf("Thread 1 : F(%d) =  %lld.\n",
               i, sz);
        usleep(100000);
        
    }
    close(fd);
}
void* child2() {
    long long sz;

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        close(fd);
        pthread_exit(NULL);
    }

    char buf[1];
    for (int i = 0; i <= 92; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, 1);
        printf("Thread2 : F(%d) =  %lld.\n",
               i, sz);
        usleep(100000);
        
    }
    close(fd);
}
int main()
{
    int offset = 100;
    pthread_t t1, t2;
    pthread_create(&t1, NULL, child1, NULL);
    sleep(1);
    pthread_create(&t2, NULL, child2, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    
    return 0;
}

創造 2 個 thread 來測試，Thread 1 開啟 /dev/fibonacci 後上鎖，一秒鐘後 Thread 2 嘗試開啟 /dev/fibonacci 時失敗，因為 Thread 1 已經取得 mutex 了。

$ sudo ./client_mutex_test 
...
Thread 1 : F(8) =  21.
Thread 1 : F(9) =  34.
Thread 2 Failed to open character device: Device or resource busy

將 fibdrv.c 中的 Mutex 註解掉再次進行測試 :

static int fib_open(struct inode *inode, struct file *file)
{
/*
    if (!mutex_trylock(&fib_mutex)) {
        printk(KERN_ALERT "fibdrv is in use");
        return -EBUSY;
    }
*/
    return 0;
}

static int fib_release(struct inode *inode, struct file *file)
{
    //mutex_unlock(&fib_mutex);
    return 0;
}

可以發現即使沒有 Mutex 仍能正常運行 :

...
Thread1 : F(90) =  2880067194370816120.
Thread2 : F(81) =  37889062373143906.
Thread1 : F(91) =  4660046610375530309.
Thread1 : F(92) =  7540113804746346429.
Thread2 : F(82) =  61305790721611591.
Thread2 : F(83) =  99194853094755497.
Thread2 : F(84) =  160500643816367088.
Thread2 : F(85) =  259695496911122585.
Thread2 : F(86) =  420196140727489673.
Thread2 : F(87) =  679891637638612258.
Thread2 : F(88) =  1100087778366101931.
Thread2 : F(89) =  1779979416004714189.
Thread2 : F(90) =  2880067194370816120.
Thread2 : F(91) =  4660046610375530309.
Thread2 : F(92) =  7540113804746346429.

不需要 Mutex 就能正常運行的原因是因為 thread 並沒有需要共享或是競爭的資源。若是要改寫 fibdrv 使多個 thread 共享資源則仍需要使用 Mutex 上鎖。

預先計算儲存 F(n) 需要的空間大小

我希望能一開始就配置給 bn 剛剛好能夠存放 F(n) 的記憶體大小，避免空間浪費，或是計算過程中一直重複呼叫 krealloc。

根據 wiki ，當

n

為足夠大的正整數時，

F (n) \approx 0.4472135955 \times {1.61803398875}^{n}

將近似值取 2 為底的對數，可得到需要使用的 bit 數

l o g_{2} (0.4472135955 \times {1.61803398875}^{n}) \approx - 1.160964 + n \times 0.694242

再除以 32 可得到需要使用的 uint32 數量

(- 1.160964 + n \times 0.694242) \div 32 = - 0.036280125 + n \times 0.0216950625

kernel module 無法使用浮點數運算，因此將算式乘以

10^{10}

⌊ \frac{- 362801250 + n \times 216950625}{10^{10}} ⌋ + 1

此算式計算出來的結果就是需要配置的 uint32 數量。

fibonacci 數直到 F(47) = 2971215073 都可以用一個 uint32 表達，因此當

n < 47

時直接回傳 1。

static long long fib_size(long long n) {
    if (n <= 47)
        return 1;
    return (-362801250 + n*216950625)/10000000000 + 1;
}

將 bn 結構修改，陣列改為指標，採用動態配置，並新增 max_length 變數儲存得到的 uint32_t 數量。並且修改 bn_init() 以及新增 bn_free() ，方便配置以及釋放記憶體。

typedef struct bignum {
    uint32_t *num;
    unsigned long long length;
    unsigned long long max_length;
} bn;
void bn_init(bn *a, unsigned int size)
{
    if (!a) {
        a = kzalloc(sizeof(bn), GFP_KERNEL);
        if (!a)
            return;
    }
    if (!a->num){
        a->num = kzalloc(sizeof(uint32_t) * size, GFP_KERNEL);
        if (!a->num) {
            a->length = 0;
            a->max_length = 0;
            return;
        }
    }
    a->max_length = size;
    for (int i = 0; i < a->max_length; i++)
        a->num[i] = 0;
    a->length = 1;
}

void bn_free(bn *a)
{
    if (a) {
        if (a->num)
            kfree(a->num);
        kfree(a);
    }
}

修改 fib_sequence_fd 的實作，在開始計算之前先配置記憶體，並且在配置失敗時返回。

































































#define SWAP(a, b, type) \
    do {                 \
        type tmp = a;    \
        a = b;           \
        b = tmp;         \
    } while (0)
static int fib_sequence_fd(long long k, char __user *buf)
{
    unsigned int esz = fib_size(k + 1);

    bn *a = kmalloc(sizeof(bn), GFP_KERNEL),
       *b = kmalloc(sizeof(bn), GFP_KERNEL);
    bn_init(a, esz);
    bn_init(b, esz);
    a->num[0] = 0;
    b->num[0] = 1;
    b->length = 1;
    bn *tmp1 = kmalloc(sizeof(bn), GFP_KERNEL),
       *tmp2 = kmalloc(sizeof(bn), GFP_KERNEL),
       *tmp3 = kmalloc(sizeof(bn), GFP_KERNEL);
    bn_init(tmp1, esz);
    bn_init(tmp2, esz);
    bn_init(tmp3, esz);

    if (!a || !b || !tmp1 || !tmp2 || !tmp3) {
        bn_free(a);
        bn_free(b);
        bn_free(tmp1);
        bn_free(tmp2);
        bn_free(tmp3);
        return 0;
    }

    int h = 64 - (__builtin_clzll(k));
    for (int mask = 1 << (h - 1); mask; mask >>= 1) {
        bn_init(tmp1, esz);
        bn_init(tmp2, esz);
        bn_init(tmp3, esz);
        bn_ls(b, 1, tmp1);
        bn_sub(tmp1, a, tmp2);
        bn_mul(tmp2, a, tmp1);

        bn_mul(a, a, tmp2);
        SWAP(a, tmp1, bn *);
        bn_mul(b, b, tmp3);
        bn_add(tmp2, tmp3, tmp1);
        SWAP(b, tmp1, bn *);
        if (mask & k) {
            bn_add(a, b, tmp1);
            SWAP(a, b, bn *);
            SWAP(b, tmp1, bn *);
        }
    }

    size_t sz = 0;
    sz = my_copy_to_user(a, buf);

    bn_free(a);
    bn_free(b);
    bn_free(tmp1);
    bn_free(tmp2);
    bn_free(tmp3);

    return sz;
}

第 44、47、50、51 行，將原本的 *a = tmp1 改為使用 SWAP 聚集交換兩個指標。

以相同方式估算轉換為十進位並以 string 表示需要的 char 數量 :
n 個 int 能夠表達到

2^{32 \times n} - 1

l o g_{10} (2^{32 \times n}) \approx 32 \times n \times 0.30103 \approx n \times 9.63296

同乘

10^{5}

⌊ \frac{n \times 963296}{10^{5}} ⌋ + 1

在 bn_tostr 引入上述算式事先計算需要的 char 數量 :

char *bn_tostr(bn *a){
    unsigned long long str_size = (a->max_length * 963296) / 100000 + 1;
    char *buf = kzalloc(sizeof(char) * str_size, GFP_KERNEL);
    ...
}

參考

smallhanley