Try   HackMD

contributed by < blueskyson >

作業說明 GitHub

開發環境

$ gcc --version
gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0

$ lscpu
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   39 bits physical, 48 bits virtual
CPU(s):                          12
On-line CPU(s) list:             0-11
Thread(s) per core:              2
Core(s) per socket:              6
Socket(s):                       1
NUMA node(s):                    1
Vendor ID:                       GenuineIntel
CPU family:                      6
Model:                           165
Model name:                      Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
Stepping:                        2
CPU MHz:                         2600.000
CPU max MHz:                     5000.0000
CPU min MHz:                     800.0000
BogoMIPS:                        5199.98
Virtualization:                  VT-x
L1d cache:                       192 KiB
L1i cache:                       192 KiB
L2 cache:                        1.5 MiB
L3 cache:                        12 MiB
NUMA node0 CPU(s):               0-11

修正 Fibonacci 數計算的正確性

在 fibdrv.c 的 fib_sequencelong long 來處理費氏數列的值,運算到

F93 時會 overflow,因此必須以能夠儲存更大數值的資料結構取代 long long

採用 struct BigN 取代 long long

首先在 client.c 以及 fibdrv.c 定義結構 struct BigN 並給予別名為 u128

#define BIGN
#ifdef BIGN
typedef struct BigN {
    unsigned long long upper, lower;
} u128;
#endif

我一開始用 u128 表示正整數

upper264+lower,但是在我無法基數轉換。

參考 hankluo6 的作法,讓 lower 最多只儲存

1019,因此 u128 表示的值為
upper1019+lower
。如此一來數值由 lower 進位到 upper 的規則會類似於
1019
進位,可以表示的數值範圍為
0
1.81038
,規律如下:

decimal upper lower
0 0 0
1 0 1
2 0 2
10191
0 9999
1019
1 0
1019+1
1 1
210191
1 9999
21019
2 0

如此就能夠很輕易的利用 printf 印出字串如下:

printf("%llu%019llu", fib->upper, fib->lower);

套用 u128 改寫的後的 fib_sequence 如下:

#ifdef BIGN
#define P10_UINT64 10000000000000000000ULL
static u128 fib_sequence_u128(long long k)
{
    if (k <= 1LL) {
        u128 ret = {.upper = 0, .lower = (unsigned long long) k};
        return ret;
    }

    u128 a, b, c;
    a.upper = 0;
    a.lower = 0;
    b.upper = 0;
    b.lower = 1;

    for (int i = 2; i <= k; i++) {
        c.lower = a.lower + b.lower;
        c.upper = a.upper + b.upper;
        // if lower overflows or lower >= 10^9, add a carry to upper
        if ((c.lower < a.lower) || (c.lower >= P10_UINT64)) {
            c.lower -= P10_UINT64;
            c.upper += 1;
        }
        a = b;
        b = c;
    }

    return c;
}
#endif

接下來再改寫 fib_read,原本的 fib_read 函式內部直接回傳

Fk 的值,亦即透過 read(fd, buf, 1) 的回傳值來告訴 client.c
Fk
的值,其型態為 ssize_t。追溯 ssize_t 的定義:

// posix_types.h
typedef long               __kernel_long_t;

/*
 * Most 32 bit architectures use "unsigned int" size_t,
 * and all 64 bit architectures use "unsigned long" size_t.
 */
#if __BITS_PER_LONG != 64
// typedef something...
#else
typedef __kernel_long_t    __kernel_ssize_t;
#endif

// types.h
typedef __kernel_ssize_t   ssize_t;

觀察上述定義以及參閱 Integer-Types 得知 long 為 32 或 64 位元,我測試了自己的電腦得知我的 long 為 64 位元,亦即 ssize_t 為 64 位元。

顯然 ssize_t 傳遞

F93 以後的數值會 overflow,應該改用 copy_to_user API,將
Fk
的資料,複製到 read(fd, buf, 1)buf 中,在 client.c 做基數轉換。改寫後的 fib_read 如下:

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
#ifdef BIGN
    u128 fib = fib_sequence_u128(*offset);
    copy_to_user(buf, &fib, sizeof(fib));
    return sizeof(fib);
#else
    return (ssize_t) fib_sequence(*offset);
#endif
}

接下來改寫 client.c:

#define FIB_DEV "/dev/fibonacci"
#define BIGN

#ifdef BIGN
typedef struct BigN {
    unsigned long long upper, lower;
} u128;

void print_fib_u128(int i, u128 *fib)
{
    if (fib->upper) {
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%llu%019llu.\n",
               i, fib->upper, fib->lower);
    } else {
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%llu.\n",
               i, fib->lower);
    }
}
#endif

int main()
{
    long long sz;

    char buf[sizeof(u128)];
    char write_buf[] = "testing writing";
    int offset = 100; /* TODO: try test something bigger than the limit */

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    for (int i = 0; i <= offset; i++) {
        sz = write(fd, write_buf, strlen(write_buf));
        printf("Writing to " FIB_DEV ", returned the sequence %lld\n", sz);
    }

    for (int i = 0; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, sizeof(u128));
#ifdef BIGN
        print_fib_u128(i, (u128 *) buf);
#else
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%lld.\n",
               i, sz);
#endif
    }

    for (int i = offset; i >= 0; i--) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, sizeof(u128));
#ifdef BIGN
        print_fib_u128(i, (u128 *) buf);
#else
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%lld.\n",
               i, sz);
#endif
    }

    close(fd);
    return 0;
}

測試時我發現程式永遠都顯示

F92 的結果,在 fib_read 中使用 printk("%lld\n", *offset); 然後用 dmesg 觀察紀錄,發現 *offset 會被限制不能超過 92,然後我在 fibdrv.c 中發現巨集:

/* MAX_LENGTH is set to 92 because
 * ssize_t can't fit the number > 92
 */
#define MAX_LENGTH 92

它在 fib_device_lseek 中限制 lseekoffset。將此巨集改為 100 後即可正常執行到

F100。執行 python scripts/verify.py 成功通過驗證。

此 BigN 實作最高可以計算

F184=12712787974383433414upper6972278486287885163lower

截至目前為止的實作在 53f5ddf

擴展 struct BigN 使其能夠計算
F1000

改寫 struct BigN 的資料結構,取名為 ubig,把 upper, lower 擴增為 cell[size] 如下:

#define BIGN
#ifdef BIGN

#define BIGNSIZE 12

typedef struct BigN {
    unsigned long long cell[BIGNSIZE];
} ubig;

#define init_ubig(x) for (int i = 0; i < BIGNSIZE; x.cell[i++] = 0)

#endif

上方的 BIGNSIZE 巨集用來控制 cell 陣列的長度,當 BIGNSIZE 為 12,ubig 即為一個擁有 768 位元的 cell 用以儲存費氏數列的值,可以表示的數值範圍為

0
1.8101912
cell[0] 儲存最低位、cell[11] 儲存最高位。

接著改寫 fib_sequence

#ifdef BIGN
#define P10_UINT64 10000000000000000000ULL
static ubig fib_sequence_ubig(long long k)
{
    if (k <= 1LL) {
        ubig ret;
        init_ubig(ret);
        ret.cell[0] = (unsigned long long) k;
        return ret;
    }

    ubig a, b, c;
    init_ubig(a);
    init_ubig(b);
    b.cell[0] = 1;

    for (int i = 2; i <= k; i++) {
        for (int j = 0; j < BIGNSIZE; j++)
            c.cell[j] = a.cell[j] + b.cell[j];

        for (int j = 0; j < BIGNSIZE - 1; j++) {
            // if lower overflows or lower >= 10^9, add a carry to upper
            if ((c.cell[j] < a.cell[j]) || (c.cell[j] >= P10_UINT64)) {
                c.cell[j] -= P10_UINT64;
                c.cell[j + 1] += 1;
            }
        }

        a = b;
        b = c;
    }

    return c;
}
#endif

為了不讓 client.c 過於複雜,我在 fibdrv.c 就將 BigN 轉換為字串,然後直接用 copy_to_user 回傳

Fk 的字串,如此一來不管
Fk
的資料結構長的如何,client.c 都能使用同一套程式碼輸出,同時也利於分析轉換字串的效能。

以下函式在 fibdrv.c 中,負責將 a 轉為字串複製到 buf 中:

size_t ubig_to_string(char *buf, size_t bufsize, const ubig *a)
{
    int index = BIGNSIZE - 1;
    while (index >= 0 && !a->cell[index])
        index--;
    if (index == -1) {
        buf[0] = '0';
        buf[1] = '\0';
        return (size_t) 1;
    }

    char buf2[22];
    snprintf(buf2, bufsize,"%llu", a->cell[index--]);
    strcat(buf, buf2);
    while (index >= 0) {
        snprintf(buf2, bufsize, "%019llu", a->cell[index--]);
        strcat(buf, buf2);
    }
    return strlen(buf);
}

然後改寫 fib_read

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
#ifdef BIGN
#define BUFFSIZE 500
    char fibbuf[BUFFSIZE] = "";
    ubig fib = fib_sequence_ubig(*offset);
    size_t len = ubig_to_string(fibbuf, BUFFSIZE, &fib);
    len = (size > len) ? (len + 1) : size;
    copy_to_user(buf, fibbuf, len);
    return (ssize_t) len;
#else
    return (ssize_t) fib_sequence(*offset);
#endif
}

最後改寫 client.c 使其直接印出 buf 字串:

#define BIGN
#define BUFFSIZE 500

//...

for (int i = 0; i <= offset; i++) {
    lseek(fd, i, SEEK_SET);
    sz = read(fd, buf, BUFFSIZE);
#ifdef BIGN
    printf("Reading from " FIB_DEV
           " at offset %d, returned the sequence "
           "%s.\n",
           i, buf);
#else
    printf("Reading from " FIB_DEV
           " at offset %d, returned the sequence "
           "%lld.\n",
           i, sz);
#endif
}

//...

此實作最大可以計算

F1093。如果我們可以根據
k
,預先計算
Fk
的十進位至少有多少位數,進而動態調整 BIGNSIZE,就可以計算任意
Fk

截至目前為止的實作在 d0dd8a4

改進 struct BigN 實作

我前面提到 struct BigN 中每個 unsigned long long 達到

1019 就會強迫進位,但是接下來實作 Fast Doubling 時,直接對其左移運算會算出錯誤的值,因為 upperlower 並不是差距
264
,所以把 lower 的位元直接左移到 upper 完全是錯誤的。

經過思考後,我回復一開始的想法,在相加以及左移時,使用 u128 表示正整數

upper264+lower,計算完
Fk
後要基數轉換時,再讓 lower 依據
1019
進位、化為字串。此作法在進行加法運算時省去了判斷 lower 是否大於
1019
並進位,所以比原版實作更有效率。

decimal upper lower
0 0 0
1 0 1
2 0 2
264
- 1
0 0xffff
264
1 0
264+1
1 1
2651
1 0xffff
265
2 0

以下為改進後的加法 (ubig_add) 的實作:

#define BIGNSIZE 12
typedef struct BigN {
    unsigned long long cell[BIGNSIZE];
} ubig;

static inline void init_ubig(ubig *x)
{
    for (int i = 0; i < BIGNSIZE; x->cell[i++] = 0)
        ;
}

static inline void ubig_add(ubig *dest, ubig *a, ubig *b) {
    for (int i = 0; i < BIGNSIZE; i++)
        dest->cell[i] = a->cell[i] + b->cell[i];

    for (int i = 0; i < BIGNSIZE - 1; i++)
        dest->cell[i + 1] += (dest->cell[i] < a->cell[i]);
}

觀摩 arthur-chang 的作法,在轉換為字串的過程,先將字元陣列初始化為 "0",每讀取一個 bit 就讓自己加自己(也就是乘以二),如果當前是 set bit 就加 1,然後進到下一個迭代,計算過程中都是透過字元陣列儲存十進位的資訊。以下舉 1010 為例

step num computation result
0 1010 "0" "0"
1 1010 ("0" + "0") + "1" "1"
2 1010 ("1" + "1") + "0" "2"
3 1010 ("2" + "2") + "1" "5"
4 1010 ("5" + "5") + "0" "10"

由上述演算法就可以用簡單的機制將 ubig 基數轉換,arthur-chang 的實作是 uint32_t 的版本,我實作 unsigned long long 的版本:

int ubig_to_string(char *buf, size_t bufsize, ubig *a)
{
    memset(buf, '0', bufsize);
    buf[bufsize - 1] = '\0';

    int index = BIGNSIZE - 1;
    while (index >= 0 && !a->cell[index])
        index--;
    if (index == -1)
        return bufsize - 2;

    for (int i = index; i >= 0; i--) {
        for (unsigned long long mask = 0x8000000000000000ULL;
             mask; mask >>= 1) {
            int carry = ((a->cell[i] & mask) != 0);
            for (int j = bufsize - 2; j >= 0; j--) {
                buf[j] += buf[j] - '0' + carry;
                carry = (buf[j] > '9');
                if (carry)
                    buf[j] -= 10;
            }
        }
    }
    
    int offset = 0;
    while (buf[offset] == '0')
        offset++;
    return (buf[offset] == '\0') ? (offset - 1) : offset;
}

接下來再套用前述的 function 實作 fibdrv:

static ubig fib_sequence_ubig(long long k)
{
    if (k <= 1LL) {
        ubig ret;
        init_ubig(&ret);
        ret.cell[0] = (unsigned long long) k;
        return ret;
    }

    ubig a, b, c;
    init_ubig(&a);
    init_ubig(&b);
    b.cell[0] = 1ULL;

    for (int i = 2; i <= k; i++) {
        ubig_add(&c, &a, &b);
        a = b;
        b = c;
    }

    return c;
}

#define BUFFSIZE 500
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
#ifdef BIGN
    char fibbuf[BUFFSIZE];
    ubig fib = fib_sequence_ubig(*offset);
    int __offset = ubig_to_string(fibbuf, BUFFSIZE, &fib);
    copy_to_user(buf, fibbuf + __offset, BUFFSIZE - __offset);
    return (ssize_t) BUFFSIZE - __offset;
#else
    return (ssize_t) fib_sequence(*offset);
#endif
}

到目前為止的程式碼 b2749a6

用 Fast Doubling 計算費氏數列

首先實作 struct BigN 的減法(ubig_sub)、左移(ubig_lshift)、直式乘法(ubig_mul):

static inline void ubig_sub(ubig *dest, ubig *a, ubig *b)
{
    for (int i = 0; i < BIGNSIZE; i++)
        dest->cell[i] = a->cell[i] - b->cell[i];

    for (int i = 0; i < BIGNSIZE - 1; i++)
        dest->cell[i + 1] -= (dest->cell[i] > a->cell[i]);
}

static inline void ubig_lshift(ubig *dest, ubig *a, int x)
{
    init_ubig(dest);

    // quotient and remainder of x being divided by 64
    unsigned quotient = x >> 6, remainder = x & 0x3f;
    for (int i = 0; i + quotient < BIGNSIZE; i++)
        dest->cell[i + quotient] |= a->cell[i] << remainder;

    if (remainder)
        for (int i = 1; i + quotient < BIGNSIZE; i++)
            dest->cell[i + quotient] |= a->cell[i - 1] >> (64 - remainder);
}

void ubig_mul(ubig *dest, ubig *a, ubig *b)
{
    init_ubig(dest);
    int index = BIGNSIZE - 1;
    while (index >= 0 && !b->cell[index])
        index--;
    if (index == -1)
        return;

    for (int i = index; i >= 0; i--) {
        int bit_index = (i << 6) + 63;
        for (unsigned long long mask = 0x8000000000000000ULL; mask;
             mask >>= 1) {
            if (b->cell[i] & mask) {
                ubig shift, tmp;
                ubig_lshift(&shift, a, bit_index);
                ubig_add(&tmp, dest, &shift);
                *dest = tmp;
            }
            bit_index--;
        }
    }
}

接下來參考作業說明提供的演算法:

[F(2n+1)F(2n)]=[1110]2n[F(1)F(0)]=[1110]n[1110]n[F(1)F(0)]=[F(n+1)F(n)F(n)F(n1)][F(n+1)F(n)F(n)F(n1)][10]=[F(n+1)2+F(n)2F(n)F(n+1)+F(n1)F(n)]

因此可得:

F(2k)=F(k)[2F(k+1)F(k)]F(2k+1)=F(k+1)2+F(k)2

pseudo code:

Fast_Fib(n)
    a = 0; b = 1;       // m = 0
    for i = (number of binary digit in n) to 1
        t1 = a*(2*b - a);
        t2 = b^2 + a^2;
        a = t1; b = t2; // m *= 2
        if (current binary digit == 1)
            t1 = a + b; // m++
            a = b; b = t1;
    return a;

舉例: 求解

F(10) 1010 = 10102

i start 4 3 2 1 result
n - 1010 1010 1010 1010 -
F(m) F(0) F(0*2+1) F(1*2) F(2*2+1) F(5*2) F(10)
a 0 1 1 5 55 55
b 1 1 2 8 89 -

用 fast doubling 實作 fibdrv:

static ubig fib_sequence_ubig(long long k)
{
    if (k <= 1LL) {
        ubig ret;
        init_ubig(&ret);
        ret.cell[0] = (unsigned long long) k;
        return ret;
    }

    ubig a, b;
    init_ubig(&a);
    init_ubig(&b);
    b.cell[0] = 1ULL;

    for (unsigned long long mask = 0x8000000000000000ULL >> __builtin_clzll(k);
         mask; mask >>= 1) {
        ubig tmp1, tmp2, t1, t2;
        ubig_lshift(&tmp1, &b, 1);   // tmp1 = 2*b
        ubig_sub(&tmp2, &tmp1, &a);  // tmp2 = 2*b - a
        ubig_mul(&t1, &a, &tmp2);    // t1 = a*(2*b - a)

        ubig_mul(&tmp1, &a, &a);      // tmp1 = a^2
        ubig_mul(&tmp2, &b, &b);      // tmp2 = b^2
        ubig_add(&t2, &tmp1, &tmp2);  // t2 = a^2 + b^2

        a = t1, b = t2;
        if (k & mask) {
            ubig_add(&t1, &a, &b);  // t1 = a + b
            a = b, b = t1;
        }
    }

    return a;
}

到目前為止的程式碼 473ff4a

Adding 與 Fast Doubling 的效能分析

isolate 特定 cpu

首先修改 grub 設定檔,強制 isolate cpu 11:

$ sudo vim /etc/default/grub

編輯 GRUB_CMDLINE_LINUX=""

GRUB_CMDLINE_LINUX="isolcpus=0"

更新 grub

$ sudo update-grub

重新開機後,用 taskset 檢查 cpu 11 是否已經被 isolate:

$ taskset -p 1
pid 1's current affinity mask: 7ff

我發現有少數 process 仍然有全部核心的 affinity,例如 rcu_gp

client 程式固定在 cpu 11 執行:

$ sudo insmod fibdrv.ko
$ sudo taskset -c 11 ./client

排除其餘干擾因素

  1. 抑制 address space layout randomization (ASLR)
    ​​​$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
    
  2. 設定 scaling_governor 為 performance:
    ​​​# performance.sh
    ​​​for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
    ​​​do
    ​​​    echo performance > ${i}
    ​​​done
    
    執行腳本
    ​​​$ sudo sh performance.sh
    
  3. 針對 Intel 處理器,關閉 turbo mode:
    ​​​$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
    

在 user space 與 kernel space 計算 fibdrv 執行時間

首先利用 ktime_tktime_get() 來處理計算費氏數列耗費時間,最後透過 fib_read 的回傳值將耗費的時間傳給 user space,具體實作如下:

#include <linux/ktime.h>

// ...

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
#ifdef BIGN
    ktime_t t = ktime_get();
    char fibbuf[BUFFSIZE];
    ubig fib = fib_sequence_ubig(*offset);
    int __offset = ubig_to_string(fibbuf, BUFFSIZE, &fib);
    copy_to_user(buf, fibbuf + __offset, BUFFSIZE - __offset);
    s64 elapsed = ktime_to_ns(ktime_sub(ktime_get(), t));
    return elapsed;
#else
    return (ssize_t) fib_sequence(*offset);
#endif
}

然後新增一個 perf-test.c 用來印出 user space、kernel space、kernel to user 的執行時間。此程式會重複計算

F0
F1000
100 次,並取 10% trimmed mean。

// perf-test.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>

#define FIB_DEV "/dev/fibonacci"

#define BIGN
#define BUFFSIZE 500

int compare(const void *a, const void *b)
{
    long long A = *(long long*) a;
    long long B = *(long long*) b;
    return (A > B) - (A < B);
}

int main()
{
    char buf[BUFFSIZE];
    int offset = 1000;

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    long long utime_arr[1001][100];
    long long ktime_arr[1001][100];
    for (int i = 0; i < 100; i++) {
        for (int j = 0; j <= offset; j++) {
            struct timespec start, end;
            lseek(fd, j, SEEK_SET);
            clock_gettime(CLOCK_REALTIME, &start);
            
            ktime_arr[j][i] = read(fd, buf, BUFFSIZE);
            
            clock_gettime(CLOCK_REALTIME, &end);
            utime_arr[j][i] = (long long)end.tv_nsec - start.tv_nsec;
        }
    }

    // compute 10% trimmed mean
    for (int i = 0; i <= 1000; i++) {
        qsort((void *)&utime_arr[i][0], 100, sizeof(long long), compare);
        qsort((void *)&ktime_arr[i][0], 100, sizeof(long long), compare);
        long long utime_avg = 0, ktime_avg = 0;
        for (int j = 10; j < 90; j++) {
            utime_avg += utime_arr[i][j];
            ktime_avg += ktime_arr[i][j];
        }
        utime_avg /= 80;
        ktime_avg /= 80;
        printf("%d %lld %lld %lld\n", i, ktime_avg, utime_avg, utime_avg - ktime_avg);
    }

    close(fd);
    return 0;
}

接下來輸出測試的結果到 fast_doubling.txt

$ gcc perf-test.c -o a
$ sudo insmod fibdrv.ko
$ sudo taskset -c 11 ./a > fast_doubling.txt
$ sudo rmmod fibdrv.ko || true >/dev/null

輸出的結果如下:

0 232 543 311
1 51551 51835 284
2 52655 52997 342
3 52379 52663 284
4 52758 53041 283
...

使用 gnuplot 畫出折線圖

gnuplot script:

# 1.gp
set title "Fast Doubling Time"
set xlabel "n th sequence of Fibonacci number"
set ylabel "time cost (ns)"
set terminal png font " Times_New_Roman,12 "
set output "time.png"
set xrange [0:1000]
set xtics 0, 100, 1000
set key left

plot \
"fast_doubling.txt" using 1:2 with points title "kernel", \
"fast_doubling.txt" using 1:3 with points title "user", \
"fast_doubling.txt" using 1:4 with points title "kernel to user", \

至此可以畫出 Fast Doubling 的 10% trimmed mean 執行時間:

接下來用同樣的手法畫出 Adding 的 10% trimmed mean 執行時間:

比較 Fast Doubling 和 Adding 的 user space time。

可以發現 Fast Doubling 的效能較 Adding 好。

到目前為止的變更 48956a1

2022/04/02 與老師一對一討論檢討

  • 在計算 10 進位的數值前,先估算有效位數
  • abstraction,將實作抽象化,減少重複複修改程式碼、增加可讀性
  • determinism,在開始計算之前,把記憶體配置好,若記憶體配置失敗就直接回傳失敗訊息,避免浪費時間計算。
  • 降低 kernel -> user 的資料傳輸量

動態配置記憶體需要注意以下議題:

  • 連續的 unsigned long long 採用動態配置時,可能會失敗
  • 乘法運算: m 位數 * m 位數 => 會得到 2m 位數 => 需要事先配置記憶體 (無法 in-place)
  • 二進位轉換成 10 進位時,也需要動態配置記憶體
  • kernel to user 資料傳遞的過程中,可能會遇到 client (user) 事先配置的記憶體不足

將實作抽象化

我將大數加法、乘法、初始化等函式統一定義為:

// ubig_xxx.h
ubig *new_ubig(int size);
void destroy_ubig(ubig *ptr);
void zero_ubig(ubig *x);

void ubig_assign(ubig *dest, ubig *src);
void ubig_add(ubig *dest, ubig *a, ubig *b);
void ubig_sub(ubig *dest, ubig *a, ubig *b);
void ubig_lshift(ubig *dest, ubig *a, int x);
void ubig_mul(ubig *dest, ubig *a, ubig *b, ubig *shift_buf, ubig *add_buf);
int ubig_to_string(char *buf, size_t bufsize, ubig *a);

ubig fib_sequence(long long k);

並將不同版本的實作寫在 ubig_1.hubig_2.h,在 fibdrv.c 藉由切換 #include "ubig_xxx.h" 來切換為不同實作。

預先估算
Fn
的有效位數

根據 wikipedia,費波納西數列的前後項比值會趨近於黃金比例,也就是

Fn1.61803...Fn+1

並且,

1log10 1.618044.78518 也就是說當數列的前後項比值接近 1.61804 時,在十進位表示時,每增加四到五項必定會進一位。

列舉費波納西數列,觀察在第幾項之後

FkFk1 會小於
1.61804

k
Fk
Fk/Fk1
13 233 1.6180555556
14 377 1.6180257511
15 610 1.6180371353
16 987 1.6180327869
17 1597 1.6180344478

從列舉看到當

k>=14 時,
FkFk1
會收斂到小於 1.61804,所以當
k>=14
時可以藉由
log10 233+(k13)log10 1.61804+1
估算第
Fk
的十進位位數。此式可被化簡為
k0.20899+0.6505

假設 ubig 使用了

Nunsigned long long,計算
log2 264Nlog2 10+1
得到 ubig 可以表示的最大位數,將常數計算完後化簡為
N19.26593+1

只要使

N19.2661+1>=k0.20899+0.6505 即可根據
k
的值動態調整
N
的值。以下便是估計所需 unsigned long long 數量的函式(因為 kernel module 不允許使用浮點數,所以將預先計算的常數同乘以 100000 後,用整數運算):

static inline int estimate_size(long long k) {
    if (k <= 93)
        return 1;
    unsigned long long n = (k * 20899 - 34950) / 1926610;
    return (int)n + 1;
}

接下來改寫 ubig 結構,使其可以根據需求調整 unsigned long long 陣列長度。

typedef struct BigN {
    int size;
    unsigned long long *cell;
} ubig;

利用 kmallockfree 分配記憶體給 ubig

ubig *new_ubig(int size)
{
    ubig *ptr = kmalloc(sizeof(ubig), GFP_KERNEL);
    if (!ptr)
        return NULL;

    unsigned long long *ullptr =
        kmalloc(size * sizeof(unsigned long long), GFP_KERNEL);
    if (!ullptr) {
        kfree(ptr);
        return NULL;
    }
    memset(ullptr, 0, size * sizeof(unsigned long long));

    ptr->size = size;
    ptr->cell = ullptr;
    return ptr;
}

static inline void destroy_ubig(ubig *ptr)
{
    if (ptr) {
        if (ptr->cell)
            kfree(ptr->cell);
        kfree(ptr);
    }
}

Deterministic 記憶體管理

在開始計算

Fk 之前,先分配需要動態配置的物件,如果配置失敗就馬上返回。所以 fast doubling 的實作變為:

static ubig* fib_sequence(long long k) { if (k <= 1LL) { ubig *ret = new_ubig(1); if (!ret) return NULL; ret->cell[0] = (unsigned long long) k; return ret; } int sz = estimate_size(k); ubig *a = new_ubig(sz); ubig *b = new_ubig(sz); ubig *tmp1 = new_ubig(sz); ubig *tmp2 = new_ubig(sz); ubig *t1 = new_ubig(sz); ubig *t2 = new_ubig(sz); ubig *mul_buf1 = new_ubig(sz); ubig *mul_buf2 = new_ubig(sz); if (!a || !b || !tmp1 || !tmp2 || !t1 || !t2 || !mul_buf1 || !mul_buf2) { destroy_ubig(a); destroy_ubig(b); destroy_ubig(tmp1); destroy_ubig(tmp2); destroy_ubig(t1); destroy_ubig(t2); destroy_ubig(mul_buf1); destroy_ubig(mul_buf2); return NULL; } b->cell[0] = 1ULL;

在第 11 到 19 行先 kmalloc 配置所有 ubig,在第 20 行確認所需的 ubig 是否全都配置完畢,若否就將所有 ubig 釋放然後回傳 NULL。接下來就只會使用這些配置好的 ubig 運算,不再呼叫 kmalloc

for (unsigned long long mask = 0x8000000000000000ULL >> __builtin_clzll(k); mask; mask >>= 1) { ubig_lshift(tmp1, b, 1); // tmp1 = 2*b ubig_sub(tmp2, tmp1, a); // tmp2 = 2*b - a ubig_mul(t1, a, tmp2, mul_buf1, mul_buf2); // t1 = a*(2*b - a) ubig_mul(tmp1, a, a, mul_buf1, mul_buf2); // tmp1 = a^2 ubig_mul(tmp2, b, b, mul_buf1, mul_buf2); // tmp2 = b^2 ubig_add(t2, tmp1, tmp2); // t2 = a^2 + b^2 ubig_assign(a, t1); ubig_assign(b, t2); if (k & mask) { ubig_add(t1, a, b); // t1 = a + b ubig_assign(a, b); ubig_assign(b, t1); } } destroy_ubig(b); destroy_ubig(tmp1); destroy_ubig(tmp2); destroy_ubig(t1); destroy_ubig(t2); destroy_ubig(mul_buf1); destroy_ubig(mul_buf2); return a; }

使用 kmalloc 動態配置記憶體後,照理來說最大可以計算 kmalloc 配置的長度的極限(128KB),即 2048 個 unsigned long long。可以表達的最大的十進位位數為

204819.26593+1=39457

十進位位數小於等於 39457 的費氏數列項次為

k0.20899+0.650539457k188795,因此目前的實作最大可以計算
F188795

到目前為止的實作在 0ee4f89ubig_2.h

改善 kernel to user 資料傳遞的過程

  • 首先讓 copy_to_user 回傳 ubig 的原始資料,在資料回傳至 user space 後,再呼叫 ubig_to_string 基數轉換,藉此減少 kernel to user 複製資料的成本。
  • 在計算費費氏數列之前也必須事先判斷 user 給的 buffer 夠不夠大,若 buffer 不夠大,就應該直接回傳失敗訊息,避免浪費時間計算。

為了達成上述兩項要求,我首先在 fib_sequence 函式增加一個參數 user_size 來接收 user buffer 的大小,在透過 estimate_size 計算完需要多少 unsigned long long 來儲存數值後,再檢查 buffer 長度夠不夠儲存這麼長的 unsigned long long 陣列:

static ubig *fib_sequence(long long k, size_t user_size)
{
    if (k <= 1LL) {
        /* Check if buffer has enough size */
        if (user_size < sizeof(unsigned long long))
            return NULL;
        
        ubig *ret = new_ubig(1);
        if (!ret)
            return NULL;
        
        ret->cell[0] = (unsigned long long) k;
        return ret;
    }

    int sz = estimate_size(k);
    /* Check if buffer has enough size */
    if (user_size < sz * sizeof(unsigned long long))
        return NULL;
    
    // remaining code ...
}

計算完

Fk 之後,就讓 copy_to_userubigunsigned long long 陣列複製進 user buffer,然後回傳 unsigned long long 陣列長度(每 8 byte 一個單位):

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    ubig *fib = fib_sequence(*offset, size);
    if (!fib)
        return -1;

    int ret_size = fib->size;
    copy_to_user(buf, fib->cell, fib->size * sizeof(unsigned long long));
    destroy_ubig(fib);

    return ret_size;
}

到目前為止的實作在 8e0021f

Kernel Module vs Userspace App

近日面試被問到假設 fibdrv 在 userspace 實現,會與 kernel module 有什麼差異,參照 Differences Between Kernel Modules and User Programs,其中提到 Kernel modules have higher execution privilege,故 kernel module 本身被分配到 cpu cycle 執行的機會就比 userspace application 高,所以比較快執行完畢。

以下是我將 fast doubling 做成 user space 的 function 來與 fibdrv 進行執行時間的比較(只計算

Fk,不包含基數轉換):

可以從圖中看出 kernel module 確實比 userspace application 快。

接下來測試計算費氏數列加上基數轉換所耗費的時間,發現基數轉換的效率非常差,是接下還需要改善的部份。

04/16 到目前為止可以改善的問題

TODO: 引入 Schönhage–Strassen algorithm 來加速乘法運算,注意 threshold 的設定

bignum

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

引入 Schönhage–Strassen Algorithm

Schönhage–Strassen 的概念是將

x
y
n
個位數為單位,將大數拆成若干個較小的數
x0
x1
y0
y1
,對
(x0,x1,...)
(y0,y1,...)
線性卷積,執行完線性卷積後將所有元素依據自己的位數左移並相加就可以完成乘法運算。舉例:

x=26699,y=188

x
y
2 個位數為界分割會得到以下兩個數列:

(2,66,99), (1,88)

對數列線性卷積:

         2    66    99
x              1    88
-----------------------
       176  5808  8712
   2    66    99
-----------------------
   2   242  5907  8712

(2,242,5907,8712)
(2,66,99)
(1,88)
的線性卷積,接下來透過左移和加法將線性卷積的結果相加即可完成大數乘法:

        8712
      5907
     242
+    2
--------------
     5019412

ubig 的乘法實作中,我打算以 32 bit 為界切割數值,並透過 64 bit 的乘法來計算線性卷積。為了方便上述運算,我將 ubig 儲存數值的資料型態由 unsigned long long 改為 unsigned int

typedef struct BigN {
    int size;
    unsigned int *cell;
} ubig;

我稍微調整乘法計算方式,按照下圖 col1col5 順序計算卷積、處理進位,然後直接存到 dest 中:

// find the array index of MSB of a
// TODO: use binary search
static inline int ubig_msb_idx(ubig *a)
{
    int msb_i = a->size - 1;
    while (msb_i >= 0 && !a->cell[msb_i])
        msb_i--;
    return msb_i;
}

void ubig_mul(ubig *dest, ubig *a, ubig *b)
{
    zero_ubig(dest);

    // find the array index of the MSB of a, b
    int msb_a = ubig_msb_idx(a);
    int msb_b = ubig_msb_idx(b);

    // a == 0 or b == 0 then dest = 0
    if (msb_a < 0 || msb_b < 0)
        return;

    // calculate the length of vector 
    // after doing linear convolution
    // i.e. column number
    int length = msb_a + msb_b + 1;

    /* do linear convolution */
    unsigned long long carry = 0;
    for (int i = 0; i < length; i++) {
        unsigned long long col_sum = carry;
        carry = 0;

        int end = (i <= msb_b) ? i : msb_b;
        int start = i - end;
        for (int j = start, k = end; j <= end; j++, k--) {
            unsigned long long product =
                (unsigned long long) a->cell[k] * b->cell[j];
            col_sum += product;
            carry += (col_sum < product);
        }
        dest->cell[i] = (unsigned int) col_sum;
        carry = (carry << 32) + (col_sum >> 32);
    }

    dest->cell[length] = carry;
}

完整實作在 b21a89aubig_schonhange_strassen.h

參考 bignum 引入 Karatsuba Algorithm

Karatsuba 的概念是將

x
y
以第
n
位數為界,拆成兩半
x1
x0
y1
y0
,把這他們視為較小的數相乘,然後再透過左移補回
x1
y1
損失的位數,以十進位為例:

x=x110n+x0y=y110n+y0

xy 可以化為:

x1y1z2102n+(x1y0+y1x0)z110n+x0y0z0

上述算法計算

z2
z1
z0
需要四次乘法,我們還可以透過以下技巧優化成三次乘法:

觀察

(x1+x0)(y1+y0) 展開的結果

(x1+x0)(y1+y0)=x1y1z2+x1y0+x0y1z1+x0y0z0

移項之後,我們就能利用

(x1+x0)(y1+y0)
z0
z2
來計算
z1

z2=x1y1
z0=x0y0
z1=(x1+x0)(y1+y0)z0z2

最後計算

z2102n+z110n+z0 便能得到
x
y
相乘的結果。

以上便是 Karatsuba 演算法。


接下來我以兩個 8 bit 數值相乘變成 16 bit 來演示 Karatsuba(假 設所採用的處理器只支援 8 bit 乘法):

x=010010012=7310, y=100000112=13110

x=x1104+x0=0100104+1001
y=y1104+y0=1000104+0011

z2=x1y1=01001000=00100000
z0=x0y0=10010011=00011011
z1=(x1+x0)(y1+y0)z0z2=(0100+1001)(1000+0011)0010000000011011=100011110010000000011011=01010100

xy=z2108+z1104+z0=0010010101011011=956310

其中

10n 可以用左移運算代替。


接續上一個例子,當

x
y
超過 8 bit 時,可以透過分治法實作 Karatsuba。
x1
x0
y1
y0
的位元數超出處理器的乘法的位數時,就把他們再切為
x11
x10
x01
x00
,再使用 Karatsuba 計算。以下以兩個 16 bit 數值相乘變成 32 bit 來演示(假設所採用的處理器只支援 8 bit 乘法):

由上圖可以看出計算

z2
z1
z0
時,透過分治法將
x1
x0
y1
y0
切成更小的數字執行乘法運算。最後再用左移與加法計算
z21016+z1108+z0
即可求得結果。

至此我可以透過分治法使用 Karatsuba 計算任意位數的大數。


我目前的實作並未考慮 ubig 可能需要 resize,所以需要大幅修改 ubig 的賦值、加法、左移等基本操作,才好實作 Karatsuba。

另一種解決辦法是將所有 unsigned int 陣列大小都統一為 128 KiB,這樣就不用在分治的時候處理不同長度的 unsigned int 陣列,但是計算較小的數會浪費記憶體。

此外,計算

(x1+x0)
(y1+y0)
(x1+x0)(y1+y0)
需要暫存的 ubig 物件,所以此實作是 not-in-place。以下為基於遞迴的 Karatsuba 的 ubig_mul 實作:

// modified addition for karatsuba
static inline void ubig_add_in_place(ubig *dest, ubig *src, int front, int end)
{
    int carry = 0, i = 0;
    for (int j = front; j < end; i++, j++) {
        unsigned int tmp = dest->cell[i] + src->cell[j] + carry;
        carry = (tmp < dest->cell[i]);
        dest->cell[i] = tmp;
    }

    for (; i < dest->size; i++) {
        unsigned int tmp = dest->cell[i] + carry;
        carry = (tmp < dest->cell[i]);
        dest->cell[i] = tmp;
    }
}

// modified addition for karatsuba
static inline void ubig_add_in_place2(ubig *dest, int offset, ubig *src)
{
    int carry = 0, i = offset;
    for (int j = 0; j < src->size; i++, j++) {
        unsigned int tmp = dest->cell[i] + src->cell[j] + carry;
        carry = (tmp < dest->cell[i]);
        dest->cell[i] = tmp;
    }

    for (; i < dest->size; i++) {
        unsigned int tmp = dest->cell[i] + carry;
        carry = (tmp < dest->cell[i]);
        dest->cell[i] = tmp;
    }
}

// modified substraction for karatsuba
static inline void ubig_sub_in_place(ubig *dest, ubig *src, int front, int end)
{
    int carry = 0, i = 0;
    for (int j = front; j < src->size && j < end; i++, j++) {
        unsigned int tmp = dest->cell[i] - src->cell[j] - carry;
        carry = (tmp > dest->cell[i]);
        dest->cell[i] = tmp;
    }

    for (; i < dest->size; i++) {
        unsigned int tmp = dest->cell[i] - carry;
        carry = (tmp > dest->cell[i]);
        dest->cell[i] = tmp;
    }
}

static inline int ubig_msb_idx(ubig *a)
{
    int msb_i = a->size - 1;
    while (msb_i >= 0 && !a->cell[msb_i])
        msb_i--;
    return msb_i;
}

// do Karatsuba recursively
int mul_recursive(ubig *dest, ubig *x, ubig *y, int front, int end)
{
    // termination 32 bit x 32 bit case
    int size = end - front;
    if (size == 1) {
        unsigned long long product =
            (unsigned long long) x->cell[front] * y->cell[front];
        if (front * 2 < dest->size) {
            dest->cell[front * 2] = product;
            if (front * 2 + 1 < dest->size)
                dest->cell[front * 2 + 1] = product >> 32;
        }
        return 1;
    }

    // dest = z2 * 2^(middle * 2) + z0 * 2^(front * 2)
    int half_size = size / 2;
    int middle = front + half_size;
    int ret;
    ret = mul_recursive(dest, x, y, middle, end);
    if (!ret)
        return 0;
    ret = mul_recursive(dest, x, y, front, middle);
    if (!ret)
        return 0;

    // tmp1 = (x0 + x1) and tmp2 = (y0 + y1)
    ubig *tmp1 = new_ubig(half_size + 2);
    ubig *tmp2 = new_ubig(half_size + 2);
    ubig *z1 = new_ubig((half_size + 2) * 2);
    if (!tmp1 || !tmp2 || !z1) {
        destroy_ubig(tmp1);
        destroy_ubig(tmp2);
        destroy_ubig(z1);
        return 0;
    }
    memcpy(tmp1->cell, x->cell + middle, (end - middle) * sizeof(unsigned int));
    memcpy(tmp2->cell, y->cell + middle, (end - middle) * sizeof(unsigned int));
    ubig_add_in_place(tmp1, x, front, middle);  // x0 + x1
    ubig_add_in_place(tmp2, y, front, middle);  // y0 + y1

    // z1 = (tmp1 * tmp2) - z0 - z2
    int sz_1 = ubig_msb_idx(tmp1) + 1;
    int sz_2 = ubig_msb_idx(tmp2) + 1;
    int common_sz = sz_1 > sz_2 ? sz_1 : sz_2;
    ret = mul_recursive(z1, tmp1, tmp2, 0, common_sz);
    destroy_ubig(tmp1);
    destroy_ubig(tmp2);
    if (!ret) {
        destroy_ubig(z1);
        return 0;
    }
    ubig_sub_in_place(z1, dest, front * 2, front * 2 + half_size * 2);
    ubig_sub_in_place(z1, dest, front * 2 + half_size * 2,
                      front * 2 + size * 2);

    // dest = dest + z1 * 2^(front + middle)
    ubig_add_in_place2(dest, front * 2 + half_size, z1);
    destroy_ubig(z1);
    return 1;
}

// multiplication entry point
int ubig_mul(ubig *dest, ubig *a, ubig *b)
{
    zero_ubig(dest);

    // don't use karatsuba if dest only has 32 bit
    if (dest->size == 1) {
        dest->cell[0] = a->cell[0] * b->cell[0];
        return 1;
    }

    // find how many unsigned int are actually
    // storing data
    int sz_a = ubig_msb_idx(a) + 1;
    int sz_b = ubig_msb_idx(b) + 1;

    // if a == 0 or b == 0 then dest = 0
    if (sz_a == 0 || sz_b == 0)
        return 1;

    int common_sz = sz_a > sz_b ? sz_a : sz_b;
    return mul_recursive(dest, a, b, 0, common_sz);
}

完整實作在 9673a70ubig_karatsuba.h

比較加法、長乘法、Karatsuba 與 Schönhage–Strassen

下圖是計算

F0
F50
F100
F50000
的執行時間(不包含基數轉換)。用加法計算
F50000
需要約 200 毫秒,用長乘法則需要更長時間,因不明原因,在測量長乘法計算
F37000
以後的時間失準。

單獨看 Karatsuba 與 Schönhage–Strassen,前者計算

F50000 時需要消耗 45 毫秒;後者則需要消耗 1.8 毫秒左右。

接下來看計算較小數字時各演算法消耗的時間:

從圖中可以發現,計算

F0
F45
時用 Adding 最快、計算
F45
F95
用 Karatsuba 與 Schönhage–Strassen 差不多快、計算之後的數字則是 Schönhage–Strassen 最快。

針對

FN 計算三種演算法的時間複雜度:

  • Adding: 做
    N
    次加法、其中每次加法要 unsigned int 陣列的長度次迭代,也就是
    log232FN
    次,所以複雜度為
    O(Nlog232FN)
  • 長乘法 Fast-Doubling:
    log2N
    次迭代,每次迭代中的長乘法要
    O(log2FNlog232FN)
    ,所以複雜度為
    O(log2Nlog2FNlog232FN)
  • Schönhage–Strassen Fast-Doubling:
    log2N
    次迭代,每次迭代中的乘法要
    O(1+2+...+log232FN+log232FN1+...+2+1)=O(log232FNlog232FN)
    ,複雜度為
    O(log2Nlog232FNlog232FN)