2022q1 Homework3 (fibdrv)

contributed by < leewei05 >

作業說明

環境設定

安裝相關套件

$ uname -r
5.13.0-30-generic

$ sudo apt install linux-headers-`uname -r`
[sudo] password for lee: 
Reading package lists... Done
Building dependency tree       
Reading state information... Done
linux-headers-5.13.0-30-generic is already the newest version (5.13.0-30.33~20.04.1).
linux-headers-5.13.0-30-generic set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.

$ dpkg -L linux-headers-5.13.0-30-generic | grep "lib/modules"
/lib/modules
/lib/modules/5.13.0-30-generic
/lib/modules/5.13.0-30-generic/build

$ whoami
lee

$ sudo whoami
root

$ sudo apt install -y util-linux strace gnuplot-nox

$ cd fibdrv
$ make check
 Passed [-]
f(93) fail
input: 7540113804746346429
expected: 12200160415121876738

$ sudo insmod fibdrv.ko
$ ls -l /dev/fibonacci 
crw------- 1 root root 511, 0 Mar  6 21:37 /dev/fibonacci

$ cat /sys/class/fibonacci/fibonacci/dev 
511:0

$ cat /sys/module/fibdrv/version 
0.1

$ lsmod | grep fibdrv
fibdrv                 16384  0

$ cat /sys/module/fibdrv/refcnt
0

排除效能分析干擾

限定 CPU 給特定的程式使用

查看電腦 CPU 核心數並保留第 12 個核心 isolcpus=11。更改完設定、更新 grub 後重新開機。之後即可以透過 taskset -c 11 ./executable 來跑效能。

$ taskset -cp 1       
pid 1's current affinity list: 0-11

$ sudo vim /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=3"

$ sudo update-grub
Sourcing file `/etc/default/grub'
Sourcing file `/etc/default/grub.d/init-select.cfg'
Generating grub configuration file ...
Found linux image: /boot/vmlinuz-5.13.0-30-generic
Found initrd image: /boot/initrd.img-5.13.0-30-generic
Found linux image: /boot/vmlinuz-5.13.0-28-generic
Found initrd image: /boot/initrd.img-5.13.0-28-generic
Found memtest86+ image: /boot/memtest86+.elf
Found memtest86+ image: /boot/memtest86+.bin
Found Windows 10 on /dev/sda1
done

// After restart
$ taskset -cp 1       
pid 1's current affinity list: 0-10

抑制 address space layout randomization (ASLR)

// default
$ cat /proc/sys/kernel/randomize_va_space
2

$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"

設定 scaling_governor 為 performance

只針對 cpu3 做設定

ll /sys/devices/system/cpu
total 0
drwxr-xr-x 9 root root    0 Mar  7 21:22 cpu0
drwxr-xr-x 9 root root    0 Mar  7 21:22 cpu1
drwxr-xr-x 9 root root    0 Mar  7 21:22 cpu2
drwxr-xr-x 9 root root    0 Mar  7 21:22 cpu3
drwxr-xr-x 7 root root    0 Mar  7 21:22 cpufreq
drwxr-xr-x 2 root root    0 Mar  7 21:22 cpuidle
drwxr-xr-x 2 root root    0 Mar  7 21:34 hotplug
drwxr-xr-x 2 root root    0 Mar  7 21:22 intel_pstate

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor 
ondemand

$ echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor

針對 Intel 處理器，關閉 turbo mode

$ cat /sys/devices/system/cpu/intel_pstate/no_turbo 
0
 
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"

參考 KYG-yaya573142 同學的作法，把上述的命令整合成單一 script。

Gnuplot 製作

Kernel 模組執行時間

為了能夠透過 gnuplot 圖表分析效能，需要先把核心計算、傳送至 userspace 的時間寫入一個檔案。自我檢查列表有提到 printk - print a kernel message，雖然可能會有效能上的問題，但尚未了解 sysfs 以前先透過 printk 來印出計算各個 fib_sequence() 結果的時間。閱讀 printk 以及 The high-resolution timer API 撰寫以下的程式：

static ktime_t start_time, end_time, process_time;

static long long fib_time_proxy(long long k)
{
    start_time = ktime_get();
    long long result = fib_sequence(k);
    end_time = ktime_get();
    process_time = ktime_sub(end_time, start_time);
    printk("fib(%llu)'s result is %llu, process time: %llu", k, result, ktime_to_ns(process_time));

    return result;
}

/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    return (ssize_t) fib_time_proxy(*offset);
}

/* write operation is skipped */
static ssize_t fib_write(struct file *file,
                         const char *buf,
                         size_t size,
                         loff_t *offset)
{
    return ktime_to_ns(process_time);
}

一開始 ./client 的輸出都沒有印時間，但過了一陣子才想到 printk 是印 Linux 核心的訊息，不會直接在 userspace 輸出。接著，我執行 dmesg 命令，查看核心的輸出，果不其然有上述 prink 的結果：

[ 7935.781317] fib(0)'s result is 0, process time: 45
...
[ 7935.781779] fib(3)'s result is 2, process time: 42
[ 7935.781781] fib(2)'s result is 1, process time: 42
[ 7935.781783] fib(1)'s result is 1, process time: 40

原先 printk 的方式不利於取得函式的回傳時間，故改寫 fib_write 回傳 fib_sequence() 的執行時間。

/* write operation is skipped */
static ssize_t fib_write(struct file *file,
                         const char *buf,
                         size_t size,
                         loff_t *offset)
{
    start_time = ktime_get();
    fib_sequence(*offset);
    end_time = ktime_get();
    process_time = ktime_sub(end_time, start_time);
    return (ssize_t) ktime_to_ns(process_time);
}

Copy to user 執行時間

為了要傳遞 Kernel 的計算結果至 user space，我們可以使用 copy_to_user 函式。

unsigned long copy_to_user(void __user * to, const void * from, unsigned long n);

// example for using copy_to_user
if( copy_to_user(user_buffer, g_s_Hello_World_string + *position, count) != 0 )
        return -EFAULT;

首先，需要先配置 Kernel space 的記憶體。因為我們要回傳的 fibonacci 數列結果很長，所以選擇用 vmalloc 來配置虛擬的連續記憶體空間。實作以下 k_to_u 函式:

static long long k_to_u(char *buf, long long val, unsigned long len)
{
    // How many pages do I need to allocate?
    char *ker = vmalloc(16);
    if (!ker) {
        printk(KERN_ALERT "no memory allocated in kernel");
        return -ENOMEM;
    }

    snprintf(ker, len, "%lu", (unsigned long) val);

    if (copy_to_user(buf, ker, len) != 0)
        return -EFAULT;

    vfree(ker);
    return 0;
}

並且額外實作一個 fib_basic 封裝相關參數：

static ssize_t fib_basic(long long k, char *buf, size_t size)
{
    /* FIXME: C99 variable-length array (VLA) is not allowed in Linux kernel. */
    long long f[k + 2];

    f[0] = 0;
    f[1] = 1;

    for (int i = 2; i <= k; i++) {
        f[i] = f[i - 1] + f[i - 2];
    }

    if (k_to_u(buf, f[k], size) != 0)
        printk("copy from kernel to user failed");

    return f[k];
}

Userspace 的執行時間

參考 KYG-yaya573142 同學的做法，利用 clock_gettime 來取得 userspace 的執行時間。並且為了更方便產生 gnuplot 的圖，

// client.c
#include <time.h>
#include <unistd.h>

#define FIB_DEV "/dev/fibonacci"

int main()
{
    FILE *fp = fopen("./plot", "w");
    char buf[1];
    int offset = 100; /* TODO: try test something bigger than the limit */

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }


    for (int i = 0; i <= offset; i++) {
        struct timespec start_ts, end_ts;
        long long kernel_ts;
        lseek(fd, i, SEEK_SET);

        clock_gettime(CLOCK_MONOTONIC, &start_ts);
        kernel_ts = write(fd, buf, 1);
        clock_gettime(CLOCK_MONOTONIC, &end_ts);
        long long user_ts = (long long) (end_ts.tv_nsec - start_ts.tv_nsec);
        printf("%d %lld %lld %lld\n", i, kernel_ts, user_ts,
               user_ts - kernel_ts);
        fprintf(fp, "%d %lld %lld %lld\n", i, kernel_ts, user_ts,
                user_ts - kernel_ts);
    }

    close(fd);
    fclose(fp);
    return 0;
}

Gnuplot script

參考 gnuplot 語法解說和示範，撰寫以下 script：

# ./scripts/plot.gp
reset
set output 'f.png'
set xlabel 'fib(n)'
set ylabel 'time(ns)'
set title 'fibdrv performance'
set term png enhanced font 'Verdana,10'

plot [0:100][0:10000] \
'plot' using 1:2 with linespoints linewidth 2 title "kernel space time", \
'plot' using 1:3 with linespoints linewidth 2 title "user space time", \
'plot' using 1:4 with linespoints linewidth 2 title "kernel to user space time", \

$ sudo gnuplot ./scripts/plot.gp

執行結果如下：

透過排除效能分析干擾的技巧獲得以下結果

費氏數列

首先修改 fib_basic 的 VLA warning。理論上我們只需要知道第 N 項的費氏數列，所以不需要定義 k+2 長度的陣列。初步的想法為定義一個長度為 3 的陣列：

// 計算完 f[2] 之後即可以捨棄 f[0] 並把 f[1] 的值填入至 f[0]，f[2] 的值填入至 f[1]
f[2] = f[0] + f[1];
swap(f[0],f[1])
f[2] = f[1]

實作程式碼

static ssize_t fib_basic(long long k, char *buf, size_t size)
{
    long long f[3];

    f[0] = 0;
    f[1] = 1;

    for (int i = 2; i <= k; i++) {
        f[2] = f[0] + f[1];
        xor_swap(&f[0], &f[1]);
        f[1] = f[2];
    }

    if (k_to_u(buf, f[2], size) != 0)
        printk("copy from kernel to user failed");

    printk("%lld: %lld", k, f[2]);
    return f[2];
}

Fast Doubling

閱讀給定的 Fast doubling 網誌並實作以下程式碼。實作想法為小於 2 的輸入直接回傳值，而大於 2 的 Fibonacci number 則是根據輸入的數值為偶數還是奇數做對應的公式計算。

static ssize_t fib_fd(long long n)
{
    if (n == 0)
        return 0;

    if (n <= 2)
        return 1;

    // odd: F(n) = F(2k+1) = F(k+1)^2 + F(k)^2
    if (n & 0x01) {
        unsigned int k = (n - 1) / 2;
        return fib_fd(k + 1) * fib_fd(k + 1) + fib_fd(k) * fib_fd(k);
    } else {
        // even: F(n) = F(2k) = F(k) * [ 2 * F(k+1) – F(k) ]
        unsigned int k = n / 2;
        return fib_fd(k) * ((2 * fib_fd(k + 1)) - fib_fd(k));
    }
}

static ssize_t fib_fast_doubling(long long k, char *buf, size_t size)
{
    ssize_t result = fib_fd(k);

    if (k_to_u(buf, result, size) != 0)
        printk("copy from kernel to user failed");

    printk("%lld: %lld", k, (long long) result);
    return result;
}

透過 clz 硬體指令加速

Small/Short String Optmization

Small String Optmization(SSO) 是一個處理字串的最佳化技巧。SSO 所解決的問題是可以減少 small-sized string 在建立時所動態配置的記憶體。透過 SSO，當應用程式建立的字串小於特定長度時，我們不會配置新的 heap memory 來存字串，而是直接使用 stack memory(buffer) 存字串。這個方法可以大量減少動態配置記憶體的成本並提昇執行時期的效能。

以下是為 gcc 的字串結構：

Small String Optmization 實作細節

分為 small string(0~15 bytes) 以及 large string(15~x bytes 需要動態配置 heap memory)。small string 的最後一個 byte 仿照 fbstring 的實作方式，存剩餘的空間以及 flag bit。

為了實現 f(500) 的運算，至少需要支援 107 個 bytes 包含 null terminator。為了方便計算直接設定字串長度上限為 256 個 bytes:

$ echo "139423224561697880139724382870407283950070256587697307264108962948325571622863290691557658876222521294125" | wc -c
106

雜

空字串佔據 1 個位元組的空間，因為需要存 \0 NULL terminator。
copy_on_write: 當複製一個字串時直接建立一個指向原本字串的指標，並且 refcount + 1，表示有多少指標正指向此字串。
fbstring 的 small string 最多可以容納 23 bytes 的字串。最後一個 byte 顯示此 23 bytes 還剩餘的空間，當此字串已經塞滿 23 個 bytes 時，最後一個 byte 即為 0 且可以當作 NULL terminator 使用。
Pointer 8 bytes, size 8 bytes(實際字串的長度), capacity 8 bytes(可容納字串的長度)
flag bit 判斷是否為 small string 還是 large string
gcc string(version >= 5)
- 有 SSO 但只有 15 個 bytes capacity
- Size 32 bytes, is power of 2
Memory layout is important

Linux Virtual File System

Linux Kernel 在 fs.h 定義了 file_operations 介面(interface)。可以透過註冊不同的檔案來取得不同 Fibonacci 求值的實作。

參考 hsuedw 同學的實作，首先在 client.c 定義檔案名稱：

// 原有的檔案
#define FIB_DEV "/dev/fibonacci"
#define FIB_DEV_INPUT "/dev/fibonacci/input"
#define FIB_DEV_OUTPUT "/dev/fibonacci/output"
#define FIB_DEV_TIME "/dev/fibonacci/time"