contributed by < leewei05
>
$ uname -r
5.13.0-30-generic
$ sudo apt install linux-headers-`uname -r`
[sudo] password for lee:
Reading package lists... Done
Building dependency tree
Reading state information... Done
linux-headers-5.13.0-30-generic is already the newest version (5.13.0-30.33~20.04.1).
linux-headers-5.13.0-30-generic set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
$ dpkg -L linux-headers-5.13.0-30-generic | grep "lib/modules"
/lib/modules
/lib/modules/5.13.0-30-generic
/lib/modules/5.13.0-30-generic/build
$ whoami
lee
$ sudo whoami
root
$ sudo apt install -y util-linux strace gnuplot-nox
$ cd fibdrv
$ make check
Passed [-]
f(93) fail
input: 7540113804746346429
expected: 12200160415121876738
$ sudo insmod fibdrv.ko
$ ls -l /dev/fibonacci
crw------- 1 root root 511, 0 Mar 6 21:37 /dev/fibonacci
$ cat /sys/class/fibonacci/fibonacci/dev
511:0
$ cat /sys/module/fibdrv/version
0.1
$ lsmod | grep fibdrv
fibdrv 16384 0
$ cat /sys/module/fibdrv/refcnt
0
查看電腦 CPU 核心數並保留第 12 個核心 isolcpus=11
。更改完設定、更新 grub
後重新開機。之後即可以透過 taskset -c 11 ./executable
來跑效能。
$ taskset -cp 1
pid 1's current affinity list: 0-11
$ sudo vim /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=3"
$ sudo update-grub
Sourcing file `/etc/default/grub'
Sourcing file `/etc/default/grub.d/init-select.cfg'
Generating grub configuration file ...
Found linux image: /boot/vmlinuz-5.13.0-30-generic
Found initrd image: /boot/initrd.img-5.13.0-30-generic
Found linux image: /boot/vmlinuz-5.13.0-28-generic
Found initrd image: /boot/initrd.img-5.13.0-28-generic
Found memtest86+ image: /boot/memtest86+.elf
Found memtest86+ image: /boot/memtest86+.bin
Found Windows 10 on /dev/sda1
done
// After restart
$ taskset -cp 1
pid 1's current affinity list: 0-10
// default
$ cat /proc/sys/kernel/randomize_va_space
2
$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
只針對 cpu3 做設定
ll /sys/devices/system/cpu
total 0
drwxr-xr-x 9 root root 0 Mar 7 21:22 cpu0
drwxr-xr-x 9 root root 0 Mar 7 21:22 cpu1
drwxr-xr-x 9 root root 0 Mar 7 21:22 cpu2
drwxr-xr-x 9 root root 0 Mar 7 21:22 cpu3
drwxr-xr-x 7 root root 0 Mar 7 21:22 cpufreq
drwxr-xr-x 2 root root 0 Mar 7 21:22 cpuidle
drwxr-xr-x 2 root root 0 Mar 7 21:34 hotplug
drwxr-xr-x 2 root root 0 Mar 7 21:22 intel_pstate
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
ondemand
$ echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
$ cat /sys/devices/system/cpu/intel_pstate/no_turbo
0
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
參考 KYG-yaya573142 同學 的作法,把上述的命令整合成單一 script。
為了能夠透過 gnuplot 圖表分析效能,需要先把核心計算、傳送至 userspace 的時間寫入一個檔案。自我檢查列表有提到 printk - print a kernel message
,雖然可能會有效能上的問題,但尚未了解 sysfs
以前先透過 printk
來印出計算各個 fib_sequence()
結果的時間。閱讀 printk 以及 The high-resolution timer API 撰寫以下的程式:
static ktime_t start_time, end_time, process_time;
static long long fib_time_proxy(long long k)
{
start_time = ktime_get();
long long result = fib_sequence(k);
end_time = ktime_get();
process_time = ktime_sub(end_time, start_time);
printk("fib(%llu)'s result is %llu, process time: %llu", k, result, ktime_to_ns(process_time));
return result;
}
/* calculate the fibonacci number at given offset */
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
return (ssize_t) fib_time_proxy(*offset);
}
/* write operation is skipped */
static ssize_t fib_write(struct file *file,
const char *buf,
size_t size,
loff_t *offset)
{
return ktime_to_ns(process_time);
}
一開始 ./client
的輸出都沒有印時間,但過了一陣子才想到 printk
是印 Linux 核心的訊息,不會直接在 userspace 輸出。接著,我執行 dmesg
命令,查看核心的輸出,果不其然有上述 prink
的結果:
[ 7935.781317] fib(0)'s result is 0, process time: 45
...
[ 7935.781779] fib(3)'s result is 2, process time: 42
[ 7935.781781] fib(2)'s result is 1, process time: 42
[ 7935.781783] fib(1)'s result is 1, process time: 40
原先 printk
的方式不利於取得函式的回傳時間,故改寫 fib_write
回傳 fib_sequence()
的執行時間。
/* write operation is skipped */
static ssize_t fib_write(struct file *file,
const char *buf,
size_t size,
loff_t *offset)
{
start_time = ktime_get();
fib_sequence(*offset);
end_time = ktime_get();
process_time = ktime_sub(end_time, start_time);
return (ssize_t) ktime_to_ns(process_time);
}
為了要傳遞 Kernel 的計算結果至 user space,我們可以使用 copy_to_user
函式。
unsigned long copy_to_user(void __user * to, const void * from, unsigned long n);
// example for using copy_to_user
if( copy_to_user(user_buffer, g_s_Hello_World_string + *position, count) != 0 )
return -EFAULT;
首先,需要先配置 Kernel space 的記憶體。因為我們要回傳的 fibonacci 數列結果很長,所以選擇用 vmalloc
來配置虛擬的連續記憶體空間。實作以下 k_to_u
函式:
static long long k_to_u(char *buf, long long val, unsigned long len)
{
// How many pages do I need to allocate?
char *ker = vmalloc(16);
if (!ker) {
printk(KERN_ALERT "no memory allocated in kernel");
return -ENOMEM;
}
snprintf(ker, len, "%lu", (unsigned long) val);
if (copy_to_user(buf, ker, len) != 0)
return -EFAULT;
vfree(ker);
return 0;
}
並且額外實作一個 fib_basic
封裝相關參數:
static ssize_t fib_basic(long long k, char *buf, size_t size)
{
/* FIXME: C99 variable-length array (VLA) is not allowed in Linux kernel. */
long long f[k + 2];
f[0] = 0;
f[1] = 1;
for (int i = 2; i <= k; i++) {
f[i] = f[i - 1] + f[i - 2];
}
if (k_to_u(buf, f[k], size) != 0)
printk("copy from kernel to user failed");
return f[k];
}
參考 KYG-yaya573142 同學的做法,利用 clock_gettime
來取得 userspace 的執行時間。並且為了更方便產生 gnuplot 的圖,
// client.c
#include <time.h>
#include <unistd.h>
#define FIB_DEV "/dev/fibonacci"
int main()
{
FILE *fp = fopen("./plot", "w");
char buf[1];
int offset = 100; /* TODO: try test something bigger than the limit */
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = 0; i <= offset; i++) {
struct timespec start_ts, end_ts;
long long kernel_ts;
lseek(fd, i, SEEK_SET);
clock_gettime(CLOCK_MONOTONIC, &start_ts);
kernel_ts = write(fd, buf, 1);
clock_gettime(CLOCK_MONOTONIC, &end_ts);
long long user_ts = (long long) (end_ts.tv_nsec - start_ts.tv_nsec);
printf("%d %lld %lld %lld\n", i, kernel_ts, user_ts,
user_ts - kernel_ts);
fprintf(fp, "%d %lld %lld %lld\n", i, kernel_ts, user_ts,
user_ts - kernel_ts);
}
close(fd);
fclose(fp);
return 0;
}
參考 gnuplot 語法解說和示範,撰寫以下 script:
# ./scripts/plot.gp
reset
set output 'f.png'
set xlabel 'fib(n)'
set ylabel 'time(ns)'
set title 'fibdrv performance'
set term png enhanced font 'Verdana,10'
plot [0:100][0:10000] \
'plot' using 1:2 with linespoints linewidth 2 title "kernel space time", \
'plot' using 1:3 with linespoints linewidth 2 title "user space time", \
'plot' using 1:4 with linespoints linewidth 2 title "kernel to user space time", \
$ sudo gnuplot ./scripts/plot.gp
執行結果如下:
透過排除效能分析干擾的技巧獲得以下結果
首先修改 fib_basic
的 VLA warning。理論上我們只需要知道第 N 項的費氏數列,所以不需要定義 k+2 長度的陣列。初步的想法為定義一個長度為 3 的陣列:
// 計算完 f[2] 之後即可以捨棄 f[0] 並把 f[1] 的值填入至 f[0],f[2] 的值填入至 f[1]
f[2] = f[0] + f[1];
swap(f[0],f[1])
f[2] = f[1]
實作程式碼
static ssize_t fib_basic(long long k, char *buf, size_t size)
{
long long f[3];
f[0] = 0;
f[1] = 1;
for (int i = 2; i <= k; i++) {
f[2] = f[0] + f[1];
xor_swap(&f[0], &f[1]);
f[1] = f[2];
}
if (k_to_u(buf, f[2], size) != 0)
printk("copy from kernel to user failed");
printk("%lld: %lld", k, f[2]);
return f[2];
}
閱讀給定的 Fast doubling 網誌 並實作以下程式碼。實作想法為小於 2 的輸入直接回傳值,而大於 2 的 Fibonacci number 則是根據輸入的數值為偶數還是奇數做對應的公式計算。
static ssize_t fib_fd(long long n)
{
if (n == 0)
return 0;
if (n <= 2)
return 1;
// odd: F(n) = F(2k+1) = F(k+1)^2 + F(k)^2
if (n & 0x01) {
unsigned int k = (n - 1) / 2;
return fib_fd(k + 1) * fib_fd(k + 1) + fib_fd(k) * fib_fd(k);
} else {
// even: F(n) = F(2k) = F(k) * [ 2 * F(k+1) – F(k) ]
unsigned int k = n / 2;
return fib_fd(k) * ((2 * fib_fd(k + 1)) - fib_fd(k));
}
}
static ssize_t fib_fast_doubling(long long k, char *buf, size_t size)
{
ssize_t result = fib_fd(k);
if (k_to_u(buf, result, size) != 0)
printk("copy from kernel to user failed");
printk("%lld: %lld", k, (long long) result);
return result;
}
透過 clz 硬體指令加速
Small String Optmization(SSO) 是一個處理字串的最佳化技巧。SSO 所解決的問題是可以減少 small-sized string 在建立時所動態配置的記憶體。透過 SSO,當應用程式建立的字串小於特定長度時,我們不會配置新的 heap memory 來存字串,而是直接使用 stack memory(buffer) 存字串。這個方法可以大量減少動態配置記憶體的成本並提昇執行時期的效能。
以下是為 gcc 的字串結構:
Small String Optmization 實作細節
分為 small string(0~15 bytes) 以及 large string(15~x bytes 需要動態配置 heap memory)。small string 的最後一個 byte 仿照 fbstring 的實作方式,存剩餘的空間以及 flag bit。
為了實現 f(500)
的運算,至少需要支援 107 個 bytes 包含 null terminator。為了方便計算直接設定字串長度上限為 256 個 bytes:
$ echo "139423224561697880139724382870407283950070256587697307264108962948325571622863290691557658876222521294125" | wc -c
106
雜
\0
NULL terminator。延伸閱讀
Linux Kernel 在 fs.h 定義了 file_operations
介面(interface)。可以透過註冊不同的檔案來取得不同 Fibonacci 求值的實作。
參考 hsuedw 同學 的實作,首先在 client.c
定義檔案名稱:
// 原有的檔案
#define FIB_DEV "/dev/fibonacci"
#define FIB_DEV_INPUT "/dev/fibonacci/input"
#define FIB_DEV_OUTPUT "/dev/fibonacci/output"
#define FIB_DEV_TIME "/dev/fibonacci/time"
f(92) = 7540113804746346429