contributed by < Wallmountain >
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 12
On-line CPU(s) list: 0-11
Thread(s) per core: 2
Core(s) per socket: 6
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 158
Model name: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
Stepping: 10
CPU MHz: 1358.240
CPU max MHz: 4100.0000
CPU min MHz: 800.0000
BogoMIPS: 4399.99
Virtualization: VT-x
L1d cache: 192 KiB
L1i cache: 192 KiB
L2 cache: 1.5 MiB
L3 cache: 9 MiB
NUMA node0 CPU(s): 0-11
$ uname -r
5.11.0-27-generic
$ sudo apt install linux-headers-`uname -r`
Reading package lists... Done
Building dependency tree
Reading state information... Done
linux-headers-5.11.0-27-generic is already the newest version (5.11.0-27.29~20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
$ dpkg -L linux-headers-5.11.0-27-generic | grep "/lib/modules"
/lib/modules
/lib/modules/5.11.0-27-generic
/lib/modules/5.11.0-27-generic/build
$ whoami
wallmountain
$ sudo whoami
root
$ make check
make -C /lib/modules/5.11.0-27-generic/build M=/home/wallmountain/fibdrv modules
make[1]: Entering directory '/usr/src/linux-headers-5.11.0-27-generic'
make[1]: Leaving directory '/usr/src/linux-headers-5.11.0-27-generic'
make unload
make[1]: Entering directory '/home/wallmountain/fibdrv'
sudo rmmod fibdrv || true >/dev/null
rmmod: ERROR: Module fibdrv is not currently loaded
make[1]: Leaving directory '/home/wallmountain/fibdrv'
make load
make[1]: Entering directory '/home/wallmountain/fibdrv'
sudo insmod fibdrv.ko
make[1]: Leaving directory '/home/wallmountain/fibdrv'
sudo ./client > out
make unload
make[1]: Entering directory '/home/wallmountain/fibdrv'
sudo rmmod fibdrv || true >/dev/null
make[1]: Leaving directory '/home/wallmountain/fibdrv'
Passed [-]
f(93) fail
input: 7540113804746346429
expected: 12200160415121876738
$ modinfo fibdrv.ko
filename: /home/wallmountain/fibdrv/fibdrv.ko
version: 0.1
description: Fibonacci engine driver
author: National Cheng Kung University, Taiwan
license: Dual MIT/GPL
srcversion: B8BF0BDBE87ACE60588858C
depends:
retpoline: Y
name: fibdrv
vermagic: 5.11.0-27-generic SMP mod_unload modversions
$ taskset -p 1
pid 1's current affinity mask: fff
代表該行程可以在第 0 到第 11 個 CPU 核心中執行
$ taskset -cp 3 1
pid 1's current affinity list: 0-11
pid 1's new affinity list: 3
$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
在 kernel 中 使用 ktime 相關的 API 量測時間, 並使用作業講解的方式, 在 read 中量測時間,參考 laneser 用了 array of function pointer, 可以避免一直修改要呼叫的函式
static long long (*fib_func[])(long long) = {fib_sequence,
fib_fast_doubly_sequence};
static long long fib_time_proxy(long long k, size_t func_count)
{
kt = ktime_get();
long long result = (fib_func[func_count])(k);
kt = ktime_sub(ktime_get(), kt);
return result;
}
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
return (ssize_t) fib_time_proxy(*offset, size);
}
並使用 write 回傳量測到的時間
static ssize_t fib_write(struct file *file,
const char *buf,
size_t size,
loff_t *offset)
{
return ktime_to_ns(kt);
}
使用 clock_gettime
取得時間
struct timespec t1, t2;
clock_gettime(CLOCK_MONOTONIC, &t1);
read(fd, write_buf, func);
write(fd, write_buf, func);
clock_gettime(CLOCK_MONOTONIC, &t2);
t[n] = (long long) (t2.tv_sec * 1e9 + t2.tv_nsec) -
(t1.tv_sec * 1e9 + t1.tv_nsec);
參考 KYG-yaya573142 的報告, 實作對 kernel fib 函式的時間量測和處理
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#define FIB_DEV "/dev/fibonacci"
#define sample_size 1000
int main(int argc, char **argv)
{
if (argc < 2)
return 0;
char write_buf[] = "testing writing";
int offset = 100;
int func = atoi(argv[1]);
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
/* for each F(i), measure sample_size times of data and
* remove outliers based on the 95% confidence level
*/
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
double t[sample_size] = {0};
double mean = 0.0, sd = 0.0, result = 0.0;
int count = 0;
for (int n = 0; n < sample_size; n++) { /* sampling */
/* get the runtime in kernel space here */
read(fd, write_buf, func);
t[n] = (double) write(fd, write_buf, func);
mean += t[n]; /* sum */
}
mean /= sample_size; /* mean */
for (int n = 0; n < sample_size; n++) {
sd += (t[n] - mean) * (t[n] - mean);
}
sd = sqrt(sd / (sample_size - 1)); /* standard deviation */
for (int n = 0; n < sample_size; n++) { /* remove outliers */
if (t[n] <= (mean + 2 * sd) && t[n] >= (mean - 2 * sd)) {
result += t[n];
count++;
}
}
result /= count;
printf("%d %.5lf samples: %d\n", i, result, count);
}
close(fd);
return 0;
}
其中利用 main 的傳入值選擇要使用的 kernel function 對應前面所用的 array of function pointer
$ taskset -c $CPUID./kernel_fib 0 > kernel_fib.txt
首先, 對尚未修改的 fib_sequence 函式作時間的量測
static long long fib_sequence(long long k)
{
/* FIXME: use clz/ctz and fast algorithms to speed up */
long long f[k + 2];
f[0] = 0;
f[1] = 1;
for (int i = 2; i <= k; i++) {
f[i] = f[i - 1] + f[i - 2];
}
return f[k];
}
使用 fast_doubling 的手法, 進行加速
而簡化過後的算法為
static long long fib_fast_doubling_sequence(long long k)
{
unsigned int h = 0;
for (unsigned int i = k; i; ++h, i >>= 1)
;
long long f[2];
f[0] = 0;
f[1] = 1;
for (long long mask = 1 << (h - 1); mask; mask >>= 1) {
long long a = f[0] * (2 * f[1] - f[0]);
long long b = f[0] * f[0] + f[1] * f[1];
if (mask & k) {
f[0] = b;
f[1] = a + b;
} else {
f[0] = a;
f[1] = b;
}
}
return f[0];
}
從下圖, 可看見明顯的加速, 原因也可從兩種算法的時間複雜度簡單得知
static long long fib_fast_doubling_sequence_clz(long long k)
{
long long f[2];
f[0] = 0;
f[1] = 1;
for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) {
long long a = f[0] * (2 * f[1] - f[0]);
long long b = f[0] * f[0] + f[1] * f[1];
if (mask & k) {
f[0] = b;
f[1] = a + b;
} else {
f[0] = a;
f[1] = b;
}
}
return f[0];
}
原本的算法時間複雜度為
unsigned int h = 0;
for (unsigned int i = k; i; ++h, i >>= 1)
;
for (long long mask = 1 << (h - 1); mask; mask >>= 1) {
}
clz 的算法可達到常數時間, 且能讓程式碼變得更簡潔
for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) {
}
TODO: 查閱上述 CPU 指令的 latency,見 Instruction tables
:notes: jserv
static long long long_mul(long long a, long long b)
{
long long ans = 0;
while (b != 0) {
long long t = -b & b;
int r = __builtin_ctzll(b);
ans += a << r;
b ^= t;
}
return ans;
}
static long long fib_fast_doubling_sequence_clz_pr(long long k)
{
long long f[2];
f[0] = 0;
f[1] = 1;
for (long long mask = 1 << (31 - __builtin_clz(k)); mask; mask >>= 1) {
long long a = long_mul(f[0], (f[1] << 1) - f[0]);
long long b = long_mul(f[0], f[0]) + long_mul(f[1], f[1]);
if (mask & k) {
f[0] = b;
f[1] = a + b;
} else {
f[0] = a;
f[1] = b;
}
}
return f[0];
}
#define bnum_size unsigned int
#define bnum_digit unsigned char
typedef struct bnum {
bnum_digit *ptr;
bnum_size size;
bnum_size capacity;
} bnum;
實作方式是根據 Binary-coded decimal
,利用任何單一十進制數字都能用四位元二進制數值表示,因此在結構中選擇使用 unsigned char
, char
大小為8個位元,可以擺進兩個十進制數字,這個方式也稱為 packed BCD。
使用 unsigned
是因為在運算時的方便,可以直接進行加減,在一開始的實作沒有考慮到 char
的加減也有正負值的問題,因此在計算時總是出現預料外的結果,於是最後才決定使用 unsigned char
。
使用 capacity
紀錄目前的結構體已經跟系統要了多大的空間,可以避免因為不知道當前有多少空間,而一直多要空間,造成的空間和時間上的浪費。
缺點 :-1: :
在其他的運算函式中,都會使用,從 capacity
判斷這個要存入運算數值的結構體是否已經要了足夠大小的空間並把數值清0
一開始,就先判斷運算後至少需要多少空間,一次性呼叫,減少動態配置記憶體的次數,也能減少大數運算的時間
#define bnum_ptr_resize(x, n) \
do { \
if (n > x->capacity) { \
x->size = 0; \
x->capacity = n; \
x->ptr = krealloc(x->ptr, n + 1, GFP_KERNEL); \
} \
memset(x->ptr, '\0', x->capacity); \
} while (0)
#define bnum_ptr_new(x, n) \
do { \
x->capacity = n; \
x->ptr = kmalloc(n + 1, GFP_KERNEL); \
} while (0)
#define BNUM_ALLOC(x, n) \
do { \
bnum *__x = (x); \
bnum_size __n = (n); \
__n = (__n >> 1) + (__n & 1); \
if (__x->ptr) \
bnum_ptr_resize(__x, __n); \
else \
bnum_ptr_new(__x, __n); \
} while (0)
考慮到如果若是進位必須要補6,先將左移一的結果直接存入 dest
,再從 a
中判斷是否進位,以十進制的角度,若是當前位數的數字不小於5,則左移一時會進位。
void bnum_lshift(bnum *dest, bnum *a)
{
BNUM_ALLOC(dest, a->size + 1);
bnum_digit *u = dest->ptr, *v = a->ptr;
const unsigned size = (a->size + 1) >> 1;
for (int i = 0; i < size; ++i) {
*u += *v << 1;
*u += ((*v & MASK_L) >= 5U) ? 6U : 0;
if ((*v++ >> 4) >= 5U) {
*u += 96U;
*(++u) = 1;
} else
u++;
}
bnum_cal_size(dest, a->size);
}
待完成 : 可以左移任意位數
在加法中,主要分成三個部分
void bnum_add(bnum *total, const bnum *a, const bnum *b)
{
bnum_size n = digit_max(a->size, b->size);
BNUM_ALLOC(total, n + 1);
digit_add(total, a->ptr, (a->size + 1) >> 1, b->ptr, (b->size + 1) >> 1);
bnum_cal_size(total, n);
}
void digit_add(bnum *total,
const bnum_digit *a,
bnum_size asize,
const bnum_digit *b,
bnum_size bsize)
{
if (asize > bsize) {
bnum_size i = bsize;
int carry = digit_add_c(total->ptr, a, b, bsize);
if (carry)
i += digit_addi(&total->ptr[bsize], &a[bsize]);
digit_cpy(&total->ptr[i], &a[i], asize - i + 1);
} else {
bnum_size i = asize;
int carry = digit_add_c(total->ptr, a, b, asize);
if (carry)
i += digit_addi(&total->ptr[asize], &a[asize]);
digit_cpy(&total->ptr[i], &a[i], bsize - i + 1);
}
}
int digit_add_c(bnum_digit *dest,
const bnum_digit *a,
const bnum_digit *b,
bnum_size size)
{
int carry = 0;
for (bnum_size i = 0; i < size; i++) {
unsigned int u = a[i] + b[i] + 0x66U + carry;
dest[i] = a[i] + b[i] + carry;
if ((u ^ a[i] ^ b[i]) & 0x10)
dest[i] += 6U;
if ((carry = !!(u & 0x100)))
dest[i] += 0x60U;
}
return carry;
}
bnum_size digit_addi(bnum_digit *dest, const bnum_digit *a)
{
int carry = 1;
bnum_size i = 0;
while (carry) {
unsigned int u = a[i] + 0x66U + carry;
dest[i] = a[i] + carry;
if ((u ^ a[i]) & 0x10)
dest[i] += 6U;
if ((carry = !!(u & 0x100)))
dest[i] += 0x60U;
i++;
}
return i;
}
void digit_cpy(bnum_digit *dest, const bnum_digit *a, bnum_size size)
{
while (size-- != 0) {
dest[size] = a[size];
}
}
減法的部分如果被減數的當前位數的數字小於減數,則需要借位,而借位給存在 char
較大的四個位元的數字直接借10,也就是 0xA0
,而在最後四個位元的數字,因為在運算時,會直接進行兩數減法,故要判斷是否有借位,減去借位時多給的6
void bnum_sub(bnum *total, const bnum *a, const bnum *b)
{
int cmp = bnum_cmp(a, b);
if (cmp > 0) {
BNUM_ALLOC(total, a->size);
digit_sub(total->ptr, a->ptr, (a->size + 1) >> 1, b->ptr,
(b->size + 1) >> 1);
bnum_size size = ((a->size + 1) >> 1) - 1;
do {
if (total->ptr[size]) {
size = (size << 1) + 1 + !!(total->ptr[size] & 0xF0);
break;
}
} while (size--);
total->size = size;
} else {
BNUM_ALLOC(total, 1);
total->size = 1;
}
}
void digit_sub(bnum_digit *total,
const bnum_digit *a,
bnum_size asize,
const bnum_digit *b,
bnum_size bsize)
{
bnum_size size = bsize;
int carry = 0;
for (bnum_size i = 0; i < size; i++) {
if (a[i] < b[i] + carry) {
total[i] = 0xA0U - b[i] + a[i] - carry;
carry = 1;
} else {
total[i] = a[i] - b[i] - carry;
carry = 0;
}
if ((total[i] & MASK_L) > (a[i] & MASK_L))
total[i] -= 6U;
}
while (carry) {
if (!a[size]) {
total[size] = 0x99U;
} else {
total[size] = a[size] - carry;
if ((total[size] ^ a[size]) & 0x10U)
total[size] -= 6U;
carry = 0;
}
++size;
}
for (; size < asize; ++size)
total[size] = a[size];
}
將數字儲存的方式先改成一般的二進制,直接進行乘法,再轉換成 Binary-coded decimal
缺點: 多次呼叫除法和取餘數,不是一個好的算法,看能否改進成沒有除法的實作方式
int digit_ctoi(bnum_digit a)
{
return (a & MASK_L) + 10 * ((a & MASK_R) >> 4);
}
void bnum_mul(bnum *total, const bnum *a, const bnum *b)
{
BNUM_ALLOC(total, a->size + b->size);
bnum_size asize = ((a->size + 1) >> 1) + 1;
for (bnum_size i = 0, size = (b->size + 1) >> 1; i < size; ++i) {
if (b->ptr[i])
digit_mul_add(&total->ptr[i], a->ptr, asize, digit_ctoi(b->ptr[i]));
}
bnum_cal_size(total, a->size + b->size - 1);
}
void digit_mul_add(bnum_digit *dest,
const bnum_digit *a,
bnum_size asize,
int b)
{
int carry = 0;
for (bnum_size i = 0; i < asize; ++i) {
carry += digit_ctoi(a[i]) * b + digit_ctoi(dest[i]);
dest[i] = carry % 10;
carry /= 10;
dest[i] += carry % 10 << 4;
carry /= 10;
}
}
跟教授提供的 bignum 作比較,因為效能遠遠不及,因此先分析 ref 的實作
contributed by < Wallmountain > 測驗1 在 Linux 核心原始程式碼,include/linux/bitfield.h 提及一個巨集 GENMASK,其作用是依據給定的範圍,產生連續的 bitmask,例如: GENMASK(6, 4) 產生 $01110000_2$ GENMASK(39, 21) 產生 0x000000ffffe00000 (64 位元) 已知我們使用的微處理器架構為 64 位元,且 unsigned long 為 8 位元組寬度 (符合 LP64 資料模型),以下是可能的 GENMASK 巨集的定義:
May 10, 2022contributed by < Wallmountain > 實驗環境 $ gcc --version gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 $ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian
Feb 25, 2022or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up