contributed by < blueskyson
>
$ gcc --version
gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
$ lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 12
On-line CPU(s) list: 0-11
Thread(s) per core: 2
Core(s) per socket: 6
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 165
Model name: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
Stepping: 2
CPU MHz: 2600.000
CPU max MHz: 5000.0000
CPU min MHz: 800.0000
BogoMIPS: 5199.98
Virtualization: VT-x
L1d cache: 192 KiB
L1i cache: 192 KiB
L2 cache: 1.5 MiB
L3 cache: 12 MiB
NUMA node0 CPU(s): 0-11
在 fibdrv.c 的 fib_sequence
以 long long
來處理費氏數列的值,運算到 long long
。
struct BigN
取代 long long
首先在 client.c 以及 fibdrv.c 定義結構 struct BigN
並給予別名為 u128
:
#define BIGN
#ifdef BIGN
typedef struct BigN {
unsigned long long upper, lower;
} u128;
#endif
我一開始用 u128
表示正整數
參考 hankluo6 的作法,讓 lower 最多只儲存 u128
表示的值為 lower
進位到 upper
的規則會類似於
decimal | upper | lower |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 0 | 2 |
0 | 99…99 | |
1 | 0 | |
1 | 1 | |
1 | 99…99 | |
2 | 0 |
如此就能夠很輕易的利用 printf
印出字串如下:
printf("%llu%019llu", fib->upper, fib->lower);
套用 u128
改寫的後的 fib_sequence
如下:
#ifdef BIGN
#define P10_UINT64 10000000000000000000ULL
static u128 fib_sequence_u128(long long k)
{
if (k <= 1LL) {
u128 ret = {.upper = 0, .lower = (unsigned long long) k};
return ret;
}
u128 a, b, c;
a.upper = 0;
a.lower = 0;
b.upper = 0;
b.lower = 1;
for (int i = 2; i <= k; i++) {
c.lower = a.lower + b.lower;
c.upper = a.upper + b.upper;
// if lower overflows or lower >= 10^9, add a carry to upper
if ((c.lower < a.lower) || (c.lower >= P10_UINT64)) {
c.lower -= P10_UINT64;
c.upper += 1;
}
a = b;
b = c;
}
return c;
}
#endif
接下來再改寫 fib_read
,原本的 fib_read
函式內部直接回傳 read(fd, buf, 1)
的回傳值來告訴 client.c ssize_t
。追溯 ssize_t
的定義:
// posix_types.h
typedef long __kernel_long_t;
/*
* Most 32 bit architectures use "unsigned int" size_t,
* and all 64 bit architectures use "unsigned long" size_t.
*/
#if __BITS_PER_LONG != 64
// typedef something...
#else
typedef __kernel_long_t __kernel_ssize_t;
#endif
// types.h
typedef __kernel_ssize_t ssize_t;
觀察上述定義以及參閱 Integer-Types 得知 long
為 32 或 64 位元,我測試了自己的電腦得知我的 long
為 64 位元,亦即 ssize_t
為 64 位元。
顯然 ssize_t
傳遞 copy_to_user
API,將 read(fd, buf, 1)
的 buf
中,在 client.c 做基數轉換。改寫後的 fib_read
如下:
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
#ifdef BIGN
u128 fib = fib_sequence_u128(*offset);
copy_to_user(buf, &fib, sizeof(fib));
return sizeof(fib);
#else
return (ssize_t) fib_sequence(*offset);
#endif
}
接下來改寫 client.c:
#define FIB_DEV "/dev/fibonacci"
#define BIGN
#ifdef BIGN
typedef struct BigN {
unsigned long long upper, lower;
} u128;
void print_fib_u128(int i, u128 *fib)
{
if (fib->upper) {
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%llu%019llu.\n",
i, fib->upper, fib->lower);
} else {
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%llu.\n",
i, fib->lower);
}
}
#endif
int main()
{
long long sz;
char buf[sizeof(u128)];
char write_buf[] = "testing writing";
int offset = 100; /* TODO: try test something bigger than the limit */
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = 0; i <= offset; i++) {
sz = write(fd, write_buf, strlen(write_buf));
printf("Writing to " FIB_DEV ", returned the sequence %lld\n", sz);
}
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, sizeof(u128));
#ifdef BIGN
print_fib_u128(i, (u128 *) buf);
#else
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%lld.\n",
i, sz);
#endif
}
for (int i = offset; i >= 0; i--) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, sizeof(u128));
#ifdef BIGN
print_fib_u128(i, (u128 *) buf);
#else
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%lld.\n",
i, sz);
#endif
}
close(fd);
return 0;
}
測試時我發現程式永遠都顯示 fib_read
中使用 printk("%lld\n", *offset);
然後用 dmesg
觀察紀錄,發現 *offset
會被限制不能超過 92,然後我在 fibdrv.c 中發現巨集:
/* MAX_LENGTH is set to 92 because
* ssize_t can't fit the number > 92
*/
#define MAX_LENGTH 92
它在 fib_device_lseek
中限制 lseek
的 offset
。將此巨集改為 100 後即可正常執行到 python scripts/verify.py
成功通過驗證。
此 BigN 實作最高可以計算
截至目前為止的實作在 53f5ddf。
struct BigN
使其能夠計算 改寫 struct BigN
的資料結構,取名為 ubig
,把 upper
, lower
擴增為 cell[size]
如下:
#define BIGN
#ifdef BIGN
#define BIGNSIZE 12
typedef struct BigN {
unsigned long long cell[BIGNSIZE];
} ubig;
#define init_ubig(x) for (int i = 0; i < BIGNSIZE; x.cell[i++] = 0)
#endif
上方的 BIGNSIZE
巨集用來控制 cell
陣列的長度,當 BIGNSIZE
為 12,ubig
即為一個擁有 768 位元的 cell
用以儲存費氏數列的值,可以表示的數值範圍為 cell[0]
儲存最低位、cell[11]
儲存最高位。
接著改寫 fib_sequence
:
#ifdef BIGN
#define P10_UINT64 10000000000000000000ULL
static ubig fib_sequence_ubig(long long k)
{
if (k <= 1LL) {
ubig ret;
init_ubig(ret);
ret.cell[0] = (unsigned long long) k;
return ret;
}
ubig a, b, c;
init_ubig(a);
init_ubig(b);
b.cell[0] = 1;
for (int i = 2; i <= k; i++) {
for (int j = 0; j < BIGNSIZE; j++)
c.cell[j] = a.cell[j] + b.cell[j];
for (int j = 0; j < BIGNSIZE - 1; j++) {
// if lower overflows or lower >= 10^9, add a carry to upper
if ((c.cell[j] < a.cell[j]) || (c.cell[j] >= P10_UINT64)) {
c.cell[j] -= P10_UINT64;
c.cell[j + 1] += 1;
}
}
a = b;
b = c;
}
return c;
}
#endif
為了不讓 client.c 過於複雜,我在 fibdrv.c 就將 BigN 轉換為字串,然後直接用 copy_to_user
回傳
以下函式在 fibdrv.c 中,負責將 a
轉為字串複製到 buf
中:
size_t ubig_to_string(char *buf, size_t bufsize, const ubig *a)
{
int index = BIGNSIZE - 1;
while (index >= 0 && !a->cell[index])
index--;
if (index == -1) {
buf[0] = '0';
buf[1] = '\0';
return (size_t) 1;
}
char buf2[22];
snprintf(buf2, bufsize,"%llu", a->cell[index--]);
strcat(buf, buf2);
while (index >= 0) {
snprintf(buf2, bufsize, "%019llu", a->cell[index--]);
strcat(buf, buf2);
}
return strlen(buf);
}
然後改寫 fib_read
:
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
#ifdef BIGN
#define BUFFSIZE 500
char fibbuf[BUFFSIZE] = "";
ubig fib = fib_sequence_ubig(*offset);
size_t len = ubig_to_string(fibbuf, BUFFSIZE, &fib);
len = (size > len) ? (len + 1) : size;
copy_to_user(buf, fibbuf, len);
return (ssize_t) len;
#else
return (ssize_t) fib_sequence(*offset);
#endif
}
最後改寫 client.c 使其直接印出 buf
字串:
#define BIGN
#define BUFFSIZE 500
//...
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, BUFFSIZE);
#ifdef BIGN
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%s.\n",
i, buf);
#else
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%lld.\n",
i, sz);
#endif
}
//...
此實作最大可以計算 BIGNSIZE
,就可以計算任意
截至目前為止的實作在 d0dd8a4。
struct BigN
實作我前面提到 struct BigN
中每個 unsigned long long
達到 upper
與 lower
並不是差距 lower
的位元直接左移到 upper
完全是錯誤的。
經過思考後,我回復一開始的想法,在相加以及左移時,使用 u128
表示正整數 lower
依據 lower
是否大於
decimal | upper | lower |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 0 | 2 |
0 | 0xff…ff | |
1 | 0 | |
1 | 1 | |
1 | 0xff…ff | |
2 | 0 |
以下為改進後的加法 (ubig_add
) 的實作:
#define BIGNSIZE 12
typedef struct BigN {
unsigned long long cell[BIGNSIZE];
} ubig;
static inline void init_ubig(ubig *x)
{
for (int i = 0; i < BIGNSIZE; x->cell[i++] = 0)
;
}
static inline void ubig_add(ubig *dest, ubig *a, ubig *b) {
for (int i = 0; i < BIGNSIZE; i++)
dest->cell[i] = a->cell[i] + b->cell[i];
for (int i = 0; i < BIGNSIZE - 1; i++)
dest->cell[i + 1] += (dest->cell[i] < a->cell[i]);
}
觀摩 arthur-chang 的作法,在轉換為字串的過程,先將字元陣列初始化為 "0"
,每讀取一個 bit 就讓自己加自己(也就是乘以二),如果當前是 set bit 就加 1,然後進到下一個迭代,計算過程中都是透過字元陣列儲存十進位的資訊。以下舉 1010
為例
step | num | computation | result |
---|---|---|---|
0 | 1010 | "0" |
"0" |
1 | 1010 | ("0" + "0") + "1" | "1" |
2 | 1010 | ("1" + "1") + "0" | "2" |
3 | 1010 | ("2" + "2") + "1" | "5" |
4 | 1010 | ("5" + "5") + "0" | "10" |
由上述演算法就可以用簡單的機制將 ubig
基數轉換,arthur-chang 的實作是 uint32_t
的版本,我實作 unsigned long long
的版本:
int ubig_to_string(char *buf, size_t bufsize, ubig *a)
{
memset(buf, '0', bufsize);
buf[bufsize - 1] = '\0';
int index = BIGNSIZE - 1;
while (index >= 0 && !a->cell[index])
index--;
if (index == -1)
return bufsize - 2;
for (int i = index; i >= 0; i--) {
for (unsigned long long mask = 0x8000000000000000ULL;
mask; mask >>= 1) {
int carry = ((a->cell[i] & mask) != 0);
for (int j = bufsize - 2; j >= 0; j--) {
buf[j] += buf[j] - '0' + carry;
carry = (buf[j] > '9');
if (carry)
buf[j] -= 10;
}
}
}
int offset = 0;
while (buf[offset] == '0')
offset++;
return (buf[offset] == '\0') ? (offset - 1) : offset;
}
接下來再套用前述的 function 實作 fibdrv:
static ubig fib_sequence_ubig(long long k)
{
if (k <= 1LL) {
ubig ret;
init_ubig(&ret);
ret.cell[0] = (unsigned long long) k;
return ret;
}
ubig a, b, c;
init_ubig(&a);
init_ubig(&b);
b.cell[0] = 1ULL;
for (int i = 2; i <= k; i++) {
ubig_add(&c, &a, &b);
a = b;
b = c;
}
return c;
}
#define BUFFSIZE 500
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
#ifdef BIGN
char fibbuf[BUFFSIZE];
ubig fib = fib_sequence_ubig(*offset);
int __offset = ubig_to_string(fibbuf, BUFFSIZE, &fib);
copy_to_user(buf, fibbuf + __offset, BUFFSIZE - __offset);
return (ssize_t) BUFFSIZE - __offset;
#else
return (ssize_t) fib_sequence(*offset);
#endif
}
到目前為止的程式碼 b2749a6。
首先實作 struct BigN
的減法(ubig_sub
)、左移(ubig_lshift
)、直式乘法(ubig_mul
):
static inline void ubig_sub(ubig *dest, ubig *a, ubig *b)
{
for (int i = 0; i < BIGNSIZE; i++)
dest->cell[i] = a->cell[i] - b->cell[i];
for (int i = 0; i < BIGNSIZE - 1; i++)
dest->cell[i + 1] -= (dest->cell[i] > a->cell[i]);
}
static inline void ubig_lshift(ubig *dest, ubig *a, int x)
{
init_ubig(dest);
// quotient and remainder of x being divided by 64
unsigned quotient = x >> 6, remainder = x & 0x3f;
for (int i = 0; i + quotient < BIGNSIZE; i++)
dest->cell[i + quotient] |= a->cell[i] << remainder;
if (remainder)
for (int i = 1; i + quotient < BIGNSIZE; i++)
dest->cell[i + quotient] |= a->cell[i - 1] >> (64 - remainder);
}
void ubig_mul(ubig *dest, ubig *a, ubig *b)
{
init_ubig(dest);
int index = BIGNSIZE - 1;
while (index >= 0 && !b->cell[index])
index--;
if (index == -1)
return;
for (int i = index; i >= 0; i--) {
int bit_index = (i << 6) + 63;
for (unsigned long long mask = 0x8000000000000000ULL; mask;
mask >>= 1) {
if (b->cell[i] & mask) {
ubig shift, tmp;
ubig_lshift(&shift, a, bit_index);
ubig_add(&tmp, dest, &shift);
*dest = tmp;
}
bit_index--;
}
}
}
接下來參考作業說明提供的演算法:
因此可得:
pseudo code:
Fast_Fib(n)
a = 0; b = 1; // m = 0
for i = (number of binary digit in n) to 1
t1 = a*(2*b - a);
t2 = b^2 + a^2;
a = t1; b = t2; // m *= 2
if (current binary digit == 1)
t1 = a + b; // m++
a = b; b = t1;
return a;
舉例: 求解
i | start | 4 | 3 | 2 | 1 | result |
---|---|---|---|---|---|---|
n | - | 1010 | 1010 | 1010 | 1010 | - |
F(m) | F(0) | F(0*2+1) | F(1*2) | F(2*2+1) | F(5*2) | F(10) |
a | 0 | 1 | 1 | 5 | 55 | 55 |
b | 1 | 1 | 2 | 8 | 89 | - |
用 fast doubling 實作 fibdrv:
static ubig fib_sequence_ubig(long long k)
{
if (k <= 1LL) {
ubig ret;
init_ubig(&ret);
ret.cell[0] = (unsigned long long) k;
return ret;
}
ubig a, b;
init_ubig(&a);
init_ubig(&b);
b.cell[0] = 1ULL;
for (unsigned long long mask = 0x8000000000000000ULL >> __builtin_clzll(k);
mask; mask >>= 1) {
ubig tmp1, tmp2, t1, t2;
ubig_lshift(&tmp1, &b, 1); // tmp1 = 2*b
ubig_sub(&tmp2, &tmp1, &a); // tmp2 = 2*b - a
ubig_mul(&t1, &a, &tmp2); // t1 = a*(2*b - a)
ubig_mul(&tmp1, &a, &a); // tmp1 = a^2
ubig_mul(&tmp2, &b, &b); // tmp2 = b^2
ubig_add(&t2, &tmp1, &tmp2); // t2 = a^2 + b^2
a = t1, b = t2;
if (k & mask) {
ubig_add(&t1, &a, &b); // t1 = a + b
a = b, b = t1;
}
}
return a;
}
到目前為止的程式碼 473ff4a。
首先修改 grub 設定檔,強制 isolate cpu 11:
$ sudo vim /etc/default/grub
編輯 GRUB_CMDLINE_LINUX=""
:
GRUB_CMDLINE_LINUX="isolcpus=0"
更新 grub
$ sudo update-grub
重新開機後,用 taskset
檢查 cpu 11 是否已經被 isolate:
$ taskset -p 1
pid 1's current affinity mask: 7ff
我發現有少數 process 仍然有全部核心的 affinity,例如 rcu_gp
。
將 client
程式固定在 cpu 11 執行:
$ sudo insmod fibdrv.ko
$ sudo taskset -c 11 ./client
$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
# performance.sh
for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
do
echo performance > ${i}
done
$ sudo sh performance.sh
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
首先利用 ktime_t
與 ktime_get()
來處理計算費氏數列耗費時間,最後透過 fib_read
的回傳值將耗費的時間傳給 user space,具體實作如下:
#include <linux/ktime.h>
// ...
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
#ifdef BIGN
ktime_t t = ktime_get();
char fibbuf[BUFFSIZE];
ubig fib = fib_sequence_ubig(*offset);
int __offset = ubig_to_string(fibbuf, BUFFSIZE, &fib);
copy_to_user(buf, fibbuf + __offset, BUFFSIZE - __offset);
s64 elapsed = ktime_to_ns(ktime_sub(ktime_get(), t));
return elapsed;
#else
return (ssize_t) fib_sequence(*offset);
#endif
}
然後新增一個 perf-test.c
用來印出 user space、kernel space、kernel to user 的執行時間。此程式會重複計算
// perf-test.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#define FIB_DEV "/dev/fibonacci"
#define BIGN
#define BUFFSIZE 500
int compare(const void *a, const void *b)
{
long long A = *(long long*) a;
long long B = *(long long*) b;
return (A > B) - (A < B);
}
int main()
{
char buf[BUFFSIZE];
int offset = 1000;
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
long long utime_arr[1001][100];
long long ktime_arr[1001][100];
for (int i = 0; i < 100; i++) {
for (int j = 0; j <= offset; j++) {
struct timespec start, end;
lseek(fd, j, SEEK_SET);
clock_gettime(CLOCK_REALTIME, &start);
ktime_arr[j][i] = read(fd, buf, BUFFSIZE);
clock_gettime(CLOCK_REALTIME, &end);
utime_arr[j][i] = (long long)end.tv_nsec - start.tv_nsec;
}
}
// compute 10% trimmed mean
for (int i = 0; i <= 1000; i++) {
qsort((void *)&utime_arr[i][0], 100, sizeof(long long), compare);
qsort((void *)&ktime_arr[i][0], 100, sizeof(long long), compare);
long long utime_avg = 0, ktime_avg = 0;
for (int j = 10; j < 90; j++) {
utime_avg += utime_arr[i][j];
ktime_avg += ktime_arr[i][j];
}
utime_avg /= 80;
ktime_avg /= 80;
printf("%d %lld %lld %lld\n", i, ktime_avg, utime_avg, utime_avg - ktime_avg);
}
close(fd);
return 0;
}
接下來輸出測試的結果到 fast_doubling.txt
:
$ gcc perf-test.c -o a
$ sudo insmod fibdrv.ko
$ sudo taskset -c 11 ./a > fast_doubling.txt
$ sudo rmmod fibdrv.ko || true >/dev/null
輸出的結果如下:
0 232 543 311
1 51551 51835 284
2 52655 52997 342
3 52379 52663 284
4 52758 53041 283
...
gnuplot script:
# 1.gp
set title "Fast Doubling Time"
set xlabel "n th sequence of Fibonacci number"
set ylabel "time cost (ns)"
set terminal png font " Times_New_Roman,12 "
set output "time.png"
set xrange [0:1000]
set xtics 0, 100, 1000
set key left
plot \
"fast_doubling.txt" using 1:2 with points title "kernel", \
"fast_doubling.txt" using 1:3 with points title "user", \
"fast_doubling.txt" using 1:4 with points title "kernel to user", \
至此可以畫出 Fast Doubling 的 10% trimmed mean 執行時間:
接下來用同樣的手法畫出 Adding 的 10% trimmed mean 執行時間:
比較 Fast Doubling 和 Adding 的 user space time。
可以發現 Fast Doubling 的效能較 Adding 好。
到目前為止的變更 48956a1。
動態配置記憶體需要注意以下議題:
我將大數加法、乘法、初始化等函式統一定義為:
// ubig_xxx.h
ubig *new_ubig(int size);
void destroy_ubig(ubig *ptr);
void zero_ubig(ubig *x);
void ubig_assign(ubig *dest, ubig *src);
void ubig_add(ubig *dest, ubig *a, ubig *b);
void ubig_sub(ubig *dest, ubig *a, ubig *b);
void ubig_lshift(ubig *dest, ubig *a, int x);
void ubig_mul(ubig *dest, ubig *a, ubig *b, ubig *shift_buf, ubig *add_buf);
int ubig_to_string(char *buf, size_t bufsize, ubig *a);
ubig fib_sequence(long long k);
並將不同版本的實作寫在 ubig_1.h
、ubig_2.h
、…,在 fibdrv.c
藉由切換 #include "ubig_xxx.h"
來切換為不同實作。
根據 wikipedia,費波納西數列的前後項比值會趨近於黃金比例,也就是
並且,
列舉費波納西數列,觀察在第幾項之後
13 | 233 | 1.6180555556 |
14 | 377 | 1.6180257511 |
15 | 610 | 1.6180371353 |
16 | 987 | 1.6180327869 |
17 | 1597 | 1.6180344478 |
從列舉看到當
假設 ubig
使用了 unsigned long long
,計算 ubig
可以表示的最大位數,將常數計算完後化簡為
只要使 unsigned long long
數量的函式(因為 kernel module 不允許使用浮點數,所以將預先計算的常數同乘以 100000 後,用整數運算):
static inline int estimate_size(long long k) {
if (k <= 93)
return 1;
unsigned long long n = (k * 20899 - 34950) / 1926610;
return (int)n + 1;
}
接下來改寫 ubig
結構,使其可以根據需求調整 unsigned long long
陣列長度。
typedef struct BigN {
int size;
unsigned long long *cell;
} ubig;
利用 kmalloc
和 kfree
分配記憶體給 ubig
:
ubig *new_ubig(int size)
{
ubig *ptr = kmalloc(sizeof(ubig), GFP_KERNEL);
if (!ptr)
return NULL;
unsigned long long *ullptr =
kmalloc(size * sizeof(unsigned long long), GFP_KERNEL);
if (!ullptr) {
kfree(ptr);
return NULL;
}
memset(ullptr, 0, size * sizeof(unsigned long long));
ptr->size = size;
ptr->cell = ullptr;
return ptr;
}
static inline void destroy_ubig(ubig *ptr)
{
if (ptr) {
if (ptr->cell)
kfree(ptr->cell);
kfree(ptr);
}
}
在開始計算
static ubig* fib_sequence(long long k)
{
if (k <= 1LL) {
ubig *ret = new_ubig(1);
if (!ret)
return NULL;
ret->cell[0] = (unsigned long long) k;
return ret;
}
int sz = estimate_size(k);
ubig *a = new_ubig(sz);
ubig *b = new_ubig(sz);
ubig *tmp1 = new_ubig(sz);
ubig *tmp2 = new_ubig(sz);
ubig *t1 = new_ubig(sz);
ubig *t2 = new_ubig(sz);
ubig *mul_buf1 = new_ubig(sz);
ubig *mul_buf2 = new_ubig(sz);
if (!a || !b || !tmp1 || !tmp2 || !t1 || !t2 || !mul_buf1 || !mul_buf2) {
destroy_ubig(a);
destroy_ubig(b);
destroy_ubig(tmp1);
destroy_ubig(tmp2);
destroy_ubig(t1);
destroy_ubig(t2);
destroy_ubig(mul_buf1);
destroy_ubig(mul_buf2);
return NULL;
}
b->cell[0] = 1ULL;
在第 11 到 19 行先 kmalloc
配置所有 ubig
,在第 20 行確認所需的 ubig
是否全都配置完畢,若否就將所有 ubig
釋放然後回傳 NULL
。接下來就只會使用這些配置好的 ubig
運算,不再呼叫 kmalloc
:
for (unsigned long long mask = 0x8000000000000000ULL >> __builtin_clzll(k);
mask; mask >>= 1) {
ubig_lshift(tmp1, b, 1); // tmp1 = 2*b
ubig_sub(tmp2, tmp1, a); // tmp2 = 2*b - a
ubig_mul(t1, a, tmp2, mul_buf1, mul_buf2); // t1 = a*(2*b - a)
ubig_mul(tmp1, a, a, mul_buf1, mul_buf2); // tmp1 = a^2
ubig_mul(tmp2, b, b, mul_buf1, mul_buf2); // tmp2 = b^2
ubig_add(t2, tmp1, tmp2); // t2 = a^2 + b^2
ubig_assign(a, t1);
ubig_assign(b, t2);
if (k & mask) {
ubig_add(t1, a, b); // t1 = a + b
ubig_assign(a, b);
ubig_assign(b, t1);
}
}
destroy_ubig(b);
destroy_ubig(tmp1);
destroy_ubig(tmp2);
destroy_ubig(t1);
destroy_ubig(t2);
destroy_ubig(mul_buf1);
destroy_ubig(mul_buf2);
return a;
}
使用 kmalloc
動態配置記憶體後,照理來說最大可以計算 kmalloc
配置的長度的極限(128KB),即 2048 個 unsigned long long
。可以表達的最大的十進位位數為
十進位位數小於等於 39457 的費氏數列項次為
到目前為止的實作在 0ee4f89 的 ubig_2.h
。
copy_to_user
回傳 ubig
的原始資料,在資料回傳至 user space 後,再呼叫 ubig_to_string
基數轉換,藉此減少 kernel to user 複製資料的成本。為了達成上述兩項要求,我首先在 fib_sequence
函式增加一個參數 user_size
來接收 user buffer 的大小,在透過 estimate_size
計算完需要多少 unsigned long long
來儲存數值後,再檢查 buffer 長度夠不夠儲存這麼長的 unsigned long long
陣列:
static ubig *fib_sequence(long long k, size_t user_size)
{
if (k <= 1LL) {
/* Check if buffer has enough size */
if (user_size < sizeof(unsigned long long))
return NULL;
ubig *ret = new_ubig(1);
if (!ret)
return NULL;
ret->cell[0] = (unsigned long long) k;
return ret;
}
int sz = estimate_size(k);
/* Check if buffer has enough size */
if (user_size < sz * sizeof(unsigned long long))
return NULL;
// remaining code ...
}
計算完 copy_to_user
把 ubig
的 unsigned long long
陣列複製進 user buffer,然後回傳 unsigned long long
陣列長度(每 8 byte 一個單位):
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
ubig *fib = fib_sequence(*offset, size);
if (!fib)
return -1;
int ret_size = fib->size;
copy_to_user(buf, fib->cell, fib->size * sizeof(unsigned long long));
destroy_ubig(fib);
return ret_size;
}
到目前為止的實作在 8e0021f。
近日面試被問到假設 fibdrv 在 userspace 實現,會與 kernel module 有什麼差異,參照 Differences Between Kernel Modules and User Programs,其中提到 Kernel modules have higher execution privilege,故 kernel module 本身被分配到 cpu cycle 執行的機會就比 userspace application 高,所以比較快執行完畢。
以下是我將 fast doubling 做成 user space 的 function 來與 fibdrv 進行執行時間的比較(只計算
可以從圖中看出 kernel module 確實比 userspace application 快。
接下來測試計算費氏數列加上基數轉換所耗費的時間,發現基數轉換的效率非常差,是接下還需要改善的部份。
TODO: 引入 Schönhage–Strassen algorithm 來加速乘法運算,注意 threshold 的設定
jservImage Not Showing Possible ReasonsLearn More →
- The image file may be corrupted
- The server hosting the image is unavailable
- The image path is incorrect
- The image format is not supported
Schönhage–Strassen 的概念是將
令
將
對數列線性卷積:
2 66 99
x 1 88
-----------------------
176 5808 8712
2 66 99
-----------------------
2 242 5907 8712
8712
5907
242
+ 2
--------------
5019412
在 ubig
的乘法實作中,我打算以 32 bit 為界切割數值,並透過 64 bit 的乘法來計算線性卷積。為了方便上述運算,我將 ubig
儲存數值的資料型態由 unsigned long long
改為 unsigned int
。
typedef struct BigN {
int size;
unsigned int *cell;
} ubig;
我稍微調整乘法計算方式,按照下圖 col1
到 col5
順序計算卷積、處理進位,然後直接存到 dest
中:
// find the array index of MSB of a
// TODO: use binary search
static inline int ubig_msb_idx(ubig *a)
{
int msb_i = a->size - 1;
while (msb_i >= 0 && !a->cell[msb_i])
msb_i--;
return msb_i;
}
void ubig_mul(ubig *dest, ubig *a, ubig *b)
{
zero_ubig(dest);
// find the array index of the MSB of a, b
int msb_a = ubig_msb_idx(a);
int msb_b = ubig_msb_idx(b);
// a == 0 or b == 0 then dest = 0
if (msb_a < 0 || msb_b < 0)
return;
// calculate the length of vector
// after doing linear convolution
// i.e. column number
int length = msb_a + msb_b + 1;
/* do linear convolution */
unsigned long long carry = 0;
for (int i = 0; i < length; i++) {
unsigned long long col_sum = carry;
carry = 0;
int end = (i <= msb_b) ? i : msb_b;
int start = i - end;
for (int j = start, k = end; j <= end; j++, k--) {
unsigned long long product =
(unsigned long long) a->cell[k] * b->cell[j];
col_sum += product;
carry += (col_sum < product);
}
dest->cell[i] = (unsigned int) col_sum;
carry = (carry << 32) + (col_sum >> 32);
}
dest->cell[length] = carry;
}
完整實作在 b21a89a 的 ubig_schonhange_strassen.h
。
Karatsuba 的概念是將
則
上述算法計算
觀察
移項之後,我們就能利用
最後計算
以上便是 Karatsuba 演算法。
接下來我以兩個 8 bit 數值相乘變成 16 bit 來演示 Karatsuba(假 設所採用的處理器只支援 8 bit 乘法):
令
其中
接續上一個例子,當
由上圖可以看出計算
至此我可以透過分治法使用 Karatsuba 計算任意位數的大數。
我目前的實作並未考慮 ubig
可能需要 resize,所以需要大幅修改 ubig
的賦值、加法、左移等基本操作,才好實作 Karatsuba。
另一種解決辦法是將所有 unsigned int
陣列大小都統一為 128 KiB,這樣就不用在分治的時候處理不同長度的 unsigned int
陣列,但是計算較小的數會浪費記憶體。
此外,計算 ubig
物件,所以此實作是 not-in-place。以下為基於遞迴的 Karatsuba 的 ubig_mul
實作:
// modified addition for karatsuba
static inline void ubig_add_in_place(ubig *dest, ubig *src, int front, int end)
{
int carry = 0, i = 0;
for (int j = front; j < end; i++, j++) {
unsigned int tmp = dest->cell[i] + src->cell[j] + carry;
carry = (tmp < dest->cell[i]);
dest->cell[i] = tmp;
}
for (; i < dest->size; i++) {
unsigned int tmp = dest->cell[i] + carry;
carry = (tmp < dest->cell[i]);
dest->cell[i] = tmp;
}
}
// modified addition for karatsuba
static inline void ubig_add_in_place2(ubig *dest, int offset, ubig *src)
{
int carry = 0, i = offset;
for (int j = 0; j < src->size; i++, j++) {
unsigned int tmp = dest->cell[i] + src->cell[j] + carry;
carry = (tmp < dest->cell[i]);
dest->cell[i] = tmp;
}
for (; i < dest->size; i++) {
unsigned int tmp = dest->cell[i] + carry;
carry = (tmp < dest->cell[i]);
dest->cell[i] = tmp;
}
}
// modified substraction for karatsuba
static inline void ubig_sub_in_place(ubig *dest, ubig *src, int front, int end)
{
int carry = 0, i = 0;
for (int j = front; j < src->size && j < end; i++, j++) {
unsigned int tmp = dest->cell[i] - src->cell[j] - carry;
carry = (tmp > dest->cell[i]);
dest->cell[i] = tmp;
}
for (; i < dest->size; i++) {
unsigned int tmp = dest->cell[i] - carry;
carry = (tmp > dest->cell[i]);
dest->cell[i] = tmp;
}
}
static inline int ubig_msb_idx(ubig *a)
{
int msb_i = a->size - 1;
while (msb_i >= 0 && !a->cell[msb_i])
msb_i--;
return msb_i;
}
// do Karatsuba recursively
int mul_recursive(ubig *dest, ubig *x, ubig *y, int front, int end)
{
// termination 32 bit x 32 bit case
int size = end - front;
if (size == 1) {
unsigned long long product =
(unsigned long long) x->cell[front] * y->cell[front];
if (front * 2 < dest->size) {
dest->cell[front * 2] = product;
if (front * 2 + 1 < dest->size)
dest->cell[front * 2 + 1] = product >> 32;
}
return 1;
}
// dest = z2 * 2^(middle * 2) + z0 * 2^(front * 2)
int half_size = size / 2;
int middle = front + half_size;
int ret;
ret = mul_recursive(dest, x, y, middle, end);
if (!ret)
return 0;
ret = mul_recursive(dest, x, y, front, middle);
if (!ret)
return 0;
// tmp1 = (x0 + x1) and tmp2 = (y0 + y1)
ubig *tmp1 = new_ubig(half_size + 2);
ubig *tmp2 = new_ubig(half_size + 2);
ubig *z1 = new_ubig((half_size + 2) * 2);
if (!tmp1 || !tmp2 || !z1) {
destroy_ubig(tmp1);
destroy_ubig(tmp2);
destroy_ubig(z1);
return 0;
}
memcpy(tmp1->cell, x->cell + middle, (end - middle) * sizeof(unsigned int));
memcpy(tmp2->cell, y->cell + middle, (end - middle) * sizeof(unsigned int));
ubig_add_in_place(tmp1, x, front, middle); // x0 + x1
ubig_add_in_place(tmp2, y, front, middle); // y0 + y1
// z1 = (tmp1 * tmp2) - z0 - z2
int sz_1 = ubig_msb_idx(tmp1) + 1;
int sz_2 = ubig_msb_idx(tmp2) + 1;
int common_sz = sz_1 > sz_2 ? sz_1 : sz_2;
ret = mul_recursive(z1, tmp1, tmp2, 0, common_sz);
destroy_ubig(tmp1);
destroy_ubig(tmp2);
if (!ret) {
destroy_ubig(z1);
return 0;
}
ubig_sub_in_place(z1, dest, front * 2, front * 2 + half_size * 2);
ubig_sub_in_place(z1, dest, front * 2 + half_size * 2,
front * 2 + size * 2);
// dest = dest + z1 * 2^(front + middle)
ubig_add_in_place2(dest, front * 2 + half_size, z1);
destroy_ubig(z1);
return 1;
}
// multiplication entry point
int ubig_mul(ubig *dest, ubig *a, ubig *b)
{
zero_ubig(dest);
// don't use karatsuba if dest only has 32 bit
if (dest->size == 1) {
dest->cell[0] = a->cell[0] * b->cell[0];
return 1;
}
// find how many unsigned int are actually
// storing data
int sz_a = ubig_msb_idx(a) + 1;
int sz_b = ubig_msb_idx(b) + 1;
// if a == 0 or b == 0 then dest = 0
if (sz_a == 0 || sz_b == 0)
return 1;
int common_sz = sz_a > sz_b ? sz_a : sz_b;
return mul_recursive(dest, a, b, 0, common_sz);
}
完整實作在 9673a70 的 ubig_karatsuba.h
。
下圖是計算
單獨看 Karatsuba 與 Schönhage–Strassen,前者計算
接下來看計算較小數字時各演算法消耗的時間:
從圖中可以發現,計算
針對
unsigned int
陣列的長度次迭代,也就是