# 2022q1 Homework3 (fibdrv)
contributed by < `a12345645` >
[作業說明](https://hackmd.io/@sysprog/linux2022-fibdrv)
[github](https://github.com/a12345645/fibdrv)
## 實驗環境
```shell
$ gcc --version
gcc (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
$ lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 8
On-line CPU(s) list: 0-7
Thread(s) per core: 2
Core(s) per socket: 4
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 94
Model name: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
Stepping: 3
CPU MHz: 3700.003
CPU max MHz: 4000.0000
CPU min MHz: 800.0000
BogoMIPS: 6799.81
Virtualization: VT-x
L1d cache: 128 KiB
L1i cache: 128 KiB
L2 cache: 1 MiB
L3 cache: 8 MiB
NUMA node0 CPU(s): 0-7
```
## 測量原始程式執行時間
```c
static ktime_t kt;
static long long fib_time_proxy(long long k)
{
kt = ktime_get();
long long result = fib_sequence(k);
kt = ktime_sub(ktime_get(), kt);
return result;
}
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
return (ssize_t) fib_time_proxy(*offset);
}
static ssize_t fib_write(struct file *file,
const char *buf,
size_t size,
loff_t *offset)
{
return ktime_to_ns(kt);
}
```
<!--
首先使用作業說明中的程式碼來測量原本程式時間
下面為第一次產出的圖
並不是很好看
![](https://i.imgur.com/XmLWVQr.png)
-->
### 排除干擾
參考 [KYG-yaya573142 排除干擾效能分析的因素](https://hackmd.io/@KYWeng/rkGdultSU#%E6%8E%92%E9%99%A4%E5%B9%B2%E6%93%BE%E6%95%88%E8%83%BD%E5%88%86%E6%9E%90%E7%9A%84%E5%9B%A0%E7%B4%A0)
照著作業說明使用 taskset 指定行程使用的 CPU 核
- 查看 CPU 的核列表
```
$ taskset -cp 1
pid 1's current affinity list: 0-7
```
- 在 `/etc/default/grub` 下新增
```
GRUB_CMDLINE_LINUX="isolcpus=1,3"
```
重新載入 grub
```
$ sudo update-grub
```
使用 CPU 核 1 與 3 並且重新開機
- 重新開機後可以看到核心 1 與 3 就不會被自動分配給其他程式了
```shell
$ taskset -cp 1
pid 1's current affinity list: 0,2,4-7
```
抑制 address space layout randomization ([ASLR](https://en.wikipedia.org/wiki/Address_space_layout_randomization))
```
$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
```
設定 scaling_governor 為 performance
```
$ nano performance.sh
for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
do
echo performance > $i
done
$ sudo sh performance.sh
```
關閉 Intel 處理器的 turbo mode
```
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
```
稍微更改了 makefile 來指定 `./client` 執行時所用的處理器核
```
check: all
$(MAKE) unload
$(MAKE) load
sudo taskset 0x1 ./client > out
$(MAKE) unload
scripts/verify.py
make gnuplot
```
```
$ sudo taskset 0x1 ./client
```
![](https://i.imgur.com/5lb7RTh.png)
## 大數運算
使用 64 bit integer 只能正確計算到第 93 位
### uint128_t
使用 `__uint128_t` 做運算
```c
static __uint128_t fib_sequence_128_bit(long long k, __uint128_t *buf)
{
__uint128_t f[3];
f[0] = 0;
f[1] = 1;
for (int i = 2; i <= k; i++)
f[i % 3] = f[(i - 1) % 3] + f[(i - 2) % 3];
copy_to_user(buf, &f[k % 3], sizeof(__uint128_t));
return f[k % 3];
}
```
#### 驗證
透過 $F_{500}$ [The first 500 Fibonacci numbers](https://blog.abelotech.com/posts/first-500-fibonacci-numbers/) 驗證
先把回傳的 128 bit integer 轉成 string
```c
int uint128_to_string(__uint128_t t, char *buf, size_t buf_size)
{
memset(buf, 0, buf_size);
buf_size--; // '\0'
int digits = 0;
buf[digits] = '0';
while (t > 0 && digits < buf_size) {
buf[digits++] = t % 10 + '0';
t /= 10;
}
if(t)
return -1;
reverse_str(buf, digits);
return digits;
}
```
並使用 python 進行驗證
```python
import csv
with open('fib_500.txt', newline='') as verifyfile, open('out', newline='') as outfile:
answer = csv.reader(verifyfile, delimiter=' ')
out = csv.reader(outfile, delimiter=' ')
for i, j in zip(answer, out):
print(i[0], end=' ')
if(i[1] == j[1]) :
print('OK', i[1])
else:
print('Error', i[1], j[1])
```
確認可以正確運算到第 186 位 `332825110087067562321196029789634457848`
符合預期 $2^{128} - 1 \approx 3.4e+38$
### BigN
使用 [作業說明](https://hackmd.io/@sysprog/linux2022-fibdrv#%E8%A8%88%E7%AE%97-F93-%E5%8C%85%E5%90%AB-%E4%B9%8B%E5%BE%8C%E7%9A%84-Fibonacci-%E6%95%B8---%E4%BD%BF%E7%94%A8%E6%95%B8%E5%AD%97%E5%AD%97%E4%B8%B2%E4%B8%A6%E5%A5%97%E7%94%A8-quiz2-SSO-Small-String-Optimization) 中的 `BigN` 結構
```c
struct BigN {
unsigned long long lower, upper;
};
```
```c
static inline void addBigN(struct BigN *output, struct BigN x, struct BigN y)
{
output->upper = x.upper + y.upper;
if (y.lower > ~x.lower)
output->upper++;
output->lower = x.lower + y.lower;
}
```
```c
static long long fib_sequence_big_number(long long k, BigN *buf)
{
BigN f[3];
f[0].lower = f[0].upper = 0;
f[1].lower = 1;
f[1].upper = 0;
for (int i = 2; i <= k; i++) {
addBigN(&f[i % 3], f[(i - 1) % 3], f[(i - 2) % 3]);
}
copy_to_user(buf, &f[k % 3], sizeof(BigN));
return k;
}
```
#### 驗證
經比對如預期與 `__uint128_t` 相同
正確運算到第 186 位
```c
int bn_to_string(BigN *bn, char *buf, size_t buf_size)
{
memset(buf, 0, buf_size);
buf_size--; // '\0'
unsigned long mask = 1L << ((sizeof(long) << 3) - 1);
int digits = 1;
for (mask; mask != 0; mask >>= 1) {
char carry = (mask & bn->upper) > 0;
for (int i = 0; i < digits; i++) {
buf[i] = (buf[i] << 1) + carry;
carry = buf[i] >= 10;
buf[i] %= 10;
}
if (carry) {
if ((digits + 1) > buf_size)
return 0;
buf[digits++] = carry;
carry = 0;
}
}
mask = 1L << ((sizeof(long) << 3) - 1);
for (mask; mask != 0; mask >>= 1) {
char carry = (mask & bn->lower) > 0;
for (int i = 0; i < digits; i++) {
buf[i] = (buf[i] << 1) + carry;
carry = buf[i] >= 10;
buf[i] %= 10;
}
if (carry) {
if ((digits + 1) > buf_size)
return 0;
buf[digits++] = carry;
carry = 0;
}
}
for (int i = 0; i < digits; i++)
buf[i] += '0';
reverse_str(buf, digits);
return digits;
}
```
### uint128_t 與 BigN 計算效能
兩種方法的執行時間都差不多
可以確認使用迭代的方式都是線性成長
但是使用兩個 long 的運算比較複雜所以比較高
![](https://i.imgur.com/VDsQVP4.png)
### String number
參考 [作業說明](https://hackmd.io/@sysprog/linux2022-fibdrv#%E8%A8%88%E7%AE%97-F93-%E5%8C%85%E5%90%AB-%E4%B9%8B%E5%BE%8C%E7%9A%84-Fibonacci-%E6%95%B8---%E4%BD%BF%E7%94%A8%E6%95%B8%E5%AD%97%E5%AD%97%E4%B8%B2%E4%B8%A6%E5%A5%97%E7%94%A8-quiz2-SSO-Small-String-Optimization) 中的 `string_number_add` 函式
使用 string 儲存十進位數字
```c
typedef struct _str_num {
char *str;
int len;
int digits;
} str_num;
```
這樣的設計在記憶體分配沒有出問題的前提下
就不會有 overflow 的問題
可以順利的計算到 500 位
```c
static void add_str_num(str_num *output, str_num x, str_num y)
{
if (output->len < x.digits + 1) {
output->str = krealloc(output->str, output->len * 2, GFP_KERNEL);
output->len *= 2;
}
if (x.digits > y.digits)
y.str[y.digits] = 0;
int carry = 0, j;
for (j = 0; j < x.digits; j++) {
output->str[j] = x.str[j] + y.str[j] + carry;
carry = output->str[j] >= 10;
output->str[j] %= 10;
}
if (carry)
output->str[j++] = carry;
output->digits = j;
}
static long long fib_sequence_string_number(long long k, char *buf)
{
str_num f[3];
for (int i = 0; i < 3; i++) {
f[i].str = kmalloc(128, GFP_KERNEL);
f[i].len = 128;
f[i].digits = 1;
}
f[0].str[0] = 0;
f[1].str[0] = 1;
for (int i = 2; i <= k; i++) {
add_str_num(&f[i % 3], f[(i - 1) % 3], f[(i - 2) % 3]);
}
str_num *ans = &f[k % 3];
for (int i = 0; i < ans->digits; i++)
ans->str[i] += '0';
reverse_str(ans->str, ans->digits);
copy_to_user(buf, ans->str, ans->digits + 1);
for (int i = 0; i < 3; i++)
kfree(f[i].str);
return k;
}
```
<!--
#### 效能分析
因為在做相加的時候也是使用迴圈
所以越大的數相加就需要話越久的時間
因此是呈現冪增長
但是中間有幾個斷層
我想是因為計算累積要求記憶體到一個境界
產生 page fault 現象而導致的
![](https://i.imgur.com/Z0dPKgt.png)
-->
### Small/Short String Optimization
參考 [Small/Short String Optimization](http://wdv4758h.github.io/notes/cpp/string-optimization.html)
定義出結構
```c
typedef union {
struct {
char data[15];
uint8_t sso_size : 6, sso_size_msb : 1, sso_capacity_msb : 1;
};
struct {
char *ptr;
ulong size : 31, capacity : 31, size_msb : 1, capacity_msb : 1;
};
} SSO;
```
#### 初始化
會先判斷位數是否超過 String 結構的長度
決定是要是用自身儲存
還是要使用另外的 char 陣列
```c
void sso_init(SSO *str, ulong n)
{
uint size = n == 0, i;
ulong tmp = n;
memset(str, 0, sizeof(SSO));
while (tmp > 0) {
size++;
tmp /= 10;
}
if (size <= SSO_CAP) {
for (i = 0; i < size; i++) {
str->data[i] = n % 10;
n /= 10;
}
str->sso_size = SSO_CAP - size;
} else {
int capacity = size + (size % 8);
char *ptr = kcalloc(capacity, sizeof(char), GFP_KERNEL);
for (i = 0; i < size; i++) {
ptr[i] = n % 10;
n /= 10;
}
str->ptr = ptr;
str->capacity = capacity;
str->capacity_msb = (capacity & MSB) > 0;
str->size = size;
str->size_msb = !(size & MSB);
}
}
```
#### 相加
會先從 String 中取出值
判斷還需不需要為 `output` 增加空間
再來就與上面的字串加法一樣了
```c
void sso_add(SSO *output, SSO *x, SSO *y)
{
char *ans, *xn, *yn;
int ans_size, xn_size, yn_size, ans_cap, xn_cap, yn_cap;
get_sso_content(output, &ans, &ans_size, &ans_cap);
get_sso_content(x, &xn, &xn_size, &xn_cap);
get_sso_content(y, &yn, &yn_size, &yn_cap);
if (ans_cap < (xn_size + 2) || ans_cap < (yn_size + 2)) {
sso_realloc(output, ans_cap + 8);
get_sso_content(output, &ans, &ans_size, &ans_cap);
}
if (xn_size > yn_size)
yn[yn_size] = 0;
int carry = 0, j;
for (j = 0; j < xn_size; j++) {
ans[j] = xn[j] + yn[j] + carry;
carry = ans[j] >= 10;
ans[j] %= 10;
}
if (carry)
ans[j++] = carry;
sso_set_size(output, j);
}
```
#### 與 string number 分析
因為 SSO 也是使用字串作為基礎做大數加法
所以也是呈現冪增長
而 SSO 會比 string number 多了一點運算也比較頻繁的分配記憶體
目前驗證到 50000 位是可以正常運算的
![](https://i.imgur.com/gpvTWHp.png)
### bn
參考 [KYG-yaya573142 大數運算](https://hackmd.io/@KYWeng/rkGdultSU#%E5%A4%A7%E6%95%B8%E9%81%8B%E7%AE%97)
```c
typedef struct _bign {
uint *num;
size_t size;
uint len;
} bign;
```
#### 相加
```c
int bn_add(bign *out, const bign *bn1, const bign *bn2)
{
const bign *x, *y;
if (bn1->len > bn2->len) {
x = bn1;
y = bn2;
} else {
x = bn2;
y = bn1;
}
if (out->size < max(x->len, y->len) + 1)
bn_resize(out, max(x->len, y->len) + 2);
uint i = 0;
ulong carry = 0;
for (i = 0; i < y->len; i++) {
carry = x->num[i] + carry + y->num[i];
out->num[i] = carry;
carry >>= 32;
}
for (i; i < x->len; i++) {
carry = x->num[i] + carry;
out->num[i] = carry;
carry >>= 32;
}
if (carry) {
out->num[i] = carry;
i++;
}
out->len = i;
return 1;
}
```
#### 與上述大數運算分析
因為一次是使用一個 int 加法
做的加法比較少次
所以成長的幅度比較小
![](https://i.imgur.com/oaD05Ya.png)
## fast doubling
參考了作業要求中的公式實作
![](https://i.imgur.com/nMsJoPt.png)
```c
for (mask; mask > 0; mask >>= 1) {
bn_mult(&t1, &f1, &f1);
bn_mult(&t2, &f2, &f2);
bn_add(&t3, &t1, &t2);
bn_lshift(&f2, 1);
bn_sub(&t1, &f2, &f1);
bn_cpy(&t2, &f1);
bn_mult(&f1, &t1, &t2);
bn_cpy(&f2, &t3);
if (k & mask) {
bn_cpy(&t1, &f1);
bn_cpy(&t2, &f2);
bn_cpy(&f1, &f2);
bn_add(&f2, &t2, &t1);
}
}
```
原本在計算時間上發現相差不多
![](https://i.imgur.com/wRa23HW.png)
後來發現是因為我把計算從大數結構轉成字串的函式放在核心執行
這樣花了很多時間需要優化這段函式
![](https://i.imgur.com/mowzUP2.png)
## Mutex
在計算費式數列時
因為在輸入再要計算在第幾位是存在 `loff_t *offset` 所以不同的 thread 進入會附蓋掉前一個
```c
void *ret;
fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
pthread_t t1;
pthread_create(&t1, NULL, fib, 1);
pthread_t t2;
pthread_create(&t2, NULL, fib, 2);
pthread_join(t1, &ret);
pthread_join(t2, &ret);
close(fd);
```
所以在 `read` 函式裡面傳的 buffer 的長度改程需要計算的位數可以正常執行
或是在buffer的前幾位改成計算的位數就會不發生衝突了