contributed by <arthurchang09
>
限定 CPU 給特定的程式使用
進入 /etc/default/grub
$ sudo vim /etc/default/grub
看到以下這行
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
預計要將 cpu 0
獨立出來,將上方做更改
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=0"
接著輸入
$ sudo update-grub
重新開機即可,系統管理員顯示 CPU 0
佔用為
若須程式放在 cpu 0
執行,使用 taskset
taskset 0x01 ./client
address space layout randomization (ASLR)
$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
performance.sh
:for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
do
echo performance > ${i}
done
之後再用 $ sudo sh performance.sh
執行
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
參考 Fast doubling的作法以及以下虛擬碼
Fast_Fib(n)
a = 0; b = 1; // m = 0
for i = (number of binary digit in n) to 1
t1 = a*(2*b - a);
t2 = b^2 + a^2;
a = t1; b = t2; // m *= 2
if (current binary digit == 1)
t1 = a + b; // m++
a = b; b = t1;
return a;
為了得到 number of binary digit in n
,採用一個 for
迴圈計算 n
的最高 bit 為 1 的位置,並使用 mask
記下,隨著迴圈進行逐步右移,如此一來可以判斷某 bit 是否為 1。
uint64_t my_fib(unsigned int n){
unsigned int h = 0;
for (unsigned int i = n; i; i>>=1 ){
h++;
}
uint64_t a = 0;
uint64_t b = 1;
for (unsigned int mask = 1 << (h - 1); mask; mask >>= 1){
uint64_t t1 = a*(2*b - a);
uint64_t t2 = a*a + b*b;
a = t1;
b = t2;
if (mask & n) {
t1 = a + b;
a = b;
b = t1;
}
}
return a;
}
在 考慮到硬體加速 F(n) 的手法 中有提到 clz
這一類硬體加速指令。而在 GCC 中有提供 __builtin_clz ,考慮到 __builtin_clz(0)
為 undefined behavior ,增加一個條件判斷 n
是否為 0 ,由於此情況非常少見,參考 lab0 中 list_sort 的 unlikely
提供編譯器優化。
#define unlikely(x) __builtin_expect(!!(x), 0)
uint64_t my_fib(unsigned int n)
{
if (unlikely(!n)) {
return 0;
}
uint64_t a = 0;
uint64_t b = 1;
for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
uint64_t t1 = a * (2 * b - a);
uint64_t t2 = a * a + b * b;
a = t1;
b = t2;
if (mask & n) {
t1 = a + b;
a = b;
b = t1;
}
}
return a;
}
用以下程式碼測量執行時間
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
//return (ssize_t) fib_sequence(*offset);
ktime_t kt = ktime_get();
ssize_t add = fib_sequence(*offset);
kt = ktime_sub(ktime_get(), kt);
ktime_t kt2 = ktime_get();
unsigned long long ans = fib_fast_doubling(*offset);
kt2 = ktime_sub(ktime_get(), kt2);
ktime_t kt3 = ktime_get();
unsigned long long ans1 = fib_fast_doubling_clz(*offset);
kt3 = ktime_sub(ktime_get(), kt3);
printk(KERN_INFO "%lld %lld %lld %lld", *offset, ktime_to_ns(kt),
ktime_to_ns(kt2), ktime_to_ns(kt3));
return add;
}
比較兩這的執行時間得下圖
可看出 fast doubling 的兩個版本均比 iteration 還要快,且計算時間相對平穩。然而,有 clz
的版本似乎較沒有 clz
者慢。根據 KYG-yaya573142,可能是被優化掉,因此查看了組合語言
ktime_t kt3 = ktime_get();
1ad: 49 89 c7 mov %rax,%r15
if (unlikely(!n)) {
1b0: 48 85 c9 test %rcx,%rcx
1b3: 74 15 je 1ca <fib_read+0x10a>
for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
1b5: 0f bd c9 bsr %ecx,%ecx
1b8: b8 01 00 00 00 mov $0x1,%eax
1bd: 83 f1 1f xor $0x1f,%ecx
1c0: d3 e0 shl %cl,%eax
1c2: 85 c0 test %eax,%eax
1c4: 74 04 je 1ca <fib_read+0x10a>
1c6: d1 e8 shr %eax
1c8: 75 fc jne 1c6 <fib_read+0x106>
並沒有被優化掉,
尚欠效能比較…
採用以下之 struct
紀錄大數,目前先採用
#define LENGTH 8
typedef struct bn {
uint32_t num[LENGTH];
} bn;
接著加法、減法、左移需要作出相應的調整。
void bn_add(bn a, bn b, bn *sum)
{
uint32_t N[LENGTH];
uint64_t c = 0;
for (int i = 0; i < LENGTH; ++i) {
c += (uint64_t) a.num[i] + b.num[i];
N[i] = c;
c >>=32;
}
for (int i = LENGTH - 1; i >= 0; --i) {
sum->num[i] = N[i];
}
}
加法先從較低位元開始相加,並將結果存入 uint64_t
以保留進位,接著將除了進位以外的總和存起來,並將 c
右移 32 位,即會成為下一組 32 bits 的進位。
以 8 bits 為例,如下面的直式,做完加法後產生 carry ,位在第 8 bit 。
存入對應的 N
陣列後,將 c
右移 8 bit ,即是下一組較高位 8 bits 要再 +1 的位置。
void bn_dec(bn a, bn b, bn *diff) // a - b
{
uint32_t N[LENGTH];
uint32_t br = 0;
for (int i = 0; i < LENGTH; ++i) {
if (a.num[i] < (b.num[i] + br)) {
N[i] = UINT32_MAX - b.num[i] + a.num[i] - br + 1;
br = 1;
} else {
N[i] = a.num[i] - b.num[i] - br;
br = 0;
}
}
for (int i = LENGTH - 1; i >= 0; --i) {
diff->num[i] = N[i];
}
}
減法要考慮 borrow ,當 a
比 b + br
小的時候就需要借位。為了避免 overflow , a
和 b
不能先相減,這裡使用 UINT32_MAX
先減掉 b
,再依序加上 a
減去 br
。由於 UINT_MAX
為
void bn_lshift(bn a, int shift, bn *res)
{
int shift_32b_amount = shift >> 5;
memcpy(res, &a, sizeof(bn));
uint32_t tmp[shift_32b_amount + 1];
uint32_t tmp2[shift_32b_amount + 1];
uint32_t i = 0;
int shift_bit = shift & 0x0000001f;
memset(tmp, 0, sizeof(int) * shift_32b_amount);
if (shift_32b_amount){
for (int i = 0; i < shift_32b_amount; ++i) {
tmp[i] = res->num[i];
res->num[i] = 0;
}
for (i = shift_32b_amount; i <= LENGTH - shift_32b_amount; i += shift_32b_amount) {
memcpy(tmp2, &res->num[i], sizeof(uint32_t)* shift_32b_amount);
memcpy(&res->num[i], tmp, sizeof(uint32_t)* shift_32b_amount);
memcpy(tmp, tmp2, sizeof(uint32_t)*shift_32b_amount);
}
if (shift_32b_amount >= (LENGTH >> 1) + 1) {
memcpy(&res->num[shift_32b_amount], tmp,
sizeof(uint32_t)*(LENGTH - shift_32b_amount));
}
}
if (shift_bit) {
tmp[0] = (res->num[0] >> (32 - shift_bit));
res->num[0] <<= shift_bit;
for (i = 0; i < LENGTH - 1; ++i){
tmp2[0] = (res->num[i + 1] >> (32 - shift_bit)) ;
res->num[i + 1] = res->num[i + 1] << shift_bit | tmp[0];
tmp[0] = tmp2[0];
}
}
}
上方的程式碼是左移的實作。先計算總共位移幾組 32 bits 和剩下不到 32 bits 之位移量。接著先處理 32 bits 位移,用 memcpy
複製數值到較高位元。剩下不到 32 bits 位移先取最高位的 bit 再和 左移後較高位的 32 bits 做 |
。
void bn_mul_1(bn a, bn b, bn *res)
{
//printf("%d\n", clz(0));
uint64_t tmp[LENGTH];
memset(tmp, 0,sizeof(uint64_t) * LENGTH);
for (int i = 0; i < LENGTH; ++i) {
for (int j = 0; j < LENGTH; ++j) {
if ((i + j) < LENGTH ) {
tmp[i + j] += (uint64_t) a.num[i] * b.num[j];
}
}
}
for (int i = 1; i < LENGTH; ++i) {
tmp[i] += tmp[i - 1] >> 32;
tmp[i - 1] &= 0xffffffff;
}
for (int i = LENGTH - 1; i >= 0; --i) {
res->num[i] = (uint32_t) tmp[i];
}
}
上方的程式碼為初始版本之乘法實作,為直式乘法的實作。然而,當計算到第 190 項時會出現錯誤,原因是第 9 行的加法式可能出現 overflow。 參考 KYG-yaya573142 作法將乘法後的結果依序加到較高位的 bit 。
void bn_mul_2(bn a, bn b, bn *res)
{
uint64_t tmp[LENGTH];
uint32_t t[LENGTH];
uint64_t c1 = 0,c2 = 0;
memset(tmp, 0,sizeof(uint64_t) * LENGTH);
memset(t, 0,sizeof(uint32_t) * LENGTH);
for (int i = 0; i < LENGTH; ++i) {
for (int j = 0; j < LENGTH; ++j) {
if ((i + j) < LENGTH ) {
c1 = (uint64_t) a.num[i] * b.num[j];
c2 = 0;
for (int k = i + j; k < LENGTH; ++k) {
c2 += (uint64_t) t[k] + (c1 & 0xffffffff);
t[k] = c2 & 0xffffffff;
c2 >>= 32;
c1 >>= 32;
if (!c1 && !c2) {
break;
}
}
}
}
}
for (int i = LENGTH - 1; i >= 0; --i) {
res->num[i] = t[i];
}
}
為了將這些大數印出,參考 OscarShiang 的方式將大數轉成十進位印出,
char * bn_to_string(bn a){
char s[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
uint32_t n[LENGTH];
int i;
memset(s, '0', sizeof(s) - 1);
memcpy(n, a.num, sizeof(uint32_t) * LENGTH);
s[sizeof(s) - 1] = '\0';
//printf("%ld\n", sizeof(a));
for (i = 0 ; i < 8 * sizeof(uint32_t) * LENGTH; ++i) {
int j, carry;
carry = (n[LENGTH - 1] >= 0x80000000);
for (int j = LENGTH - 1; j >= 0; --j) {
n[j] = ((n[j] << 1) & 0xffffffff) + ((j - 1) >= 0 ? (n[j - 1] >= 0x80000000) : 0);
}
for (int j = sizeof(s) - 2; j >= 0; j--) {
s[j] += s[j] - '0' + carry;
carry = (s[j] > '9');
if (carry)
s[j] -= 10;
}
}
i = 0;
while (i < sizeof(s) - 2 && s[i] == '0')
i++;
char *dec = (char *)malloc(sizeof(s) - i);
memcpy(dec, &s[i], sizeof(s) - i);
return dec;
}
以下為 iteration 的作法
void bn_norm_fib(unsigned int n, bn *res)
{
if (unlikely(!n)) {
memset(res, 0, sizeof(bn));
bn_to_string(*res);
return;
}
bn a, b;
memset(&a, 0, sizeof(bn));
memset(&b, 0, sizeof(bn));
memset(res, 0, sizeof(bn));
a.num[0] = 0;
b.num[0] = 1;
res->num[0] = 1;
for (int i = 1; i < n; ++i){
bn_add(a, b, res);
memcpy(&a, &b, sizeof(bn));
memcpy(&b, res, sizeof(bn));
}
}
void bn_fib(unsigned int n, bn *res)
{
if (unlikely(!n)) {
memset(res, 0, sizeof(bn));
bn_to_string(*res);
return ;
}
unsigned int h = 0;
bn a,b;
memset(&a, 0, sizeof(bn));
memset(&b, 0, sizeof(bn));
a.num[0] = 0;
b.num[0] = 1;
for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
bn t1;
bn t2;
memset(&t1, 0, sizeof(bn));
memset(&t2, 0, sizeof(bn));
bn_lshift(b, 1, &t1);
bn_dec(t1, a, &t1);
bn_mul_2(a, t1, &t1);
bn_mul_2(a, a, &t2);
bn_mul_2(b, b, &b);
bn_add(t2, b, &t2);
memcpy(&a, &t1, sizeof(bn));
memcpy(&b, &t2, sizeof(bn));
if (mask & n) {
bn_add(a,b, &t1);
memcpy(&a, &b, sizeof(bn));
memcpy(&b, &t1, sizeof(bn));
}
}
memcpy(res, &a, sizeof(bn));
}
接著修改 fibdrv.c
,直接使用 copy_to_user
將字串傳到 client.c
。
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn res;
bn_fib(*offset, &res);
char *str = bn_to_string(res);
size_t len = strlen(str) + 1;
copy_to_user(buf, str, len);
kfree(str);
return fib_sequence(*offset);
}
修改 client.c
使其能接收到字串
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#define FIB_DEV "/dev/fibonacci"
int main()
{
long long sz;
char buf[1];
char write_buf[] = "testing writing";
int offset = 100; /* TODO: try test something bigger than the limit */
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = 0; i <= offset; i++) {
sz = write(fd, write_buf, strlen(write_buf));
printf("Writing to " FIB_DEV ", returned the sequence %lld\n", sz);
}
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, 1);
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%s.\n",
i, buf);
}
for (int i = offset; i >= 0; i--) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, 1);
printf("Reading from " FIB_DEV
" at offset %d, returned the sequence "
"%s.\n",
i, buf);
}
close(fd);
return 0;
}
運行 client.c
發生以下錯誤
*** stack smashing detected ***: terminated
Aborted
參考 blueskyson 修改
int bn_to_string(bn a, char str[])
{
char s[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
unsigned int n[LENGTH];
int i;
memset(s, '0', sizeof(s) - 1);
memcpy(n, a.num, sizeof(uint32_t) * LENGTH);
s[sizeof(s) - 1] = '\0';
// printf("%ld\n", sizeof(a));
for (i = 0; i < 8 * sizeof(uint32_t) * LENGTH; ++i) {
int carry;
carry = (n[LENGTH - 1] >= 0x80000000);
for (int j = LENGTH - 1; j >= 0; --j) {
n[j] = ((n[j] << 1) & 0xffffffff) +
((j - 1) >= 0 ? (n[j - 1] >= 0x80000000) : 0);
}
for (int j = sizeof(s) - 2; j >= 0; j--) {
s[j] += s[j] - '0' + carry;
carry = (s[j] > '9');
if (carry)
s[j] -= 10;
}
}
i = 0;
while (i < sizeof(s) - 2 && s[i] == '0')
i++;
//char *dec = (char *) kmalloc(sizeof(s) - i, GFP_KERNEL);
//memcpy(dec, &s[i], sizeof(s) - i);
memcpy(str, s, sizeof(s));
return i;
}
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn res;
bn_fib(*offset, &res);
char str[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
int len = bn_to_string(res, str);
copy_to_user(buf, str + len, 8 * sizeof(uint32_t) * LENGTH / 3 + 2 - len);
return fib_sequence(*offset);
}
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#define FIB_DEV "/dev/fibonacci"
#define LENGTH 22
#define BUFFSIZE 8 * sizeof(int) * LENGTH / 3 + 2
int main()
{
long long sz;
char buf[BUFFSIZE];
char write_buf[] = "testing writing";
int offset = 1000; /* TODO: try test something bigger than the limit */
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
sz = read(fd, buf, BUFFSIZE);
printf(" offset %d, returned the sequence "
"%s.\n",
i, buf);
}
close(fd);
return 0;
}
首先在 userspace 測試
sudo perf stat -r 15 -e cycles,instructions,cache-references,cache-misses,branch,branch-instructions,branch-misses ./a.out
採用 perf
查看執行費波納契數列第 0 項到第 500 項之 cache miss 和 branch miss。
Performance counter stats for './a.out' (15 runs):
7995,7081 cycles ( +- 0.30% ) (53.67%)
1,8737,9429 instructions # 2.34 insn per cycle ( +- 0.06% ) (69.11%)
7,6359 cache-references ( +- 1.39% ) (69.11%)
1,8337 cache-misses # 24.015 % of all cache refs ( +- 6.72% ) (69.11%)
2689,1258 branch ( +- 0.05% ) (71.15%)
2727,4701 branch-instructions ( +- 0.04% ) (77.22%)
4,4548 branch-misses # 0.16% of all branches ( +- 10.93% ) (59.74%)
0.0261203 +- 0.0000789 seconds time elapsed ( +- 0.30% )
Performance counter stats for './a.out' (15 runs):
2951,5320 cycles ( +- 0.47% ) (24.03%)
5480,8003 instructions # 1.86 insn per cycle ( +- 0.42% ) (65.18%)
4,8197 cache-references ( +- 2.72% ) (98.36%)
1,1488 cache-misses # 23.836 % of all cache refs ( +- 12.41% )
489,4013 branch ( +- 0.01% )
491,1414 branch-instructions ( +- 0.09% ) (75.97%)
3,3682 branch-misses # 0.69% of all branches ( +- 37.92% ) (1.64%)
0.009944 +- 0.000117 seconds time elapsed ( +- 1.17% )
Fast Doubling 在 Instruction per cycle(IPC) 和 Branch miss 上表現均優於 Iteration ,然而總執行的指令多非常多,因此執行時間也較多。下方的圖也顯示這一點。
下圖是執行 0 到 1000 項之結果
參考 Risheng1128 將 CPU 0 獨立出來執行
接著按照 Linux 效能分析的提示 抑制 address space layout randomization (ASLR),設定 scaling_governor 為 performance ,關閉 turbo mode。得到以下圖表
非常明顯 bn-fast-doubling
較 bn_iteration
慢。
$ valgrind --tool=cachegrind --branch-sim=yes ./a.out
==23370== Cachegrind, a cache and branch-prediction profiler
==23370== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==23370== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==23370== Command: ./a.out
==23370==
--23370-- warning: L3 cache found, using its data for the LL simulation.
==23370==
==23370== I refs: 187,585,847
==23370== I1 misses: 1,075
==23370== LLi misses: 1,053
==23370== I1 miss rate: 0.00%
==23370== LLi miss rate: 0.00%
==23370==
==23370== D refs: 102,874,241 (86,658,760 rd + 16,215,481 wr)
==23370== D1 misses: 3,201 ( 2,542 rd + 659 wr)
==23370== LLd misses: 2,663 ( 2,069 rd + 594 wr)
==23370== D1 miss rate: 0.0% ( 0.0% + 0.0% )
==23370== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==23370==
==23370== LL refs: 4,276 ( 3,617 rd + 659 wr)
==23370== LL misses: 3,716 ( 3,122 rd + 594 wr)
==23370== LL miss rate: 0.0% ( 0.0% + 0.0% )
==23370==
==23370== Branches: 22,906,501 (22,779,656 cond + 126,845 ind)
==23370== Mispredicts: 1,147,376 ( 1,147,235 cond + 141 ind)
==23370== Mispred rate: 5.0% ( 5.0% + 0.1% )
--------------------------------------------------------------------------------
I1 cache: 32768 B, 64 B, 8-way associative
D1 cache: 32768 B, 64 B, 8-way associative
LL cache: 3145728 B, 64 B, 12-way associative
Command: ./a.out
Data file: cachegrind.out.26188
Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Thresholds: 0.1 100 100 100 100 100 100 100 100 100 100 100 100
Include dirs:
User annotated:
Auto-annotation: on
--------------------------------------------------------------------------------
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
--------------------------------------------------------------------------------
186,025,668 971 951 86,059,765 1,336 1,146 16,109,504 566 545 22,649,428 1,144,320 126,819 128 PROGRAM TOTALS
--------------------------------------------------------------------------------
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim file:function
--------------------------------------------------------------------------------
163,840,484 8 8 75,946,074 0 0 12,966,362 0 0 21,096,210 1,070,152 0 0 /home/arthurchang09/quiz2/test.c:bn_mul_2
6,913,606 13 13 2,850,456 0 0 725,116 0 0 237,538 12,525 0 0 /home/arthurchang09/quiz2/test.c:bn_lshift
5,839,347 4 4 2,937,970 0 0 525,084 0 0 487,578 26,360 0 0 /home/arthurchang09/quiz2/test.c:bn_dec
5,462,604 2 2 2,930,076 0 0 441,720 0 0 397,548 29,486 0 0 /home/arthurchang09/quiz2/test.c:bn_add
1,763,169 18 18 1,082,999 0 0 1,136,227 2 1 26,506 3,128 0 0 /home/arthurchang09/quiz2/test.c:bn_fib
1,662,276 5 5 113,519 0 0 277,046 3 2 352,058 9 0 0 /build/glibc-eX1tMB/glibc-2.31/string/../sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:__memset_avx2_unaligned_erms
253,223 17 16 126,584 1 0 11 1 1 10 5 126,562 11 ???:???
==25942== Cachegrind, a cache and branch-prediction profiler
==25942== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25942== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25942== Command: ./a.out
==25942==
--25942-- warning: L3 cache found, using its data for the LL simulation.
==25942==
==25942== I refs: 52,946,905
==25942== I1 misses: 927
==25942== LLi misses: 912
==25942== I1 miss rate: 0.00%
==25942== LLi miss rate: 0.00%
==25942==
==25942== D refs: 34,742,105 (28,234,653 rd + 6,507,452 wr)
==25942== D1 misses: 1,902 ( 1,336 rd + 566 wr)
==25942== LLd misses: 1,691 ( 1,146 rd + 545 wr)
==25942== D1 miss rate: 0.0% ( 0.0% + 0.0% )
==25942== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==25942==
==25942== LL refs: 2,829 ( 2,263 rd + 566 wr)
==25942== LL misses: 2,603 ( 2,058 rd + 545 wr)
==25942== LL miss rate: 0.0% ( 0.0% + 0.0% )
==25942==
==25942== Branches: 3,899,539 ( 3,772,990 cond + 126,549 ind)
==25942== Mispredicts: 252,834 ( 252,706 cond + 128 ind)
==25942== Mispred rate: 6.5% ( 6.7% + 0.1% )
--------------------------------------------------------------------------------
I1 cache: 32768 B, 64 B, 8-way associative
D1 cache: 32768 B, 64 B, 8-way associative
LL cache: 3145728 B, 64 B, 12-way associative
Command: ./a.out
Data file: bn_norm_cachegrind.out.25942
Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Thresholds: 0.1 100 100 100 100 100 100 100 100 100 100 100 100
Include dirs:
User annotated:
Auto-annotation: on
--------------------------------------------------------------------------------
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
--------------------------------------------------------------------------------
52,946,905 927 912 28,234,653 1,336 1,146 6,507,452 566 545 3,772,990 252,706 126,549 128 PROGRAM TOTALS
--------------------------------------------------------------------------------
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim file:function
--------------------------------------------------------------------------------
46,282,250 3 3 24,825,250 0 0 3,742,500 1 0 3,368,250 249,538 0 0 /home/arthurchang09/quiz2/test.c:bn_add
4,762,025 6 6 2,874,257 0 0 2,500,505 3 3 126,252 510 0 0 /home/arthurchang09/quiz2/test.c:bn_norm_fib
1,497,000 2 2 374,250 0 0 249,500 0 0 249,500 0 0 0 /build/glibc-eX1tMB/glibc-2.31/string/../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:__memcpy_avx_unaligned_erms
252,683 17 16 126,314 1 0 11 1 1 10 5 126,292 11 ???:???
將以上重要資訊整理如下
Bn Fast Doubling | Bn Iteration | |
---|---|---|
perf instructions | 1,8737,9429 | 5480,8003 |
perf cache-references | 7,6359 | 4,8197 |
perf cache miss | 1,8337 | 1,1488 |
perf branch | 2689,1258 | 489,4013 |
perf branch miss | 4,4548 | 3,3682 |
perf branch miss rate | 0.16% | 0.69% |
cachegrind LL cache miss | 3,716 | 2,603 |
cachegrind Branch | 22,906,501 | 3,899,539 |
cachegrind Branch miss | 1,147,376 | 252,834 |
cachegrind Branch miss rate | 5.0% | 6.5% |
可以發現 Bn Fast doubling 執行的指令數約為 iteration 版的 3 倍多。雖然有較低 cache miss rate 和 Branch miss rate ,但仍無助於減少執行時間。
先從 cachegrind 的數據觀察,可以發現 bn_mul2
之 Ir
Dr
Dw
比其他高出非常多,即此部分程式執行最多次,大部分的 Branch miss 發生在此,因此這個部份極有可能是拖慢執行速度的原因。
--------------------------------------------------------------------------------
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim file:function
--------------------------------------------------------------------------------
163,840,484 8 8 75,946,074 0 0 12,966,362 0 0 21,096,210 1,070,152 0 0 /home/arthurchang09/quiz2/test.c:bn_mul_2
接著看函式執行的狀況,三層的巢狀迴圈是造成大量 branch miss ,也是最多 Ir
之處,大部分 cycle 都是執行此處指令。
. . . . . . . . . . . . . void bn_mul_2(bn a, bn b, bn *res)
300,048 2 2 37,506 0 0 112,518 0 0 0 0 0 0 {
. . . . . . . . . . . . . uint64_t tmp[LENGTH];
. . . . . . . . . . . . . uint32_t t[LENGTH];
75,012 0 0 0 0 0 75,012 0 0 0 0 0 0 uint64_t c1 = 0,c2 = 0;
187,530 0 0 0 0 0 37,506 0 0 0 0 0 0 memset(tmp, 0,sizeof(uint64_t) * LENGTH);
187,530 1 1 0 0 0 37,506 0 0 0 0 0 0 memset(t, 0,sizeof(uint32_t) * LENGTH);
1,500,240 1 1 937,650 0 0 37,506 0 0 487,578 37,516 0 0 for (int i = 0; i < LENGTH; ++i) {
18,002,880 0 0 11,251,800 0 0 450,072 0 0 5,850,936 525,100 0 0 for (int j = 0; j < LENGTH; ++j) {
27,004,320 1 1 10,801,728 0 0 0 0 0 5,400,864 450,217 0 0 if ((i + j) < LENGTH ) {
29,254,680 0 0 11,701,872 0 0 2,925,468 0 0 0 0 0 0 c1 = (uint64_t) a.num[i] * b.num[j];
2,925,468 0 0 0 0 0 2,925,468 0 0 0 0 0 0 c2 = 0;
20,558,652 2 2 8,829,988 0 0 2,925,468 0 0 2,952,260 16 0 0 for (int k = i + j; k < LENGTH; ++k) {
23,618,080 0 0 11,809,040 0 0 0 0 0 0 0 0 0 c2 += (uint64_t) t[k] + (c1 & 0xffffffff);
14,761,300 0 0 5,904,520 0 0 2,952,260 0 0 0 0 0 0 t[k] = c2 & 0xffffffff;
2,952,260 0 0 2,952,260 0 0 0 0 0 0 0 0 0 c2 >>= 32;
2,952,260 0 0 2,952,260 0 0 0 0 0 0 0 0 0 c1 >>= 32;
11,758,976 0 0 5,879,488 0 0 0 0 0 5,879,488 19,779 0 0 if (!c1 && !c2) {
2,925,468 0 0 0 0 0 0 0 0 0 0 0 0 break;
. . . . . . . . . . . . . }
. . . . . . . . . . . . . }
. . . . . . . . . . . . . }
. . . . . . . . . . . . . }
. . . . . . . . . . . . . }
1,500,240 1 1 937,650 0 0 37,506 0 0 487,578 37,522 0 0 for (int i = LENGTH - 1; i >= 0; --i) {
3,150,504 0 0 1,800,288 0 0 450,072 0 0 0 0 0 0 res->num[i] = t[i];
. . . . . . . . . . . . . }
225,036 0 0 150,024 0 0 0 0 0 37,506 2 0 0 }
接著使用 perf record 觀察,
sudo perf record -g --call-graph dwarf ./a.out
sudo perf report --stdio -g graph,0.5,caller
以下節錄其中的片段
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 659 of event 'cycles'
# Event count (approx.): 502472023
#
# Children Self Command Shared Object Symbol
# ........ ........ ....... ................. ..................................
#
99.41% 0.00% a.out a.out [.] _start
|
---_start
__libc_start_main
main
bn_fib
|
|--89.69%--bn_mul_2
| |
| --1.53%--__memset_avx2_unaligned_erms
|
|--3.06%--bn_lshift
|
|--2.51%--bn_dec
|
|--2.17%--bn_add
|
--0.76%--__memset_avx2_unaligned_erms
可發現 bn_mul_2
佔據 89% ,剩下的 bn_lshift
、 bn_dec
和 bn_add
占比就比較少,因此必須設法提高乘法的效率。
因此修改 bn_mul_2
,修改長乘法的方式,
之前的作法是依照長乘法的習慣先由右而左,由上而下做乘法,即先算 bn_add
一樣將可能的 carry 向高位數加,因此需要三層巢狀迴圈。因此我將計算順序稍作修改,先由上而下,由右而左,也就是計算 for
迴圈。
void bn_mul_3(bn a, bn b, bn *res) {
uint64_t c1 = 0,c2 = 0,carry;
memset(res, 0, sizeof(bn));
for (int i = 0; i < LENGTH; ++i) {
c1 = carry;
carry = 0;
for (int j = 0; j <= i; ++j) {
c2 = (uint64_t) a.num[j] * b.num[i - j];
c1 += c2;
carry += c1 < c2;
}
res->num[i] = c1 & 0xffffffff;
carry = (carry << 32) + (c1 >> 32);
}
}
接著在 userspace 用timespec
測量時間,如下:
很明顯減少了約 50% 的執行時間
使用 perf
測量
sudo perf stat -r 15 -e cycles,instructions,cache-references,cache-misses, branch,branch-instructions,branch-misses taskset 0x01 ./a.out
Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):
2,0095,7615 cycles ( +- 0.07% ) (51.96%)
3,7231,5532 instructions # 1.85 insn per cycle ( +- 0.03% ) (70.37%)
15,6095 cache-references ( +- 1.02% ) (75.08%)
3,6236 cache-misses # 23.214 % of all cache refs ( +- 11.92% ) (75.43%)
3037,2291 branch ( +- 0.04% ) (75.43%)
3035,1483 branch-instructions ( +- 0.02% ) (72.61%)
5668 branch-misses # 0.02% of all branches ( +- 23.54% ) (49.49%)
0.0654597 +- 0.0000478 seconds time elapsed ( +- 0.07% )
Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):
5,1662,8092 cycles ( +- 0.23% ) (56.97%)
10,7807,0383 instructions # 2.09 insn per cycle ( +- 0.02% ) (71.31%)
17,5168 cache-references ( +- 1.06% ) (71.31%)
4,4469 cache-misses # 25.386 % of all cache refs ( +- 9.10% ) (71.31%)
1,5608,3228 branch ( +- 0.03% ) (71.41%)
1,5656,4076 branch-instructions ( +- 0.03% ) (71.72%)
123,2533 branch-misses # 0.79% of all branches ( +- 2.46% ) (57.28%)
0.167680 +- 0.000369 seconds time elapsed ( +- 0.22% )
Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):
2,5550,7791 cycles ( +- 0.24% ) (56.53%)
5,0979,0980 instructions # 2.00 insn per cycle ( +- 0.06% ) (71.02%)
17,1360 cache-references ( +- 1.23% ) (71.02%)
3,7991 cache-misses # 22.170 % of all cache refs ( +- 7.09% ) (71.02%)
2850,6292 branch ( +- 0.11% ) (71.36%)
2833,2169 branch-instructions ( +- 0.06% ) (72.45%)
18,4534 branch-misses # 0.65% of all branches ( +- 3.07% ) (57.62%)
0.083148 +- 0.000213 seconds time elapsed ( +- 0.26% )
簡單整理如下
itseration | fast doubling old | fast doubling new | |
---|---|---|---|
cycle | 2,0095,7615 | 5,1662,8092 | 2,5550,7791 |
instructions | 3,7231,5532 | 10,7807,0383 | 5,0979,0980 |
cache misses | 3,6236 | 4,4469 | 3,7991 |
branch instructions | 3035,1483 | 1,5656,4076 | 2833,2169 |
branch misses | 5668 | 123,2533 | 18,4534 |
以上的 perf
測試跑第 0 項到第 1000 項的費式數列的總時間。在 cycle 數、 instruction 數、 執行時間方面約是舊版的一半,進步幅度很大。可看出由於迴圈從三層變成兩層, branch instructions
的數量約為原本的 branch misses
約是舊版的
接著將以上程式碼放入 fibdrv
,使用 ktime_t
測量執行時間,並使用 return
回傳執行時間。
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn res;
ktime_t kt = ktime_get();
bn_fib(*offset, &res);
// s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
char str[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
int len = bn_to_string(res, str);
copy_to_user(buf, str + len, 8 * sizeof(uint32_t) * LENGTH / 3 + 2 - len);
s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
return time;
}
我測量了只有bn_fib
的執行時間以及包含 copy_to_usr
的時間,發現這之間有非常大的差距。原本減少 50% 的執行時間因為要傳送龐大的字串造成執行時間的差距減少。由下圖可見時間從約 3萬 ns 增加到約 30多萬 ns ,非常明顯 copy_to_user
為了傳送數十個到數百個的字元而耗費大量時間。
於是將傳送的方式改成只傳送 bn
而非龐大的字串。直接將計算的結果 res
放入 copy_to_user
static ssize_t fib_read(struct file *file,
char *buf,
size_t size,
loff_t *offset)
{
bn res;
ktime_t kt = ktime_get();
bn_fib(*offset, &res);
copy_to_user(buf, &res, sizeof(res));
s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
return time;
}
接著再由 client.c
接收並使用 memcpy
將值放入 bn res
,最後呼叫 bn_to_string
將數字轉成字串。
int main()
{
char buf[BUFFSIZE];
int offset = 1000; /* TODO: try test something bigger than the limit */
int fd = open(FIB_DEV, O_RDWR);
if (fd < 0) {
perror("Failed to open character device");
exit(1);
}
bn res;
for (int i = 0; i <= offset; i++) {
lseek(fd, i, SEEK_SET);
long long sz = read(fd, buf, BUFFSIZE);
memcpy(&res,buf,sizeof(res));
bn_to_string_new(res);
}
close(fd);
return 0;
}
最後再計算一次 fibdrv
之執行時間,如下圖所示。有無 copy_to_usr
對執行時間影響非常小,
然而,如此一來,將 bn
轉為十進位必需要在 userspace
進行,因此 userspace
也需要花時間進行計算,得到以下的圖。
由上圖可以發現,隨著費式數列的增長,將 bn
轉成字串所花的時間會越來越長,總花費時間會比單純在 fibdrv
轉換成字串還慢,因此,須採用別的方式減少 userpace 的轉換時間。