Try   HackMD

2022q1 Homework3 (fibdrv)

contributed by <arthurchang09>

排除干擾效能分析的因素抑制

  • 限定 CPU 給特定的程式使用
    進入 /etc/default/grub

    ​​​​$ sudo vim /etc/default/grub
    

    看到以下這行

    ​​​​GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
    

    預計要將 cpu 0 獨立出來,將上方做更改

    ​​​​GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=0"
    

    接著輸入

    ​​​​$ sudo update-grub
    

    重新開機即可,系統管理員顯示 CPU 0 佔用為

    0%
    若須程式放在 cpu 0 執行,使用 taskset

    ​​​​taskset 0x01 ./client
    
  • address space layout randomization (ASLR)

$ sudo sh -c "echo 0 > /proc/sys/kernel/randomize_va_space"
  • 設定 scaling_governor 為 performance。準備以下 shell script,檔名為 performance.sh:
for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
do
    echo performance > ${i}
done

之後再用 $ sudo sh performance.sh 執行

  • 針對 Intel 處理器,關閉 turbo mode
$ sudo sh -c "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"

使用 Fast Doubling

參考 Fast doubling的作法以及以下虛擬碼

Fast_Fib(n)
    a = 0; b = 1;       // m = 0
    for i = (number of binary digit in n) to 1
        t1 = a*(2*b - a);
        t2 = b^2 + a^2;
        a = t1; b = t2; // m *= 2
        if (current binary digit == 1)
            t1 = a + b; // m++
            a = b; b = t1;
    return a;

為了得到 number of binary digit in n ,採用一個 for 迴圈計算 n 的最高 bit 為 1 的位置,並使用 mask 記下,隨著迴圈進行逐步右移,如此一來可以判斷某 bit 是否為 1。

uint64_t my_fib(unsigned int n){
    unsigned int h = 0;
    for (unsigned int i = n; i; i>>=1 ){
        h++;
    }
    uint64_t a = 0;
    uint64_t b = 1;
    for (unsigned int mask = 1 << (h - 1); mask; mask >>= 1){
        uint64_t t1 = a*(2*b - a);
        uint64_t t2 = a*a + b*b;
        a = t1;
        b = t2;
        if (mask & n) {
            t1 = a + b;
            a = b;
            b = t1;
        }
    }
    return a;
}

考慮到硬體加速 F(n) 的手法 中有提到 clz 這一類硬體加速指令。而在 GCC 中有提供 __builtin_clz ,考慮到 __builtin_clz(0) 為 undefined behavior ,增加一個條件判斷 n 是否為 0 ,由於此情況非常少見,參考 lab0 中 list_sortunlikely 提供編譯器優化。

#define unlikely(x) __builtin_expect(!!(x), 0)

uint64_t my_fib(unsigned int n) 
{
    if (unlikely(!n)) {
        return 0;
    }
    uint64_t a = 0;
    uint64_t b = 1;
    for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
        uint64_t t1 = a * (2 * b - a);
        uint64_t t2 = a * a + b * b;
        a = t1;
        b = t2;
        if (mask & n) {
            t1 = a + b;
            a = b;
            b = t1;
        }
    }
    return a;
}

用以下程式碼測量執行時間

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    //return (ssize_t) fib_sequence(*offset);
    ktime_t kt = ktime_get();
    ssize_t add = fib_sequence(*offset);
    kt = ktime_sub(ktime_get(), kt);

    ktime_t kt2 = ktime_get();
    unsigned long long ans = fib_fast_doubling(*offset);
    kt2 = ktime_sub(ktime_get(), kt2);

    ktime_t kt3 = ktime_get();
    unsigned long long ans1 = fib_fast_doubling_clz(*offset);
    kt3 = ktime_sub(ktime_get(), kt3);

    printk(KERN_INFO "%lld %lld %lld %lld", *offset, ktime_to_ns(kt),
           ktime_to_ns(kt2), ktime_to_ns(kt3));
    return add;
}

比較兩這的執行時間得下圖

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

可看出 fast doubling 的兩個版本均比 iteration 還要快,且計算時間相對平穩。然而,有 clz 的版本似乎較沒有 clz 者慢。根據 KYG-yaya573142,可能是被優化掉,因此查看了組合語言

     ktime_t kt3 = ktime_get();
 1ad:	49 89 c7             	mov    %rax,%r15
    if (unlikely(!n)) {
 1b0:	48 85 c9             	test   %rcx,%rcx
 1b3:	74 15                	je     1ca <fib_read+0x10a>
    for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
 1b5:	0f bd c9             	bsr    %ecx,%ecx
 1b8:	b8 01 00 00 00       	mov    $0x1,%eax
 1bd:	83 f1 1f             	xor    $0x1f,%ecx
 1c0:	d3 e0                	shl    %cl,%eax
 1c2:	85 c0                	test   %eax,%eax
 1c4:	74 04                	je     1ca <fib_read+0x10a>
 1c6:	d1 e8                	shr    %eax
 1c8:	75 fc                	jne    1c6 <fib_read+0x106>

並沒有被優化掉,

尚欠效能比較

計算 F93 (包含) 之後的 Fibonacci 數

採用以下之 struct 紀錄大數,目前先採用

32×8=256 bits

#define LENGTH 8
typedef struct bn {
    uint32_t num[LENGTH];
} bn;

接著加法、減法、左移需要作出相應的調整。

void bn_add(bn a, bn b, bn *sum)
{
    uint32_t N[LENGTH];
    uint64_t c = 0;
    for (int i = 0; i < LENGTH; ++i) {
        c += (uint64_t) a.num[i] + b.num[i];
        N[i] = c;
        c >>=32;
    }
    
    for (int i = LENGTH - 1; i >= 0; --i) {
        sum->num[i] = N[i];
    }
}

加法先從較低位元開始相加,並將結果存入 uint64_t 以保留進位,接著將除了進位以外的總和存起來,並將 c 右移 32 位,即會成為下一組 32 bits 的進位。

以 8 bits 為例,如下面的直式,做完加法後產生 carry ,位在第 8 bit 。

a[i]10010110b[i]11001100c101100010

存入對應的 N 陣列後,將 c 右移 8 bit ,即是下一組較高位 8 bits 要再 +1 的位置。

c00000001a[i+1]10010110b[i+1]01001100c001100011

void bn_dec(bn a, bn b, bn *diff) // a - b
{
    uint32_t N[LENGTH];
    uint32_t br = 0;
    for (int i = 0; i < LENGTH; ++i) {
        if (a.num[i] < (b.num[i] + br)) {
            N[i] = UINT32_MAX - b.num[i] + a.num[i] - br + 1;
            br = 1;
        } else {
            N[i] = a.num[i] - b.num[i] - br;
            br = 0;
        }
    }

    for (int i = LENGTH - 1; i >= 0; --i) {
        diff->num[i] = N[i];
    }
}

減法要考慮 borrow ,當 ab + br 小的時候就需要借位。為了避免 overflow , ab 不能先相減,這裡使用 UINT32_MAX 先減掉 b ,再依序加上 a 減去 br 。由於 UINT_MAX

2321 ,最後還要加上 1 。

void bn_lshift(bn a, int shift, bn *res) 
{
    int shift_32b_amount = shift >> 5;
    memcpy(res, &a, sizeof(bn));
    uint32_t tmp[shift_32b_amount + 1];
    uint32_t tmp2[shift_32b_amount + 1];
    uint32_t i = 0;
    int shift_bit = shift & 0x0000001f;
    memset(tmp, 0, sizeof(int) * shift_32b_amount);
    if (shift_32b_amount){
        for (int i = 0; i < shift_32b_amount; ++i) {
            tmp[i] = res->num[i];
            res->num[i] = 0;
        }

        for (i = shift_32b_amount; i <= LENGTH - shift_32b_amount; i += shift_32b_amount) {
            memcpy(tmp2, &res->num[i], sizeof(uint32_t)* shift_32b_amount);
            memcpy(&res->num[i], tmp, sizeof(uint32_t)* shift_32b_amount);
            memcpy(tmp, tmp2, sizeof(uint32_t)*shift_32b_amount);
        }
        if (shift_32b_amount >= (LENGTH >> 1) + 1) {
            memcpy(&res->num[shift_32b_amount], tmp, 
                   sizeof(uint32_t)*(LENGTH - shift_32b_amount));
        }
    }
    if (shift_bit) {
        tmp[0] = (res->num[0] >> (32 - shift_bit));
        res->num[0] <<= shift_bit;
        for (i = 0; i < LENGTH - 1; ++i){
            tmp2[0] = (res->num[i + 1] >> (32 - shift_bit)) ;
            res->num[i + 1] = res->num[i + 1] << shift_bit | tmp[0];
            tmp[0] = tmp2[0]; 
        }
    }
}

上方的程式碼是左移的實作。先計算總共位移幾組 32 bits 和剩下不到 32 bits 之位移量。接著先處理 32 bits 位移,用 memcpy 複製數值到較高位元。剩下不到 32 bits 位移先取最高位的 bit 再和 左移後較高位的 32 bits 做 |

void bn_mul_1(bn a, bn b, bn *res) { //printf("%d\n", clz(0)); uint64_t tmp[LENGTH]; memset(tmp, 0,sizeof(uint64_t) * LENGTH); for (int i = 0; i < LENGTH; ++i) { for (int j = 0; j < LENGTH; ++j) { if ((i + j) < LENGTH ) { tmp[i + j] += (uint64_t) a.num[i] * b.num[j]; } } } for (int i = 1; i < LENGTH; ++i) { tmp[i] += tmp[i - 1] >> 32; tmp[i - 1] &= 0xffffffff; } for (int i = LENGTH - 1; i >= 0; --i) { res->num[i] = (uint32_t) tmp[i]; } }

上方的程式碼為初始版本之乘法實作,為直式乘法的實作。然而,當計算到第 190 項時會出現錯誤,原因是第 9 行的加法式可能出現 overflow。 參考 KYG-yaya573142 作法將乘法後的結果依序加到較高位的 bit 。

void bn_mul_2(bn a, bn b, bn *res) 
{
    uint64_t tmp[LENGTH];
    uint32_t t[LENGTH];
    uint64_t c1 = 0,c2 = 0;
    memset(tmp, 0,sizeof(uint64_t) * LENGTH);
    memset(t, 0,sizeof(uint32_t) * LENGTH);
    for (int i = 0; i < LENGTH; ++i) {
        for (int j = 0; j < LENGTH; ++j) {
            if ((i + j) < LENGTH ) {
                c1 = (uint64_t) a.num[i] * b.num[j];
                c2 = 0;
                for (int k = i + j; k < LENGTH; ++k) { 
                    c2 += (uint64_t) t[k] + (c1 & 0xffffffff);
                    t[k] = c2 & 0xffffffff;
                    c2 >>= 32;
                    c1 >>= 32;
                    if (!c1 && !c2) {
                        break;
                    }
                }
            }
        }
    }
    for (int i = LENGTH - 1; i >= 0; --i) {
        res->num[i] = t[i];
    }
}

為了將這些大數印出,參考 OscarShiang 的方式將大數轉成十進位印出,

char * bn_to_string(bn a){
    char s[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
    uint32_t n[LENGTH];
    int i;
    memset(s, '0', sizeof(s) - 1);
    memcpy(n, a.num, sizeof(uint32_t) * LENGTH);
    s[sizeof(s) - 1] = '\0';
    //printf("%ld\n", sizeof(a));
    for (i = 0 ; i < 8 * sizeof(uint32_t) * LENGTH; ++i) {
        int j, carry;
        carry = (n[LENGTH - 1] >= 0x80000000);
        for (int j = LENGTH - 1; j >= 0; --j) {
            n[j] = ((n[j] << 1) & 0xffffffff) + ((j - 1) >= 0 ? (n[j - 1] >= 0x80000000) : 0);
        }

        for (int j = sizeof(s) - 2; j >= 0; j--) {
            s[j] += s[j] - '0' + carry;
            carry = (s[j] > '9');
            if (carry)
                s[j] -= 10;
        }
    }
    i = 0;
    while (i < sizeof(s) - 2 && s[i] == '0')
        i++;
    char *dec = (char *)malloc(sizeof(s) - i);
    memcpy(dec, &s[i], sizeof(s) - i);
    return dec;
}

以下為 iteration 的作法

void bn_norm_fib(unsigned int n, bn *res)
{
    if (unlikely(!n)) {
        memset(res, 0, sizeof(bn));
        bn_to_string(*res);
        return;
    }
    bn a, b;
    memset(&a, 0, sizeof(bn));
    memset(&b, 0, sizeof(bn));
    memset(res, 0, sizeof(bn));
    a.num[0] = 0;
    b.num[0] = 1;
    res->num[0] = 1;
    for (int i = 1; i < n; ++i){
        bn_add(a, b, res);
        memcpy(&a, &b, sizeof(bn));
        memcpy(&b, res, sizeof(bn));
    }
}
void bn_fib(unsigned int n, bn *res)
{
    if (unlikely(!n)) {
        memset(res, 0, sizeof(bn));
        bn_to_string(*res);
        return ;
    }
    unsigned int h = 0;
    
    bn a,b;
    memset(&a, 0, sizeof(bn));
    memset(&b, 0, sizeof(bn));
    a.num[0] = 0;
    b.num[0] = 1;

    for (unsigned int mask = 1 << (__builtin_clz(n)); mask; mask >>= 1) {
        bn t1;
        bn t2;
        memset(&t1, 0, sizeof(bn));
        memset(&t2, 0, sizeof(bn));
        
        bn_lshift(b, 1, &t1);
        bn_dec(t1, a, &t1);
        bn_mul_2(a, t1, &t1);

        bn_mul_2(a, a, &t2);
        bn_mul_2(b, b, &b);
        bn_add(t2, b, &t2);

        memcpy(&a, &t1, sizeof(bn));
        memcpy(&b, &t2, sizeof(bn));

        if (mask & n) {
            bn_add(a,b, &t1);
            memcpy(&a, &b, sizeof(bn));
            memcpy(&b, &t1, sizeof(bn));
        }
    }
    memcpy(res, &a, sizeof(bn));
}

接著修改 fibdrv.c ,直接使用 copy_to_user 將字串傳到 client.c

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn res;
    bn_fib(*offset, &res);
    char *str = bn_to_string(res);
    size_t len = strlen(str) + 1;
    copy_to_user(buf, str, len);
    kfree(str);
    return fib_sequence(*offset);
}

修改 client.c 使其能接收到字串

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define FIB_DEV "/dev/fibonacci"

int main()
{
    long long sz;

    char buf[1];
    char write_buf[] = "testing writing";
    int offset = 100; /* TODO: try test something bigger than the limit */

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    for (int i = 0; i <= offset; i++) {
        sz = write(fd, write_buf, strlen(write_buf));
        printf("Writing to " FIB_DEV ", returned the sequence %lld\n", sz);
    }

    for (int i = 0; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, 1);
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%s.\n",
               i, buf);
    }

    for (int i = offset; i >= 0; i--) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, 1);
        printf("Reading from " FIB_DEV
               " at offset %d, returned the sequence "
               "%s.\n",
               i, buf);
    }

    close(fd);
    return 0;
}

運行 client.c 發生以下錯誤

*** stack smashing detected ***: terminated
Aborted

參考 blueskyson 修改

int bn_to_string(bn a, char str[])
{
    char s[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
    unsigned int n[LENGTH];
    int i;
    memset(s, '0', sizeof(s) - 1);
    memcpy(n, a.num, sizeof(uint32_t) * LENGTH);
    s[sizeof(s) - 1] = '\0';
    // printf("%ld\n", sizeof(a));
    for (i = 0; i < 8 * sizeof(uint32_t) * LENGTH; ++i) {
        int carry;
        carry = (n[LENGTH - 1] >= 0x80000000);
        for (int j = LENGTH - 1; j >= 0; --j) {
            n[j] = ((n[j] << 1) & 0xffffffff) +
                   ((j - 1) >= 0 ? (n[j - 1] >= 0x80000000) : 0);
        }

        for (int j = sizeof(s) - 2; j >= 0; j--) {
            s[j] += s[j] - '0' + carry;
            carry = (s[j] > '9');
            if (carry)
                s[j] -= 10;
        }
    }
    i = 0;
    while (i < sizeof(s) - 2 && s[i] == '0')
        i++;
    //char *dec = (char *) kmalloc(sizeof(s) - i, GFP_KERNEL);
    //memcpy(dec, &s[i], sizeof(s) - i);
    memcpy(str, s, sizeof(s));

    return i;
}
static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn res;
    bn_fib(*offset, &res);
    char str[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
    int len = bn_to_string(res, str);
    copy_to_user(buf, str + len, 8 * sizeof(uint32_t) * LENGTH / 3 + 2 - len);
    return fib_sequence(*offset);
}
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define FIB_DEV "/dev/fibonacci"
#define LENGTH 22
#define BUFFSIZE 8 * sizeof(int) * LENGTH / 3 + 2

int main()
{
    long long sz;

    char buf[BUFFSIZE];
    char write_buf[] = "testing writing";
    int offset = 1000; /* TODO: try test something bigger than the limit */

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    for (int i = 0; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        sz = read(fd, buf, BUFFSIZE);
        printf("  offset %d, returned the sequence "
               "%s.\n",
               i, buf);
    }


    close(fd);
    return 0;
}

效能分析

首先在 userspace 測試

Perf

sudo perf stat -r 15 -e cycles,instructions,cache-references,cache-misses,branch,branch-instructions,branch-misses ./a.out 

採用 perf 查看執行費波納契數列第 0 項到第 500 項之 cache miss 和 branch miss。

  • Fast doubling
 Performance counter stats for './a.out' (15 runs):

         7995,7081      cycles                                                        ( +-  0.30% )  (53.67%)
       1,8737,9429      instructions              #    2.34  insn per cycle           ( +-  0.06% )  (69.11%)
            7,6359      cache-references                                              ( +-  1.39% )  (69.11%)
            1,8337      cache-misses              #   24.015 % of all cache refs      ( +-  6.72% )  (69.11%)
         2689,1258      branch                                                        ( +-  0.05% )  (71.15%)
         2727,4701      branch-instructions                                           ( +-  0.04% )  (77.22%)
            4,4548      branch-misses             #    0.16% of all branches          ( +- 10.93% )  (59.74%)

         0.0261203 +- 0.0000789 seconds time elapsed  ( +-  0.30% )

  • Iteration
 Performance counter stats for './a.out' (15 runs):

         2951,5320      cycles                                                        ( +-  0.47% )  (24.03%)
         5480,8003      instructions              #    1.86  insn per cycle           ( +-  0.42% )  (65.18%)
            4,8197      cache-references                                              ( +-  2.72% )  (98.36%)
            1,1488      cache-misses              #   23.836 % of all cache refs      ( +- 12.41% )
          489,4013      branch                                                        ( +-  0.01% )
          491,1414      branch-instructions                                           ( +-  0.09% )  (75.97%)
            3,3682      branch-misses             #    0.69% of all branches          ( +- 37.92% )  (1.64%)

          0.009944 +- 0.000117 seconds time elapsed  ( +-  1.17% )

Fast Doubling 在 Instruction per cycle(IPC) 和 Branch miss 上表現均優於 Iteration ,然而總執行的指令多非常多,因此執行時間也較多。下方的圖也顯示這一點。

數列各項執行時間比較

下圖是執行 0 到 1000 項之結果

參考 Risheng1128 將 CPU 0 獨立出來執行

接著按照 Linux 效能分析的提示 抑制 address space layout randomization (ASLR),設定 scaling_governor 為 performance ,關閉 turbo mode。得到以下圖表

非常明顯 bn-fast-doublingbn_iteration 慢。

cachegrind

  • fast doubling
$ valgrind --tool=cachegrind --branch-sim=yes ./a.out
==23370== Cachegrind, a cache and branch-prediction profiler
==23370== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==23370== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==23370== Command: ./a.out
==23370== 
--23370-- warning: L3 cache found, using its data for the LL simulation.
==23370== 
==23370== I   refs:      187,585,847
==23370== I1  misses:          1,075
==23370== LLi misses:          1,053
==23370== I1  miss rate:        0.00%
==23370== LLi miss rate:        0.00%
==23370== 
==23370== D   refs:      102,874,241  (86,658,760 rd   + 16,215,481 wr)
==23370== D1  misses:          3,201  (     2,542 rd   +        659 wr)
==23370== LLd misses:          2,663  (     2,069 rd   +        594 wr)
==23370== D1  miss rate:         0.0% (       0.0%     +        0.0%  )
==23370== LLd miss rate:         0.0% (       0.0%     +        0.0%  )
==23370== 
==23370== LL refs:             4,276  (     3,617 rd   +        659 wr)
==23370== LL misses:           3,716  (     3,122 rd   +        594 wr)
==23370== LL miss rate:          0.0% (       0.0%     +        0.0%  )
==23370== 
==23370== Branches:       22,906,501  (22,779,656 cond +    126,845 ind)
==23370== Mispredicts:     1,147,376  ( 1,147,235 cond +        141 ind)
==23370== Mispred rate:          5.0% (       5.0%     +        0.1%   )

--------------------------------------------------------------------------------
I1 cache:         32768 B, 64 B, 8-way associative
D1 cache:         32768 B, 64 B, 8-way associative
LL cache:         3145728 B, 64 B, 12-way associative
Command:          ./a.out
Data file:        cachegrind.out.26188
Events recorded:  Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Events shown:     Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Thresholds:       0.1 100 100 100 100 100 100 100 100 100 100 100 100
Include dirs:     
User annotated:   
Auto-annotation:  on

--------------------------------------------------------------------------------
Ir          I1mr ILmr Dr         D1mr  DLmr  Dw         D1mw DLmw Bc         Bcm       Bi      Bim 
--------------------------------------------------------------------------------
186,025,668  971  951 86,059,765 1,336 1,146 16,109,504  566  545 22,649,428 1,144,320 126,819 128  PROGRAM TOTALS

--------------------------------------------------------------------------------
Ir          I1mr ILmr Dr         D1mr DLmr Dw         D1mw DLmw Bc         Bcm       Bi      Bim  file:function
--------------------------------------------------------------------------------
163,840,484    8    8 75,946,074    0    0 12,966,362    0    0 21,096,210 1,070,152       0   0  /home/arthurchang09/quiz2/test.c:bn_mul_2
  6,913,606   13   13  2,850,456    0    0    725,116    0    0    237,538    12,525       0   0  /home/arthurchang09/quiz2/test.c:bn_lshift
  5,839,347    4    4  2,937,970    0    0    525,084    0    0    487,578    26,360       0   0  /home/arthurchang09/quiz2/test.c:bn_dec
  5,462,604    2    2  2,930,076    0    0    441,720    0    0    397,548    29,486       0   0  /home/arthurchang09/quiz2/test.c:bn_add
  1,763,169   18   18  1,082,999    0    0  1,136,227    2    1     26,506     3,128       0   0  /home/arthurchang09/quiz2/test.c:bn_fib
  1,662,276    5    5    113,519    0    0    277,046    3    2    352,058         9       0   0  /build/glibc-eX1tMB/glibc-2.31/string/../sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:__memset_avx2_unaligned_erms
    253,223   17   16    126,584    1    0         11    1    1         10         5 126,562  11  ???:???

  • iteration
==25942== Cachegrind, a cache and branch-prediction profiler
==25942== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25942== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25942== Command: ./a.out
==25942== 
--25942-- warning: L3 cache found, using its data for the LL simulation.
==25942== 
==25942== I   refs:      52,946,905
==25942== I1  misses:           927
==25942== LLi misses:           912
==25942== I1  miss rate:       0.00%
==25942== LLi miss rate:       0.00%
==25942== 
==25942== D   refs:      34,742,105  (28,234,653 rd   + 6,507,452 wr)
==25942== D1  misses:         1,902  (     1,336 rd   +       566 wr)
==25942== LLd misses:         1,691  (     1,146 rd   +       545 wr)
==25942== D1  miss rate:        0.0% (       0.0%     +       0.0%  )
==25942== LLd miss rate:        0.0% (       0.0%     +       0.0%  )
==25942== 
==25942== LL refs:            2,829  (     2,263 rd   +       566 wr)
==25942== LL misses:          2,603  (     2,058 rd   +       545 wr)
==25942== LL miss rate:         0.0% (       0.0%     +       0.0%  )
==25942== 
==25942== Branches:       3,899,539  ( 3,772,990 cond +   126,549 ind)
==25942== Mispredicts:      252,834  (   252,706 cond +       128 ind)
==25942== Mispred rate:         6.5% (       6.7%     +       0.1%   )
--------------------------------------------------------------------------------
I1 cache:         32768 B, 64 B, 8-way associative
D1 cache:         32768 B, 64 B, 8-way associative
LL cache:         3145728 B, 64 B, 12-way associative
Command:          ./a.out
Data file:        bn_norm_cachegrind.out.25942
Events recorded:  Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Events shown:     Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw Bc Bcm Bi Bim
Thresholds:       0.1 100 100 100 100 100 100 100 100 100 100 100 100
Include dirs:     
User annotated:   
Auto-annotation:  on

--------------------------------------------------------------------------------
Ir         I1mr ILmr Dr         D1mr  DLmr  Dw        D1mw DLmw Bc        Bcm     Bi      Bim 
--------------------------------------------------------------------------------
52,946,905  927  912 28,234,653 1,336 1,146 6,507,452  566  545 3,772,990 252,706 126,549 128  PROGRAM TOTALS

--------------------------------------------------------------------------------
Ir         I1mr ILmr Dr         D1mr DLmr Dw        D1mw DLmw Bc        Bcm     Bi      Bim  file:function
--------------------------------------------------------------------------------
46,282,250    3    3 24,825,250    0    0 3,742,500    1    0 3,368,250 249,538       0   0  /home/arthurchang09/quiz2/test.c:bn_add
 4,762,025    6    6  2,874,257    0    0 2,500,505    3    3   126,252     510       0   0  /home/arthurchang09/quiz2/test.c:bn_norm_fib
 1,497,000    2    2    374,250    0    0   249,500    0    0   249,500       0       0   0  /build/glibc-eX1tMB/glibc-2.31/string/../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:__memcpy_avx_unaligned_erms
   252,683   17   16    126,314    1    0        11    1    1        10       5 126,292  11  ???:???

將以上重要資訊整理如下

Bn Fast Doubling Bn Iteration
perf instructions 1,8737,9429 5480,8003
perf cache-references 7,6359 4,8197
perf cache miss 1,8337 1,1488
perf branch 2689,1258 489,4013
perf branch miss 4,4548 3,3682
perf branch miss rate 0.16% 0.69%
cachegrind LL cache miss 3,716 2,603
cachegrind Branch 22,906,501 3,899,539
cachegrind Branch miss 1,147,376 252,834
cachegrind Branch miss rate 5.0% 6.5%

可以發現 Bn Fast doubling 執行的指令數約為 iteration 版的 3 倍多。雖然有較低 cache miss rate 和 Branch miss rate ,但仍無助於減少執行時間。

先從 cachegrind 的數據觀察,可以發現 bn_mul2Ir Dr Dw 比其他高出非常多,即此部分程式執行最多次,大部分的 Branch miss 發生在此,因此這個部份極有可能是拖慢執行速度的原因。

--------------------------------------------------------------------------------
Ir          I1mr ILmr Dr         D1mr DLmr Dw         D1mw DLmw Bc         Bcm       Bi      Bim  file:function
--------------------------------------------------------------------------------
163,840,484    8    8 75,946,074    0    0 12,966,362    0    0 21,096,210 1,070,152       0   0  /home/arthurchang09/quiz2/test.c:bn_mul_2

接著看函式執行的狀況,三層的巢狀迴圈是造成大量 branch miss ,也是最多 Ir 之處,大部分 cycle 都是執行此處指令。

. . . . . . . . . . . . . void bn_mul_2(bn a, bn b, bn *res) 300,048 2 2 37,506 0 0 112,518 0 0 0 0 0 0 { . . . . . . . . . . . . . uint64_t tmp[LENGTH]; . . . . . . . . . . . . . uint32_t t[LENGTH]; 75,012 0 0 0 0 0 75,012 0 0 0 0 0 0 uint64_t c1 = 0,c2 = 0; 187,530 0 0 0 0 0 37,506 0 0 0 0 0 0 memset(tmp, 0,sizeof(uint64_t) * LENGTH); 187,530 1 1 0 0 0 37,506 0 0 0 0 0 0 memset(t, 0,sizeof(uint32_t) * LENGTH); 1,500,240 1 1 937,650 0 0 37,506 0 0 487,578 37,516 0 0 for (int i = 0; i < LENGTH; ++i) { 18,002,880 0 0 11,251,800 0 0 450,072 0 0 5,850,936 525,100 0 0 for (int j = 0; j < LENGTH; ++j) { 27,004,320 1 1 10,801,728 0 0 0 0 0 5,400,864 450,217 0 0 if ((i + j) < LENGTH ) { 29,254,680 0 0 11,701,872 0 0 2,925,468 0 0 0 0 0 0 c1 = (uint64_t) a.num[i] * b.num[j]; 2,925,468 0 0 0 0 0 2,925,468 0 0 0 0 0 0 c2 = 0; 20,558,652 2 2 8,829,988 0 0 2,925,468 0 0 2,952,260 16 0 0 for (int k = i + j; k < LENGTH; ++k) { 23,618,080 0 0 11,809,040 0 0 0 0 0 0 0 0 0 c2 += (uint64_t) t[k] + (c1 & 0xffffffff); 14,761,300 0 0 5,904,520 0 0 2,952,260 0 0 0 0 0 0 t[k] = c2 & 0xffffffff; 2,952,260 0 0 2,952,260 0 0 0 0 0 0 0 0 0 c2 >>= 32; 2,952,260 0 0 2,952,260 0 0 0 0 0 0 0 0 0 c1 >>= 32; 11,758,976 0 0 5,879,488 0 0 0 0 0 5,879,488 19,779 0 0 if (!c1 && !c2) { 2,925,468 0 0 0 0 0 0 0 0 0 0 0 0 break; . . . . . . . . . . . . . } . . . . . . . . . . . . . } . . . . . . . . . . . . . } . . . . . . . . . . . . . } . . . . . . . . . . . . . } 1,500,240 1 1 937,650 0 0 37,506 0 0 487,578 37,522 0 0 for (int i = LENGTH - 1; i >= 0; --i) { 3,150,504 0 0 1,800,288 0 0 450,072 0 0 0 0 0 0 res->num[i] = t[i]; . . . . . . . . . . . . . } 225,036 0 0 150,024 0 0 0 0 0 37,506 2 0 0 }

接著使用 perf record 觀察,

sudo perf record -g --call-graph dwarf ./a.out
sudo perf report --stdio -g graph,0.5,caller

以下節錄其中的片段

# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 659  of event 'cycles'
# Event count (approx.): 502472023
#
# Children      Self  Command  Shared Object      Symbol                            
# ........  ........  .......  .................  ..................................
#
    99.41%     0.00%  a.out    a.out              [.] _start
            |
            ---_start
               __libc_start_main
               main
               bn_fib
               |          
               |--89.69%--bn_mul_2
               |          |          
               |           --1.53%--__memset_avx2_unaligned_erms
               |          
               |--3.06%--bn_lshift
               |          
               |--2.51%--bn_dec
               |          
               |--2.17%--bn_add
               |          
                --0.76%--__memset_avx2_unaligned_erms

可發現 bn_mul_2 佔據 89% ,剩下的 bn_lshiftbn_decbn_add 占比就比較少,因此必須設法提高乘法的效率。

因此修改 bn_mul_2 ,修改長乘法的方式,

abc×defafbfcfaebeceadbdcd

之前的作法是依照長乘法的習慣先由右而左,由上而下做乘法,即先算

cf
bf
af
,再算
ce
be
ae
並計算
bf+ce
,以此類推。為處理計算
bf+ce
產生的 overflow ,原始的版本每做完一次加法,需如 bn_add 一樣將可能的 carry 向高位數加,因此需要三層巢狀迴圈。因此我將計算順序稍作修改,先由上而下,由右而左,也就是計算
bf+ce
,得出 carry 後再計算
af+be+cd+carry
以此類推。因此只需要兩層 for 迴圈。

void bn_mul_3(bn a, bn b, bn *res) {
    uint64_t c1 = 0,c2 = 0,carry;
    memset(res, 0, sizeof(bn));
    for (int i = 0; i < LENGTH; ++i) {
        c1 = carry;
        carry = 0;
        for (int j = 0; j <= i; ++j) {
            c2 = (uint64_t) a.num[j] * b.num[i - j];
            c1 += c2;
            carry += c1 < c2;
        }
        res->num[i] = c1 & 0xffffffff;
        carry = (carry << 32) + (c1 >> 32);   
    }
}

接著在 userspace 用timespec 測量時間,如下:

很明顯減少了約 50% 的執行時間

使用 perf 測量

sudo perf stat -r 15 -e cycles,instructions,cache-references,cache-misses, branch,branch-instructions,branch-misses taskset 0x01 ./a.out 
  • iteration
 Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):

       2,0095,7615      cycles                                                        ( +-  0.07% )  (51.96%)
       3,7231,5532      instructions              #    1.85  insn per cycle           ( +-  0.03% )  (70.37%)
           15,6095      cache-references                                              ( +-  1.02% )  (75.08%)
            3,6236      cache-misses              #   23.214 % of all cache refs      ( +- 11.92% )  (75.43%)
         3037,2291      branch                                                        ( +-  0.04% )  (75.43%)
         3035,1483      branch-instructions                                           ( +-  0.02% )  (72.61%)
              5668      branch-misses             #    0.02% of all branches          ( +- 23.54% )  (49.49%)

         0.0654597 +- 0.0000478 seconds time elapsed  ( +-  0.07% )

  • fast doubling old
 Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):

       5,1662,8092      cycles                                                        ( +-  0.23% )  (56.97%)
      10,7807,0383      instructions              #    2.09  insn per cycle           ( +-  0.02% )  (71.31%)
           17,5168      cache-references                                              ( +-  1.06% )  (71.31%)
            4,4469      cache-misses              #   25.386 % of all cache refs      ( +-  9.10% )  (71.31%)
       1,5608,3228      branch                                                        ( +-  0.03% )  (71.41%)
       1,5656,4076      branch-instructions                                           ( +-  0.03% )  (71.72%)
          123,2533      branch-misses             #    0.79% of all branches          ( +-  2.46% )  (57.28%)

          0.167680 +- 0.000369 seconds time elapsed  ( +-  0.22% )

  • fast doubling new
Performance counter stats for 'taskset 0x01 ./a.out' (15 runs):

       2,5550,7791      cycles                                                        ( +-  0.24% )  (56.53%)
       5,0979,0980      instructions              #    2.00  insn per cycle           ( +-  0.06% )  (71.02%)
           17,1360      cache-references                                              ( +-  1.23% )  (71.02%)
            3,7991      cache-misses              #   22.170 % of all cache refs      ( +-  7.09% )  (71.02%)
         2850,6292      branch                                                        ( +-  0.11% )  (71.36%)
         2833,2169      branch-instructions                                           ( +-  0.06% )  (72.45%)
           18,4534      branch-misses             #    0.65% of all branches          ( +-  3.07% )  (57.62%)

          0.083148 +- 0.000213 seconds time elapsed  ( +-  0.26% )

簡單整理如下

itseration fast doubling old fast doubling new
cycle 2,0095,7615 5,1662,8092 2,5550,7791
instructions 3,7231,5532 10,7807,0383 5,0979,0980
cache misses 3,6236 4,4469 3,7991
branch instructions 3035,1483 1,5656,4076 2833,2169
branch misses 5668 123,2533 18,4534

以上的 perf 測試跑第 0 項到第 1000 項的費式數列的總時間。在 cycle 數、 instruction 數、 執行時間方面約是舊版的一半,進步幅度很大。可看出由於迴圈從三層變成兩層, branch instructions 的數量約為原本的

15branch misses 約是舊版的
16
減少幅度相當可觀。

copy_to_user 之成本

接著將以上程式碼放入 fibdrv ,使用 ktime_t 測量執行時間,並使用 return 回傳執行時間。

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn res;
    ktime_t kt = ktime_get();
    bn_fib(*offset, &res);
    // s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
    char str[8 * sizeof(uint32_t) * LENGTH / 3 + 2];
    int len = bn_to_string(res, str);
    copy_to_user(buf, str + len, 8 * sizeof(uint32_t) * LENGTH / 3 + 2 - len);
    s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
    return time;
}

我測量了只有bn_fib 的執行時間以及包含 copy_to_usr 的時間,發現這之間有非常大的差距。原本減少 50% 的執行時間因為要傳送龐大的字串造成執行時間的差距減少。由下圖可見時間從約 3萬 ns 增加到約 30多萬 ns ,非常明顯 copy_to_user 為了傳送數十個到數百個的字元而耗費大量時間。

於是將傳送的方式改成只傳送 bn 而非龐大的字串。直接將計算的結果 res 放入 copy_to_user

static ssize_t fib_read(struct file *file,
                        char *buf,
                        size_t size,
                        loff_t *offset)
{
    bn res;
    ktime_t kt = ktime_get();
    bn_fib(*offset, &res);
    copy_to_user(buf, &res, sizeof(res));
    s64 time = ktime_to_ns(ktime_sub(ktime_get(), kt));
    return time;
}

接著再由 client.c 接收並使用 memcpy 將值放入 bn res ,最後呼叫 bn_to_string 將數字轉成字串。

int main()
{
    char buf[BUFFSIZE];
    int offset = 1000; /* TODO: try test something bigger than the limit */

    int fd = open(FIB_DEV, O_RDWR);
    if (fd < 0) {
        perror("Failed to open character device");
        exit(1);
    }

    bn res;
    for (int i = 0; i <= offset; i++) {
        lseek(fd, i, SEEK_SET);
        long long sz = read(fd, buf, BUFFSIZE);
        memcpy(&res,buf,sizeof(res));
        bn_to_string_new(res);
    }

    close(fd);
    return 0;
}

最後再計算一次 fibdrv 之執行時間,如下圖所示。有無 copy_to_usr 對執行時間影響非常小,

然而,如此一來,將 bn 轉為十進位必需要在 userspace 進行,因此 userspace 也需要花時間進行計算,得到以下的圖。

由上圖可以發現,隨著費式數列的增長,將 bn 轉成字串所花的時間會越來越長,總花費時間會比單純在 fibdrv 轉換成字串還慢,因此,須採用別的方式減少 userpace 的轉換時間。