# 2019-06-04 林聖堯 ## Q1: 針對 x86_64 架構,改善以下程式碼,將 byte-by-byte 操作改為 word-by-word,並確認 word-aligned: ```cpp #include <stdint.h> void *my_memcpy(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; while (count--) *dst8++ = *src8++; return dest; } ``` Reference: * [macro ALIGN_SIZE](https://www.codeproject.com/Articles/567335/Essential-Macros-for-C-Programming) ==> ```cpp int x = src % 8; int y = dst % 8; int count1 = count / 8; int count2 = count - count1 * 8 - x; if(x != y) { while (count--) *dst8++ = *src8++; } else if (x > 0) { while (x--) *dst8++ = *src8++; while (count1--) *(long long *)dst8++ = *(long long *)src8++; while(count2--) *dst8++ = *src8++; } else { while(count1--) *(long long *)dst8++ = *(long long *)src8++; while(count2--) *dst8++ = *src8++; } return dest; ``` --- deman page memcpy # 檢討與修正上面程式碼 ### 缺陷 - 地址無法做除或取餘數的動作 - 記憶體長度為 `64 bit`,不能用 `int` 存取地址 - x 與 y 的值錯誤 - x 與 y 分別代表 src8 和 dst8 差幾個 byte 才達到 word_aligned,例如:src8 -> 0xa1 且一個 word 單位為 8 byte ,則 x = 7 代表 src8 差 8 個 byte 達到 word_aligned ### 修正 - 先將 `src8` 的地址用一個 `unsigned long long` 變數記起來 ```cpp unsigned long long int_src8 = src ``` - 將取餘數的動作改用 `bitwise operation` 操作 `src8 % 8` --> `src8 & 0x7` - 更正 `x` 、`y` ```cpp int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1 ``` - `case2` 、 `case3` 可以合併 ### 程式碼 程式針對三個不同的情況去做 memory copy 1. `dest` 和 `src` 沒有辦法 aligned - 例如: `dest` 的位址為 `0xa1` , `src` 的位址為 `0xa2` 2. `dest` 和 `src` 部份可以 aligned - 例如: `dest` 的不只為 `0xa1` , `src` 的地址為 `0xb1`,且 size > 15 - 例如: `dest` 的位址為 `0xa0` , `src` 的位址為 `0xb0` , 且 size % 8 = 0 其中 `x` 、 `y` 變數用來檢查 `dest` 和 `src` 可不可以做 aligned `counter1` 代表在做 `word_assign` 前要先做幾個 `byte_assign` `counter2` 代表做幾個 `word_assign` `counter3` 代表做完 `word_assign` 後做了多少個 `byte_assign` ```cpp void *my_memcpy1(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; int counter1 = 0; // calculate byte_assign1 int counter2 = 0; // calculate word_assign int counter3 = 0; // calculate byte_assign2 unsigned long long int_src8 = src8; unsigned long long int_dst8 = dst8; int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1 int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1 int word_count = (count > x) ? (count - x) / 8 : 0; int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0; //case1: src and dest not are aligned (ex: src->0x01 dest->0xa2) if (x != y){ while (count--){ *dst8++ = *src8++; counter1++; } } //case2: src and dest are aligned else { while (x--){ *dst8++ = *src8++; counter1++; } while (word_count--){ *(long long *) dst8 = *(long long *) src8; dst8 += 8; src8 += 8; counter2++; } while(byte_count2--){ *dst8++ = *src8++; counter3++; } } printf("counter1 = %d\ncounter2 = %d\ncounter3 = %d\n", counter1, counter2, counter3); return dest; } ``` ### 簡單測試 ```cpp int main() { char a[26]; char b[26]; for(int i = 0; i < 26; i++){ b[i] = 97 + i; } test1 ( size from 1 to 26) int error = 0; for(int i = 0; i < 26; i++){ my_memcpy1(&a[i], &b[i], 26-i); for(int j = i; j < 26; j++){ if(a[j] != b[j]) error ++; } printf("size=%d error: %d\n",26-i ,error); error = 0; } //test2 memory can't be alinged my_memcpy1(&a[0], &b[1], 25); int error = 0; for(int i = 0; i < 25; i++){ if(a[i] != b[i+1]) error++; } printf("error: %d\n", error); return 0; } ``` ``` size = 26 counter1 = 0 counter2 = 3 counter3 = 2 size=26 error: 0 size = 25 counter1 = 1 counter2 = 2 counter3 = 2 size=25 error: 0 size = 24 counter1 = 2 counter2 = 2 counter3 = 2 size=24 error: 0 size = 23 counter1 = 3 counter2 = 2 counter3 = 2 size=23 error: 0 size = 22 counter1 = 4 counter2 = 2 counter3 = 2 size=22 error: 0 size = 21 counter1 = 5 counter2 = 2 counter3 = 2 size=21 error: 0 size = 20 counter1 = 6 counter2 = 2 counter3 = 2 size=20 error: 0 size = 19 counter1 = 7 counter2 = 2 counter3 = 2 size=19 error: 0 size = 18 counter1 = 0 counter2 = 2 counter3 = 2 size=18 error: 0 size = 17 counter1 = 1 counter2 = 1 counter3 = 2 ... ... ... size = 3 counter1 = 7 counter2 = 0 counter3 = 2 size=3 error: 0 size = 2 counter1 = 0 counter2 = 0 counter3 = 2 size=2 error: 0 size = 1 counter1 = 1 counter2 = 0 counter3 = 0 size=1 error: 0 counter1 = 25 counter2 = 0 counter3 = 0 error: 0 ``` ### GPROF 測試效能 給定同樣大小陣列,以及同樣 copy 次數,比較 `my_memcpy` 、 `my_memcpy1` 使用 cpu 時間,前者是以 `byte` 為單位做 copy ,後者考慮到用 `word` 為單位做 copy 測試用 main() ```cpp int main() { char a[test_size]; char b[test_size]; memset(b, 'a', test_size); for(int i = 0; i < 10000; i++){ my_memcpy(&a[0], &b[0], test_size); my_memcpy1(&a[0], &b[0], test_size); } return 0; } ``` ``` $ gcc -Og -pg -g align_test.c -o align_test $ ./align_test file.txt $ gprof align_test Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls us/call us/call name 88.21 8.05 8.05 10000 805.36 805.36 my_memcpy 12.21 9.17 1.11 10000 111.46 111.46 my_memcpy1 ``` 從上面可以看出 `my_memcpy` 和 `my_memcpy1` 所花的時間接近 1:8 假設 CPU 做一次 `read` 真的是 `read` 一個 `word` 也就是 `64 bit` ,由上面實驗我們可以得知在實作 `memcpy` 函數的時候用 `copy_by_word` 的效率會比 `copy_by_byte` 好上 8 倍左右。因為 `cpy_by_word` 所需要的 `while` 迴圈數比 `copy_by_byte` 少了 8 被左右。 ### 驗證 read a word 上面的實驗我們假設 CPU 到 memory 一次會 read 64 bit,這裡我們就來驗證 CPU 真的一次會 read 64 bit 到 CPU 的 cache 內 以下實作 4 個不同版本的 memcpy ,分別用來複製 4 種不同大小資料型態的 array ```cpp #include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #define data_size 100000 #define loop_size 100000 // 128 bit struct struct bit_128{ unsigned long long a; unsigned long long a1; }; typedef struct bit_128 bit_128 ; void *my_memcpy(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; while(count--) *dst8++ = *src8++; return dest; } void *my_memcpy1(void *dest, const void *src, size_t count) { int *dst32 = (int *) dest, *src32 = (int *) src; while(count--) *dst32++ = *src32++; return dest; } void *my_memcpy2(void *dest, const void *src, size_t count) { unsigned long long *dst64 = (unsigned long long*) dest; unsigned long long *src64 = (unsigned long long*) src; while(count--) *dst64++ = *src64++; return dest; } void *my_memcpy3(void *dest, const void *src, size_t count) { bit_128 *dst128 = (bit_128*) dest; bit_128 *src128 = (bit_128*) src; while(count--) *dst128++ = *src128++; return dest; } ``` 測試用 main ```cpp int main() { char src8[data_size]; char dest8[data_size]; int src32[data_size]; int dest32[data_size]; unsigned long long src64[data_size]; unsigned long long dest64[data_size]; bit_128 src128[data_size]; bit_128 dest128[data_size]; memset(src8, 1, sizeof(char) * data_size); memset(src32, 1, sizeof(int) * data_size); memset(src64, 1, sizeof(unsigned long long) * data_size); memset(src128, 1, sizeof(bit_128) * data_size); for(int i = 0; i < loop_size; i++){ my_memcpy(dest8, src8, data_size); my_memcpy1(dest32, src32, data_size); my_memcpy2(dest64, src64, data_size); my_memcpy3(dest128, src128, data_size); } return 0; } ``` 測試結果 ``` $ gcc -Og -pg -g align2.c -o align2 $ ./align2 file.txt $ gprof align2 Flat profile: Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls us/call us/call name 38.36 16.54 16.54 100000 165.39 165.39 my_memcpy3 21.77 25.92 9.39 100000 93.86 93.86 my_memcpy1 20.32 34.68 8.76 100000 87.59 87.59 my_memcpy2 19.93 43.28 8.59 100000 85.93 85.93 my_memcpy ``` 由上面的測試結果可以看出一次 copy 的大小如果小於等於一個 `64 bit` ,copy 的時間都大約是 9 秒,若一次要 copy `128 bit` ,CPU 就需要到 memory 讀取兩次的資料,而讀取 memory 的時間相對於其他指令而言多的非常多,所以 CPU 所花的時間會接近原本的兩倍 ### 小結: 到這裡我們知道 CPU 做一次 read 會到 memory 讀取 64 bit 的資料到 cache 當中,所以在實作 `memcpy` 的時候我們可以使用 `copy_by_word` 的方式來實作,由此可以省略 while 迴圈的圈數。 但這裡都還沒有探討到 `memory_alignment` 的問題,其實 CPU 到 memory 讀取資料的時候會從固定的 `address` 開始讀取,以一個 word 為 64 bit 來說,`address` 會是 0 或 8 結尾,若我們在做讀取的時候資料分別放在兩個不同的 `word` ,此時就必須做兩次 `read` 的動作才能拿到我們需要的資料 ### memory_alignment 設計兩個 `memcpy` 函式,其中一個有做 `memory_alignment` ,比較兩者執行結果,其中 `my_memcpy` 是有做 `alignment` 的動作,`my_memcpy1` 有做 ```cpp #include <stdlib.h> #include <stdio.h> #include <string.h> #define data_size 100000000 #define loop_size 100 void my_memcpy(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; while(count >= 8){ *(long long *) dst8 = *(long long *) src8; dst8 += 8; src8 += 8; count -= 8; } while(count --){ *dst8++ = *src8++; } } void *my_memcpy1(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; int counter1 = 0; // calculate byte_assign1 int counter2 = 0; // calculate word_assign int counter3 = 0; // calculate byte_assign2 unsigned long long int_src8 = src8; unsigned long long int_dst8 = dst8; int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1 int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1 int word_count = (count > x) ? (count - x) / 8 : 0; int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0; while (x--){ *dst8++ = *src8++; counter1++; } while (word_count-- > 0){ *(long long *) dst8 = *(long long *) src8; dst8 += 8; src8 += 8; } while(byte_count2--){ *dst8++ = *src8++; } return dest; } ``` 測試程式: 將 ```cpp int main() { char *src; char *dest; char *dest1; src = malloc(sizeof(char) * data_size); dest = malloc(sizeof(char) * data_size); dest1 = malloc(sizeof(char) * data_size); memset(src, 1, sizeof(char) * data_size); for(int i = 0; i < loop_size; i++) my_memcpy(&dest[1], &src[1], data_size); for(int i = 0; i < loop_size; i++) my_memcpy1(&dest1[1], &src[1], data_size); return 0; } ``` &dest[1] , &src[1] 兩者記憶體位址尾數都為 1 >這裡用動態分配記憶體的方式做測試是因為 `heap` 預設可以容納的資料量比 `stack` 大滿多的,如果直接宣告 array 的大小會將變數存在 `stack` 內,當 array 太大的時候就會造成 `segmentation fault` > ``` $ gcc -Og -pg -g align3.c -o align3 $ ./align3 file.txt $ gprof align3 Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls ms/call ms/call name 50.72 1.52 1.52 100 15.17 15.17 my_memcpy1 49.72 3.00 1.49 100 14.87 14.87 my_memcpy ``` 從測試結果看起來兩函式所花的時間幾乎是沒有差別的,甚至用了 `memory_alignment` 的函式花的時間還多出一點點。 :::warning 在 x86_64 上,mis-alignment 的效能代價很低,你需要用 perf 搭配 raw performance counter 來確認 :notes: jserv ::: :::info 了解! 之後補上~ ::: 猜想是因為兩個函數對 memory 做 read 的次數其實是一樣的,都一樣是 (data_size / 8) + 1 次,而 `memcpy` 大部分的時間都會花在 `read` 上,所以兩函數花的時間非常接近 ### CPE 效能分析 將第一部份的 `my_memcpy` 和 `my_memcpy1` 所花費的時間作圖,圖中橫軸為 `size` ,縱軸為所消耗的時間,計算時間的部份所使用的是 lib `time.h` 中 `clock()` 函式 **`my_memcpy` 與 `my_memcpy1`:** ```cpp void *my_memcpy(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; while(count--) *dst8++ = *src8++; return dest; } void *my_memcpy1(void *dest, const void *src, size_t count) { char *dst8 = (char *) dest, *src8 = (char *) src; unsigned long long int_src8 = src8; unsigned long long int_dst8 = dst8; int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1 int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1 int word_count = (count > x) ? (count - x) / 8 : 0; int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0; while (x--){ *dst8++ = *src8++; } while (word_count--){ *(long long *) dst8 = *(long long *) src8; dst8 += 8; src8 += 8; } while(byte_count2--){ *dst8++ = *src8++; } return dest; } ``` **`main` 如下:** ```cpp int main() { FILE *f; FILE *f1; char *a; char *b; char *c; clock_t start, end; double diff = 0; char output_time[100]; //for fwrite char output_num[100];//for fwrite a = malloc(sizeof(char) * data_size_end); b = malloc(sizeof(char) * data_size_end); c = malloc(sizeof(char) * data_size_end); memset(b, 'a', data_size_end); f = fopen("my_memcpy.txt", "w"); f1 = fopen("my_memcpy1.txt", "w"); //test my_memcpy1 for(int j = data_size_start; j < data_size_end; j += 20000){ memset(output_time, 0, 100); memset(output_num, 0, 100); start = clock(); for(int i = 0; i < loop_size; i++) my_memcpy1(&a[0], &b[0], j); end = clock(); diff = (double) (end - start) / CLOCKS_PER_SEC; sprintf(output_time, "%f", diff); sprintf(output_num, "%d", j); strcat(output_num, " "); strcat(output_num, output_time); strcat(output_num, "\n"); fwrite(output_num, sizeof(char), strlen(output_num), f1); } //test my_memcpy for(int j = data_size_start; j < data_size_end; j += 20000){ memset(output_time, 0, 100); memset(output_num, 0, 100); start = clock(); for(int i = 0; i < loop_size; i++){ my_memcpy(&c[0], &b[0], j); } end = clock(); diff = (double) (end - start) / CLOCKS_PER_SEC; sprintf(output_time, "%f", diff); sprintf(output_num, "%d", j); strcat(output_num, " "); strcat(output_num, output_time); } return 0; } ``` 根據 CS:APP 第五章的方法,用 `least squares fit` (y = a*x+b) 來作圖 **腳本如下:** ``` set xlabel "copy size" set ylabel "time(sec)" set output 'align.png' set terminal png font " Times_New_Roman,8 " f(x)=a*x+b g(x)=c*x+d fit f(x) "my_memcpy.txt" via a,b fit g(x) "my_memcpy1.txt" via c,d plot f(x) with lines, "my_memcpy.txt" using 1:2 w points title "copy by byte",\ g(x) with lines, "my_memcpy1.txt" using 1:2 w points title "copy by word" ``` ``` $ gnuplot align.gp Final set of parameters Asymptotic Standard Error ======================= ========================== a = 7.45605e-08 +/- 3.736e-10 (0.5011%) b = 0.00265487 +/- 0.001509 (56.85%) Final set of parameters Asymptotic Standard Error ======================= ========================== c = 1.56844e-08 +/- 2.109e-10 (1.345%) d = -0.00371954 +/- 0.0008518 (22.9%) ``` 上表中的 `a` 和 `c` 分別表 `f(x)` 和 `g(x)` 的斜率,也就是 `CPI` ![](https://i.imgur.com/ge8qBvz.png) 當 `x` 的值越大時也就是需要 copy 的字串長度越長時,兩者的效率大概相差 5 倍,也就是斜率相差 5 倍 這裡再使用 `GPROF` 觀察,也差不多是相差 5 倍 ``` Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls ms/call ms/call name 83.96 73.89 73.89 28000 2.64 2.64 my_memcpy 16.28 88.22 14.33 28000 0.51 0.51 my_memcpy1 0.01 88.23 0.01 main ``` 結果和最前面做出來的 7 倍左右有一點點落差,可能因為採樣次數的不同造成,但落差並沒有太大 :::info 待查: `read` 需要以 word 為單位,`write` 不知道須不需要,如果不需要的話程式碼中的 `case 1` 就不需要存在 ::: :::danger TODO: 比照 CS:APP 第 5 章的方法,做 CPE 效能分析 ::: ###### tags: `Linux 核心設計 2019`