owned this note
owned this note
Published
Linked with GitHub
# 2019-06-04 林聖堯
## Q1: 針對 x86_64 架構,改善以下程式碼,將 byte-by-byte 操作改為 word-by-word,並確認 word-aligned:
```cpp
#include <stdint.h>
void *my_memcpy(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
while (count--)
*dst8++ = *src8++;
return dest;
}
```
Reference:
* [macro ALIGN_SIZE](https://www.codeproject.com/Articles/567335/Essential-Macros-for-C-Programming)
==>
```cpp
int x = src % 8;
int y = dst % 8;
int count1 = count / 8;
int count2 = count - count1 * 8 - x;
if(x != y)
{
while (count--)
*dst8++ = *src8++;
}
else if (x > 0)
{
while (x--)
*dst8++ = *src8++;
while (count1--)
*(long long *)dst8++ = *(long long *)src8++;
while(count2--)
*dst8++ = *src8++;
}
else
{
while(count1--)
*(long long *)dst8++ = *(long long *)src8++;
while(count2--)
*dst8++ = *src8++;
}
return dest;
```
---
deman page memcpy
# 檢討與修正上面程式碼
### 缺陷
- 地址無法做除或取餘數的動作
- 記憶體長度為 `64 bit`,不能用 `int` 存取地址
- x 與 y 的值錯誤
- x 與 y 分別代表 src8 和 dst8 差幾個 byte 才達到 word_aligned,例如:src8 -> 0xa1 且一個 word 單位為 8 byte ,則 x = 7 代表 src8 差 8 個 byte 達到 word_aligned
### 修正
- 先將 `src8` 的地址用一個 `unsigned long long` 變數記起來
```cpp
unsigned long long int_src8 = src
```
- 將取餘數的動作改用 `bitwise operation` 操作
`src8 % 8` --> `src8 & 0x7`
- 更正 `x` 、`y`
```cpp
int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1
```
- `case2` 、 `case3` 可以合併
### 程式碼
程式針對三個不同的情況去做 memory copy
1. `dest` 和 `src` 沒有辦法 aligned
- 例如: `dest` 的位址為 `0xa1` , `src` 的位址為 `0xa2`
2. `dest` 和 `src` 部份可以 aligned
- 例如: `dest` 的不只為 `0xa1` , `src` 的地址為 `0xb1`,且 size > 15
- 例如: `dest` 的位址為 `0xa0` , `src` 的位址為 `0xb0` , 且 size % 8 = 0
其中 `x` 、 `y` 變數用來檢查 `dest` 和 `src` 可不可以做 aligned
`counter1` 代表在做 `word_assign` 前要先做幾個 `byte_assign`
`counter2` 代表做幾個 `word_assign`
`counter3` 代表做完 `word_assign` 後做了多少個 `byte_assign`
```cpp
void *my_memcpy1(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
int counter1 = 0; // calculate byte_assign1
int counter2 = 0; // calculate word_assign
int counter3 = 0; // calculate byte_assign2
unsigned long long int_src8 = src8;
unsigned long long int_dst8 = dst8;
int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1
int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1
int word_count = (count > x) ? (count - x) / 8 : 0;
int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0;
//case1: src and dest not are aligned (ex: src->0x01 dest->0xa2)
if (x != y){
while (count--){
*dst8++ = *src8++;
counter1++;
}
}
//case2: src and dest are aligned
else {
while (x--){
*dst8++ = *src8++;
counter1++;
}
while (word_count--){
*(long long *) dst8 = *(long long *) src8;
dst8 += 8;
src8 += 8;
counter2++;
}
while(byte_count2--){
*dst8++ = *src8++;
counter3++;
}
}
printf("counter1 = %d\ncounter2 = %d\ncounter3 = %d\n", counter1, counter2, counter3);
return dest;
}
```
### 簡單測試
```cpp
int main()
{
char a[26];
char b[26];
for(int i = 0; i < 26; i++){
b[i] = 97 + i;
}
test1 ( size from 1 to 26)
int error = 0;
for(int i = 0; i < 26; i++){
my_memcpy1(&a[i], &b[i], 26-i);
for(int j = i; j < 26; j++){
if(a[j] != b[j])
error ++;
}
printf("size=%d error: %d\n",26-i ,error);
error = 0;
}
//test2 memory can't be alinged
my_memcpy1(&a[0], &b[1], 25);
int error = 0;
for(int i = 0; i < 25; i++){
if(a[i] != b[i+1])
error++;
}
printf("error: %d\n", error);
return 0;
}
```
```
size = 26
counter1 = 0
counter2 = 3
counter3 = 2
size=26 error: 0
size = 25
counter1 = 1
counter2 = 2
counter3 = 2
size=25 error: 0
size = 24
counter1 = 2
counter2 = 2
counter3 = 2
size=24 error: 0
size = 23
counter1 = 3
counter2 = 2
counter3 = 2
size=23 error: 0
size = 22
counter1 = 4
counter2 = 2
counter3 = 2
size=22 error: 0
size = 21
counter1 = 5
counter2 = 2
counter3 = 2
size=21 error: 0
size = 20
counter1 = 6
counter2 = 2
counter3 = 2
size=20 error: 0
size = 19
counter1 = 7
counter2 = 2
counter3 = 2
size=19 error: 0
size = 18
counter1 = 0
counter2 = 2
counter3 = 2
size=18 error: 0
size = 17
counter1 = 1
counter2 = 1
counter3 = 2
...
...
...
size = 3
counter1 = 7
counter2 = 0
counter3 = 2
size=3 error: 0
size = 2
counter1 = 0
counter2 = 0
counter3 = 2
size=2 error: 0
size = 1
counter1 = 1
counter2 = 0
counter3 = 0
size=1 error: 0
counter1 = 25
counter2 = 0
counter3 = 0
error: 0
```
### GPROF 測試效能
給定同樣大小陣列,以及同樣 copy 次數,比較 `my_memcpy` 、 `my_memcpy1` 使用 cpu 時間,前者是以 `byte` 為單位做 copy ,後者考慮到用 `word` 為單位做 copy
測試用 main()
```cpp
int main()
{
char a[test_size];
char b[test_size];
memset(b, 'a', test_size);
for(int i = 0; i < 10000; i++){
my_memcpy(&a[0], &b[0], test_size);
my_memcpy1(&a[0], &b[0], test_size);
}
return 0;
}
```
```
$ gcc -Og -pg -g align_test.c -o align_test
$ ./align_test file.txt
$ gprof align_test
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls us/call us/call name
88.21 8.05 8.05 10000 805.36 805.36 my_memcpy
12.21 9.17 1.11 10000 111.46 111.46 my_memcpy1
```
從上面可以看出 `my_memcpy` 和 `my_memcpy1` 所花的時間接近 1:8
假設 CPU 做一次 `read` 真的是 `read` 一個 `word` 也就是 `64 bit` ,由上面實驗我們可以得知在實作 `memcpy` 函數的時候用 `copy_by_word` 的效率會比 `copy_by_byte` 好上 8 倍左右。因為 `cpy_by_word` 所需要的 `while` 迴圈數比 `copy_by_byte` 少了 8 被左右。
### 驗證 read a word
上面的實驗我們假設 CPU 到 memory 一次會 read 64 bit,這裡我們就來驗證 CPU 真的一次會 read 64 bit 到 CPU 的 cache 內
以下實作 4 個不同版本的 memcpy ,分別用來複製 4 種不同大小資料型態的 array
```cpp
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define data_size 100000
#define loop_size 100000
// 128 bit struct
struct bit_128{
unsigned long long a;
unsigned long long a1;
};
typedef struct bit_128 bit_128 ;
void *my_memcpy(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
while(count--)
*dst8++ = *src8++;
return dest;
}
void *my_memcpy1(void *dest, const void *src, size_t count)
{
int *dst32 = (int *) dest, *src32 = (int *) src;
while(count--)
*dst32++ = *src32++;
return dest;
}
void *my_memcpy2(void *dest, const void *src, size_t count)
{
unsigned long long *dst64 = (unsigned long long*) dest;
unsigned long long *src64 = (unsigned long long*) src;
while(count--)
*dst64++ = *src64++;
return dest;
}
void *my_memcpy3(void *dest, const void *src, size_t count)
{
bit_128 *dst128 = (bit_128*) dest;
bit_128 *src128 = (bit_128*) src;
while(count--)
*dst128++ = *src128++;
return dest;
}
```
測試用 main
```cpp
int main()
{
char src8[data_size];
char dest8[data_size];
int src32[data_size];
int dest32[data_size];
unsigned long long src64[data_size];
unsigned long long dest64[data_size];
bit_128 src128[data_size];
bit_128 dest128[data_size];
memset(src8, 1, sizeof(char) * data_size);
memset(src32, 1, sizeof(int) * data_size);
memset(src64, 1, sizeof(unsigned long long) * data_size);
memset(src128, 1, sizeof(bit_128) * data_size);
for(int i = 0; i < loop_size; i++){
my_memcpy(dest8, src8, data_size);
my_memcpy1(dest32, src32, data_size);
my_memcpy2(dest64, src64, data_size);
my_memcpy3(dest128, src128, data_size);
}
return 0;
}
```
測試結果
```
$ gcc -Og -pg -g align2.c -o align2
$ ./align2 file.txt
$ gprof align2
Flat profile:
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls us/call us/call name
38.36 16.54 16.54 100000 165.39 165.39 my_memcpy3
21.77 25.92 9.39 100000 93.86 93.86 my_memcpy1
20.32 34.68 8.76 100000 87.59 87.59 my_memcpy2
19.93 43.28 8.59 100000 85.93 85.93 my_memcpy
```
由上面的測試結果可以看出一次 copy 的大小如果小於等於一個 `64 bit` ,copy 的時間都大約是 9 秒,若一次要 copy `128 bit` ,CPU 就需要到 memory 讀取兩次的資料,而讀取 memory 的時間相對於其他指令而言多的非常多,所以 CPU 所花的時間會接近原本的兩倍
### 小結:
到這裡我們知道 CPU 做一次 read 會到 memory 讀取 64 bit 的資料到 cache 當中,所以在實作 `memcpy` 的時候我們可以使用 `copy_by_word` 的方式來實作,由此可以省略 while 迴圈的圈數。
但這裡都還沒有探討到 `memory_alignment` 的問題,其實 CPU 到 memory 讀取資料的時候會從固定的 `address` 開始讀取,以一個 word 為 64 bit 來說,`address` 會是 0 或 8 結尾,若我們在做讀取的時候資料分別放在兩個不同的 `word` ,此時就必須做兩次 `read` 的動作才能拿到我們需要的資料
### memory_alignment
設計兩個 `memcpy` 函式,其中一個有做 `memory_alignment` ,比較兩者執行結果,其中 `my_memcpy` 是有做 `alignment` 的動作,`my_memcpy1` 有做
```cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define data_size 100000000
#define loop_size 100
void my_memcpy(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
while(count >= 8){
*(long long *) dst8 = *(long long *) src8;
dst8 += 8;
src8 += 8;
count -= 8;
}
while(count --){
*dst8++ = *src8++;
}
}
void *my_memcpy1(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
int counter1 = 0; // calculate byte_assign1
int counter2 = 0; // calculate word_assign
int counter3 = 0; // calculate byte_assign2
unsigned long long int_src8 = src8;
unsigned long long int_dst8 = dst8;
int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1
int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1
int word_count = (count > x) ? (count - x) / 8 : 0;
int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0;
while (x--){
*dst8++ = *src8++;
counter1++;
}
while (word_count-- > 0){
*(long long *) dst8 = *(long long *) src8;
dst8 += 8;
src8 += 8;
}
while(byte_count2--){
*dst8++ = *src8++;
}
return dest;
}
```
測試程式:
將
```cpp
int main()
{
char *src;
char *dest;
char *dest1;
src = malloc(sizeof(char) * data_size);
dest = malloc(sizeof(char) * data_size);
dest1 = malloc(sizeof(char) * data_size);
memset(src, 1, sizeof(char) * data_size);
for(int i = 0; i < loop_size; i++)
my_memcpy(&dest[1], &src[1], data_size);
for(int i = 0; i < loop_size; i++)
my_memcpy1(&dest1[1], &src[1], data_size);
return 0;
}
```
&dest[1] , &src[1] 兩者記憶體位址尾數都為 1
>這裡用動態分配記憶體的方式做測試是因為 `heap` 預設可以容納的資料量比 `stack` 大滿多的,如果直接宣告 array 的大小會將變數存在 `stack` 內,當 array 太大的時候就會造成 `segmentation fault`
>
```
$ gcc -Og -pg -g align3.c -o align3
$ ./align3 file.txt
$ gprof align3
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls ms/call ms/call name
50.72 1.52 1.52 100 15.17 15.17 my_memcpy1
49.72 3.00 1.49 100 14.87 14.87 my_memcpy
```
從測試結果看起來兩函式所花的時間幾乎是沒有差別的,甚至用了 `memory_alignment` 的函式花的時間還多出一點點。
:::warning
在 x86_64 上,mis-alignment 的效能代價很低,你需要用 perf 搭配 raw performance counter 來確認
:notes: jserv
:::
:::info
了解!
之後補上~
:::
猜想是因為兩個函數對 memory 做 read 的次數其實是一樣的,都一樣是 (data_size / 8) + 1 次,而 `memcpy` 大部分的時間都會花在 `read` 上,所以兩函數花的時間非常接近
### CPE 效能分析
將第一部份的 `my_memcpy` 和 `my_memcpy1` 所花費的時間作圖,圖中橫軸為 `size` ,縱軸為所消耗的時間,計算時間的部份所使用的是 lib `time.h` 中 `clock()` 函式
**`my_memcpy` 與 `my_memcpy1`:**
```cpp
void *my_memcpy(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
while(count--)
*dst8++ = *src8++;
return dest;
}
void *my_memcpy1(void *dest, const void *src, size_t count)
{
char *dst8 = (char *) dest, *src8 = (char *) src;
unsigned long long int_src8 = src8;
unsigned long long int_dst8 = dst8;
int x = (8 - int_src8 & 0x7) & 0x7; //number of byte_assign1
int y = (8 - int_dst8 & 0x7) & 0x7; //number of byte_assign1
int word_count = (count > x) ? (count - x) / 8 : 0;
int byte_count2 = (count > x) ? (count - word_count * 8 - x) : 0;
while (x--){
*dst8++ = *src8++;
}
while (word_count--){
*(long long *) dst8 = *(long long *) src8;
dst8 += 8;
src8 += 8;
}
while(byte_count2--){
*dst8++ = *src8++;
}
return dest;
}
```
**`main` 如下:**
```cpp
int main()
{
FILE *f;
FILE *f1;
char *a;
char *b;
char *c;
clock_t start, end;
double diff = 0;
char output_time[100]; //for fwrite
char output_num[100];//for fwrite
a = malloc(sizeof(char) * data_size_end);
b = malloc(sizeof(char) * data_size_end);
c = malloc(sizeof(char) * data_size_end);
memset(b, 'a', data_size_end);
f = fopen("my_memcpy.txt", "w");
f1 = fopen("my_memcpy1.txt", "w");
//test my_memcpy1
for(int j = data_size_start; j < data_size_end; j += 20000){
memset(output_time, 0, 100);
memset(output_num, 0, 100);
start = clock();
for(int i = 0; i < loop_size; i++)
my_memcpy1(&a[0], &b[0], j);
end = clock();
diff = (double) (end - start) / CLOCKS_PER_SEC;
sprintf(output_time, "%f", diff);
sprintf(output_num, "%d", j);
strcat(output_num, " ");
strcat(output_num, output_time);
strcat(output_num, "\n");
fwrite(output_num, sizeof(char), strlen(output_num), f1);
}
//test my_memcpy
for(int j = data_size_start; j < data_size_end; j += 20000){
memset(output_time, 0, 100);
memset(output_num, 0, 100);
start = clock();
for(int i = 0; i < loop_size; i++){
my_memcpy(&c[0], &b[0], j);
}
end = clock();
diff = (double) (end - start) / CLOCKS_PER_SEC;
sprintf(output_time, "%f", diff);
sprintf(output_num, "%d", j);
strcat(output_num, " ");
strcat(output_num, output_time);
}
return 0;
}
```
根據 CS:APP 第五章的方法,用 `least squares fit` (y = a*x+b) 來作圖
**腳本如下:**
```
set xlabel "copy size"
set ylabel "time(sec)"
set output 'align.png'
set terminal png font " Times_New_Roman,8 "
f(x)=a*x+b
g(x)=c*x+d
fit f(x) "my_memcpy.txt" via a,b
fit g(x) "my_memcpy1.txt" via c,d
plot f(x) with lines, "my_memcpy.txt" using 1:2 w points title "copy by byte",\
g(x) with lines, "my_memcpy1.txt" using 1:2 w points title "copy by word"
```
```
$ gnuplot align.gp
Final set of parameters Asymptotic Standard Error
======================= ==========================
a = 7.45605e-08 +/- 3.736e-10 (0.5011%)
b = 0.00265487 +/- 0.001509 (56.85%)
Final set of parameters Asymptotic Standard Error
======================= ==========================
c = 1.56844e-08 +/- 2.109e-10 (1.345%)
d = -0.00371954 +/- 0.0008518 (22.9%)
```
上表中的 `a` 和 `c` 分別表 `f(x)` 和 `g(x)` 的斜率,也就是 `CPI`

當 `x` 的值越大時也就是需要 copy 的字串長度越長時,兩者的效率大概相差 5 倍,也就是斜率相差 5 倍
這裡再使用 `GPROF` 觀察,也差不多是相差 5 倍
```
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls ms/call ms/call name
83.96 73.89 73.89 28000 2.64 2.64 my_memcpy
16.28 88.22 14.33 28000 0.51 0.51 my_memcpy1
0.01 88.23 0.01 main
```
結果和最前面做出來的 7 倍左右有一點點落差,可能因為採樣次數的不同造成,但落差並沒有太大
:::info
待查: `read` 需要以 word 為單位,`write` 不知道須不需要,如果不需要的話程式碼中的 `case 1` 就不需要存在
:::
:::danger
TODO: 比照 CS:APP 第 5 章的方法,做 CPE 效能分析
:::
###### tags: `Linux 核心設計 2019`