owned this note
owned this note
Published
Linked with GitHub
# 2016q3 Homework1(compute-pi)
contributed by <`janetwei`>
### Reviewed by <`ierosodin`>
* 列出不同function之間, 計算π結果與實際值的誤差圖(理論應相同, 但實際並非)
* 嘗試使用不同計算π的數學方式, 例如Leibniz formula或蒙地卡羅
* 可以比較不同thread number對效能的影響
* avx或avx_unroll的實作, code在N的計數上是有殘缺
## Gnuplot
因為這次所做的圖是.csv檔,跟之前不太一樣,因此和phonebook中runtime.c的寫法有一些的差別
我建了一個runtime.gp裡面放make plot 的指令
在Makefile裡面增加以下程式碼
```
# Run the gnuplot script runtime.gp; the rule depends on result_clock_gettime.csv.
plot: result_clock_gettime.csv
gnuplot runtime.gp
```
Makefile其中的gencsv
seq的用法為 `seq 起始值 間隔 終止值`(間隔為每次遞增的量),例如 `seq 100 500 250000` 會從100開始、每次加500,直到不超過250000為止
```
# For every value i produced by `seq 100 500 250000`, print "i," and then run
# ./benchmark_clock_gettime with i as its argument; the combined output of the
# whole loop is redirected into result_clock_gettime.csv.
gencsv: default
for i in `seq 100 500 250000`; do \
printf "%d," $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
```
因為.csv檔,所以要加上這個指令,不然會出錯
`set datafile separator ","`
```
# Plot the timings stored in result_clock_gettime.csv into runtime.png.
# Column 1 is used as the x value (N); columns 2-6 are drawn as one line each.
set xlabel 'N'
set ylabel 'Time(sec)'
set style fill solid
set title 'Compute_pi Time by clock_gettime() '
set term png enhanced font 'Verdana,10'
set output 'runtime.png'
# The input is comma-separated, so the field separator must be set explicitly.
set datafile separator ","
plot "result_clock_gettime.csv" using 1:2 title 'baseline' with lines lt rgb 'red' , \
"result_clock_gettime.csv" using 1:3 title 'openmp_2' with lines lt rgb 'blue' , \
"result_clock_gettime.csv" using 1:4 title 'openmp_4' with lines lt rgb 'green' ,\
"result_clock_gettime.csv" using 1:5 title 'AVX' with lines lt rgb 'orange' ,\
"result_clock_gettime.csv" using 1:6 title 'AVX + loop unrolling' with lines lt rgb 'brown'
```
![](https://i.imgur.com/P2DSyRY.png)
[參考同學的共筆](https://hackmd.io/EwRghgzBAsCcsFpoBMDGykAZUA4ECMRN8DMRpoRkBWWuIA)
## Avx
```C
/*
 * Approximate pi with AVX by integrating 4 / (1 + x^2) over [0, 1) as a left
 * Riemann sum of N slices of width dt = 1 / N.
 *
 * N: number of slices. Any value is accepted: complete groups of four slices
 *    are evaluated in one 256-bit vector, the remaining N % 4 slices are added
 *    by a scalar loop, and N == 0 yields 0.0.
 *
 * Returns the approximation of pi.
 *
 * The target attribute lets the function build even when the translation unit
 * is not compiled with -mavx; the CPU must still support AVX at run time.
 */
__attribute__((target("avx")))
double compute_pi_avx(size_t N)
{
    if (N == 0)
        return 0.0; /* no slices; also avoids dt = 1.0 / 0 */

    const double dt = 1.0 / N;
    const __m256d ones = _mm256_set1_pd(1.0);
    const __m256d widths = _mm256_set1_pd(dt);
    /* per-lane x offsets; _mm256_set_pd takes the highest lane first */
    const __m256d offsets = _mm256_set_pd(dt * 3, dt * 2, dt * 1, 0.0);
    __m256d acc = _mm256_setzero_pd(); /* four partial sums of dt/(1+x^2) */

    /*
     * `N - i >= 4` rather than `i <= N - 4`: N is unsigned, so N - 4 wraps
     * to a huge value whenever N < 4 and the loop would never stop.
     */
    size_t i = 0;
    for (; N - i >= 4; i += 4) {
        /* x = i*dt + {3*dt, 2*dt, dt, 0} */
        __m256d x = _mm256_add_pd(_mm256_set1_pd(i * dt), offsets);
        /* 1 + x^2 */
        __m256d denom = _mm256_add_pd(ones, _mm256_mul_pd(x, x));
        /* acc += dt / (1 + x^2) */
        acc = _mm256_add_pd(acc, _mm256_div_pd(widths, denom));
    }

    /* _mm256_store_pd requires a 32-byte aligned destination */
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, acc);
    double pi = tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail: the N % 4 slices that do not fill a whole vector */
    for (; i < N; i++) {
        const double x = i * dt;
        pi += dt / (1.0 + x * x);
    }

    return pi * 4.0;
}
```
Synopsis
- __m256d _mm256_set1_pd (double a)
#include "immintrin.h"
CPUID Flags: AVX
Description
- Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
Operation
- FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
`__attribute__((aligned(32)))`:編譯器將以32字節(注意是byte不是bit)對齊的方式分配一個變量
32byte=256bit
[attribute參考資料](http://huenlil.pixnet.net/blog/post/26078382-%5B%E8%BD%89%5Dgnu-c-__attribute__-%E6%A9%9F%E5%88%B6%E7%B0%A1%E4%BB%8B)
load:從 memory 到 register
store:從 register 到 memory
move:從 register 到 register
## 信賴區間
將time取95%的信賴區間
Lower Endpoint = average − 2 * σ
Upper Endpoint = average + 2 * σ
```
/* Timestamps taken immediately before and after each timed call. */
struct timespec start = {0, 0};
struct timespec end = {0, 0};
/* The problem size N is read from the first command-line argument. */
if (argc < 2) return -1;
int N = atoi(argv[1]);
/* loop is the number of samples collected to estimate the mean and the SD. */
int i, loop = 1000;
double avg[loop];
double time;
// Baseline
/* Phase 1: time compute_pi_baseline(N) `loop` times, one elapsed time per slot of avg[]. */
for(i = 0; i < loop; i++) {
clock_gettime(CLOCK_ID, &start);
compute_pi_baseline(N);
clock_gettime(CLOCK_ID, &end);
/* NOTE(review): ONE_SEC is defined elsewhere; presumably the number of
 * nanoseconds per second as a floating-point value -- confirm. */
avg[i]=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC;
}
/* Stores average + 2*SD in outcome[0] and average - 2*SD in outcome[1];
 * it also overwrites avg[] with the squared deviations. */
calculate_SD(avg,loop);
/* Phase 2: repeat the measurement until one sample lies inside
 * [outcome[1], outcome[0]], i.e. it is not treated as an outlier. */
do
{
clock_gettime(CLOCK_ID, &start);
compute_pi_baseline(N);
clock_gettime(CLOCK_ID, &end);
time=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC;
}while(time>outcome[0] || time<outcome[1]);
/* Emit the accepted sample followed by a comma (one CSV field). */
printf("%lf,", (double) time);
```
計算標準差的function
```
static double outcome[2];
void calculate_SD(double *a,int loop)
{
double average,pow_sum,SD,sum;
for(int j=0;j<loop;j++)
{
sum+=a[j];
}
average=sum/loop;
for(int i=0;i<loop;i++)
{
a[i]=pow(a[i]-average,2);
pow_sum+=a[i];
}
SD=sqrt(pow_sum/loop);
outcome[0]=average + 2.0 * SD;
outcome[1]=average - 2.0 * SD;
}
}
```
因為有運算到平方(pow)以及開根號(sqrt),因此要加入
`#include <math.h>`
但是之後出現了Error
```
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DBASELINE -o time_test_baseline
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVX -o time_test_avx
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
/tmp/ccESiGZA.o: In function `calculate_SD':
benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow'
benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt'
collect2: error: ld returned 1 exit status
Makefile:9: recipe for target 'default' failed
make: *** [default] Error 1
```
查了一下資料發現因為用到了`#include <math.h>`裡的`pow`和`sqrt`,而這些函式的實作在數學函式庫libm中,所以在Makefile裡要加上`-lm`
```
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DBASELINE -o time_test_baseline
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVX -o time_test_avx
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
/tmp/ccWy0NQf.o: In function `calculate_SD':
benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow'
benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt'
collect2: error: ld returned 1 exit status
Makefile:9: recipe for target 'default' failed
make: *** [default] Error 1
```
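但是還是有Error,之後發現`-lm`要加在整段最後面。原因是linker會依照命令列由左到右的順序處理檔案與函式庫:處理到`-lm`時,只會取出「目前為止尚未解析」的符號;若`-lm`寫在`benchmark_clock_gettime.c`之前,當時還沒有任何對`pow`、`sqrt`的引用,libm就不會被連結進來,因此函式庫必須放在使用它的原始檔/目的檔之後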
```
# Build the five time_test_* variants (selected with -D...) and the benchmark
# driver, all linked against computepi.o.
# Only the benchmark_clock_gettime command links libm: that is the file whose
# calculate_SD() references pow() and sqrt().
# -lm is written last because the linker handles files and libraries in
# command-line order, so a library has to follow the files that use it.
default: computepi.o
$(CC) $(CFLAGS) computepi.o time_test.c -DBASELINE -o time_test_baseline
$(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
$(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
$(CC) $(CFLAGS) computepi.o time_test.c -DAVX -o time_test_avx
$(CC) $(CFLAGS) computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
$(CC) $(CFLAGS) computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime -lm
```
loop=10
![](https://i.imgur.com/rWcpIxB.png)
loop=100
![](https://i.imgur.com/1r0C4Jl.png)
loop=1000
![](https://i.imgur.com/1WyU89Y.png)
和loop=10以及loop=100相比震盪幅度有明顯減低,但是openmp4特別奇怪,在某個時候突然暴增,這個原因還需要再查,目前我無法解釋