contributed by <`janetwei`>
ierosodin
>因為這次所做的圖是.csv檔,跟之前不太一樣,因此和phonebook中runtime.c的寫法有一些的差別
我建了一個runtime.gp裡面放make plot 的指令
在Makefile裡面增加以下程式碼
# Render the timing chart: needs result_clock_gettime.csv (written by the
# gencsv recipe) and runs the gnuplot script runtime.gp on it.
plot: result_clock_gettime.csv
gnuplot runtime.gp
Makefile其中的gencsv
seq 的用法為:seq 起始值 間隔 終止值
# Build everything first, then sweep N from 100 upward in steps of 500 (not
# exceeding 250000). For each N, write "N," followed by whatever
# ./benchmark_clock_gettime prints for that N; the combined output of the
# whole loop is redirected into result_clock_gettime.csv.
# NOTE(review): each CSV row ends only if the benchmark prints a trailing
# newline itself -- confirm in benchmark_clock_gettime.c.
gencsv: default
for i in `seq 100 500 250000`; do \
printf "%d," $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
因為.csv檔,所以要加上這個指令,不然會出錯
set datafile separator ","
# Axis labels, fill style and title of the compute_pi timing chart.
set xlabel 'N'
set ylabel 'Time(sec)'
set style fill solid
set title 'Compute_pi Time by clock_gettime() '
# Render into a PNG file rather than an interactive window.
set term png enhanced font 'Verdana,10'
set output 'runtime.png'
# The input is comma-separated; gnuplot splits on whitespace by default.
set datafile separator ","
# Column 1 is N; columns 2-6 are plotted as one timing series per implementation.
plot "result_clock_gettime.csv" using 1:2 title 'baseline' with lines lt rgb 'red' , \
"result_clock_gettime.csv" using 1:3 title 'openmp_2' with lines lt rgb 'blue' , \
"result_clock_gettime.csv" using 1:4 title 'openmp_4' with lines lt rgb 'green' ,\
"result_clock_gettime.csv" using 1:5 title 'AVX' with lines lt rgb 'orange' ,\
"result_clock_gettime.csv" using 1:6 title 'AVX + loop unrolling' with lines lt rgb 'brown'
[參考同學的共筆](https://hackmd.io/EwRghgzBAsCcsFpoBMDGykAZUA4ECMRN8DMRpoRkBWWuIA)
/**
 * Approximate pi as 4 * integral of 1 / (1 + x^2) over [0, 1), using a left
 * Riemann sum with N slices of width 1/N. Four slices are evaluated per
 * iteration with AVX; the remaining N % 4 slices are summed in scalar code.
 *
 * @param N number of slices; N == 0 yields 0.0.
 * @return the approximation of pi.
 */
double compute_pi_avx(size_t N)
{
    if (N == 0)
        return 0.0; /* empty sum; also avoids computing 1.0 / 0 */

    double pi = 0.0;
    double dt = 1.0 / N;
    __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(dt);
    ymm2 = _mm256_set_pd(dt * 3, dt * 2, dt * 1, 0.0);
    ymm4 = _mm256_setzero_pd(); // four running partial sums of pi

    size_t i = 0;
    /*
     * N is unsigned, so the former condition `i <= N - 4` wrapped around for
     * N < 4. `N - i >= 4` cannot wrap because i never exceeds N.
     */
    for (; N - i >= 4; i += 4) {
        ymm3 = _mm256_set1_pd(i * dt);    // i*dt, i*dt, i*dt, i*dt
        ymm3 = _mm256_add_pd(ymm3, ymm2); // x = i*dt+3*dt, i*dt+2*dt, i*dt+dt, i*dt+0.0
        ymm3 = _mm256_mul_pd(ymm3, ymm3); // x^2
        ymm3 = _mm256_add_pd(ymm0, ymm3); // 1 + x^2
        ymm3 = _mm256_div_pd(ymm1, ymm3); // dt / (1 + x^2)
        ymm4 = _mm256_add_pd(ymm4, ymm3); // pi += dt / (1 + x^2)
    }

    // _mm256_store_pd requires a 32-byte aligned destination.
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* Scalar tail: the last N % 4 slices that do not fill a whole vector. */
    for (; i < N; i++) {
        double x = i * dt;
        pi += dt / (1.0 + x * x);
    }

    return pi * 4.0;
}
Synopsis
Description
Operation
`__attribute__((aligned(32)))`:編譯器將以32字節(注意是byte不是bit)對齊的方式分配一個變量
32byte=256bit
`__attribute__` 參考資料
load:從 memory 到 register
store:從 register 到 memory
move:從 register 到 register
將time取95%的信賴區間
Lower Endpoint = average − 2 * σ
Upper Endpoint = average + 2 * σ
// Timestamps taken immediately before and after each measured call.
struct timespec start = {0, 0};
struct timespec end = {0, 0};
// The problem size N is read from the first command-line argument.
if (argc < 2) return -1;
int N = atoi(argv[1]);
// loop: number of timing samples collected before the bounds are computed.
int i, loop = 1000;
// Despite its name, avg holds one elapsed-time sample per run, not an average.
double avg[loop];
double time;
// Baseline: time compute_pi_baseline(N) `loop` times and record every sample.
for(i = 0; i < loop; i++) {
clock_gettime(CLOCK_ID, &start);
compute_pi_baseline(N);
clock_gettime(CLOCK_ID, &end);
// Elapsed time: whole seconds plus the nanosecond difference scaled by ONE_SEC.
// NOTE(review): ONE_SEC is defined elsewhere; it must be a floating-point
// constant (presumably 1e9), otherwise this is an integer division -- confirm.
avg[i]=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC;
}
// Store the acceptance bounds in outcome[] (upper in [0], lower in [1]).
// calculate_SD overwrites avg[] with squared deviations as a side effect.
calculate_SD(avg,loop);
// Measure again until a sample lies within [outcome[1], outcome[0]], so that
// an outlier is never the value that gets reported.
do
{
clock_gettime(CLOCK_ID, &start);
compute_pi_baseline(N);
clock_gettime(CLOCK_ID, &end);
time=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC;
}while(time>outcome[0] || time<outcome[1]);
// Emit the accepted sample followed by a comma, i.e. as one CSV field.
printf("%lf,", (double) time);
計算標準差的function
static double outcome[2];
void calculate_SD(double *a,int loop)
{
double average,pow_sum,SD,sum;
for(int j=0;j<loop;j++)
{
sum+=a[j];
}
average=sum/loop;
for(int i=0;i<loop;i++)
{
a[i]=pow(a[i]-average,2);
pow_sum+=a[i];
}
SD=sqrt(pow_sum/loop);
outcome[0]=average + 2.0 * SD;
outcome[1]=average - 2.0 * SD;
}
}
因為有運算到平方(pow)以及開根號(sqrt),因此要加入
#include <math.h>
但是之後出現了Error
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DBASELINE -o time_test_baseline
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVX -o time_test_avx
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
/tmp/ccESiGZA.o: In function `calculate_SD':
benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow'
benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt'
collect2: error: ld returned 1 exit status
Makefile:9: recipe for target 'default' failed
make: *** [default] Error 1
查了一下資料發現因為使用了 #include <math.h> 裡的函式,
所以在 Makefile 裡要加上 -lm
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DBASELINE -o time_test_baseline
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVX -o time_test_avx
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
/tmp/ccWy0NQf.o: In function `calculate_SD':
benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow'
benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt'
collect2: error: ld returned 1 exit status
Makefile:9: recipe for target 'default' failed
make: *** [default] Error 1
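但是還是有Error,之後發現-lm
要加在整段最後面。當時我還不知道確切的原因;原因是 linker 會依照命令列上的順序處理輸入檔,函式庫只會用來解析「排在它前面」的檔案中尚未解析的符號,所以 -lm 必須放在引用 pow、sqrt 的 benchmark_clock_gettime.c 之後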
# Build one time_test binary per implementation (selected with a -D macro)
# plus the clock_gettime benchmark. Only the benchmark links libm, because
# benchmark_clock_gettime.c calls pow() and sqrt(); -lm is placed at the end
# of the command, after the files that reference those symbols.
default: computepi.o
$(CC) $(CFLAGS) computepi.o time_test.c -DBASELINE -o time_test_baseline
$(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
$(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
$(CC) $(CFLAGS) computepi.o time_test.c -DAVX -o time_test_avx
$(CC) $(CFLAGS) computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
$(CC) $(CFLAGS) computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime -lm
loop=10
loop=100
loop=1000
和loop=10以及loop=100相比震盪幅度有明顯減低,但是openmp4特別奇怪,在某個時候突然暴增,這個原因還需要再查,目前我無法解釋