# 2016q3 Homework1(compute-pi) contributed by <`janetwei`> ### Reviewed by <`ierosodin`> * 列出不同function之間, 計算π結果與實際值的誤差圖(理論應相同, 但實際並非) * 嘗試使用不同計算π的數學方式, 例如Leibniz formula或蒙地卡羅 * 可以比較不同thread number對效能的影響 * avx或avx_unroll的實作, code在N的計數上是有殘缺 ## Gnuplot 因為這次所做的圖是.csv檔,跟之前不太一樣,因此和phonebook中runtime.c的寫法有一些的差別 我建了一個runtime.gp裡面放make plot 的指令 在Makefile裡面增加以下程式碼 ``` plot: result_clock_gettime.csv gnuplot runtime.gp ``` Makefile其中的gencsv seq的用法為 起始值 間隔 終止值 ``` gencsv: default for i in `seq 100 500 250000`; do \ printf "%d," $$i;\ ./benchmark_clock_gettime $$i; \ done > result_clock_gettime.csv ``` 因為.csv檔,所以要加上這個指令,不然會出錯 `set datafile separator ","` ``` set xlabel 'N' set ylabel 'Time(sec)' set style fill solid set title 'Compute_pi Time by clock_gettime() ' set term png enhanced font 'Verdana,10' set output 'runtime.png' set datafile separator "," plot "result_clock_gettime.csv" using 1:2 title 'baseline' with lines lt rgb 'red' , \ "result_clock_gettime.csv" using 1:3 title 'openmp_2' with lines lt rgb 'blue' , \ "result_clock_gettime.csv" using 1:4 title 'openmp_4' with lines lt rgb 'green' ,\ "result_clock_gettime.csv" using 1:5 title 'AVX' with lines lt rgb 'orange' ,\ "result_clock_gettime.csv" using 1:6 title 'AVX + loop untolling' with lines lt rgb 'brown' ``` ![](https://i.imgur.com/P2DSyRY.png) [參考同學的共筆](https://hackmd.io/EwRghgzBAsCcsFpoBMDGykAZUA4ECMRN8DMRpoRkBWWuIA) ## Avx ```C double compute_pi_avx(size_t N) { double pi = 0.0; double dt = 1.0 / N; register __m256d ymm0, ymm1, ymm2, ymm3, ymm4; ymm0 = _mm256_set1_pd(1.0); ymm1 = _mm256_set1_pd(dt); ymm2 = _mm256_set_pd(dt * 3, dt * 2, dt * 1, 0.0); ymm4 = _mm256_setzero_pd(); // sum of pi for (int i = 0; i <= N - 4; i += 4) { ymm3 = _mm256_set1_pd(i * dt); // i*dt, i*dt, i*dt, i*dt ymm3 = _mm256_add_pd(ymm3, ymm2); // x = i*dt+3*dt, i*dt+2*dt, i*dt+dt, i*dt+0.0 ymm3 = _mm256_mul_pd(ymm3, ymm3); // x^2 = (i*dt+3*dt)^2, (i*dt+2*dt)^2, ... 
ymm3 = _mm256_add_pd(ymm0, ymm3); // 1+x^2 = 1+(i*dt+3*dt)^2, 1+(i*dt+2*dt)^2, ... ymm3 = _mm256_div_pd(ymm1, ymm3); // dt/(1+x^2) ymm4 = _mm256_add_pd(ymm4, ymm3); // pi += dt/(1+x^2) } double tmp[4] __attribute__((aligned(32))); _mm256_store_pd(tmp, ymm4); // move packed float64 values to 256-bit aligned memory location pi += tmp[0] + tmp[1] + tmp[2] + tmp[3]; return pi * 4.0; } ``` Synopsis - __m256d _mm256_set1_pd (double a) #include "immintrin.h" CPUID Flags: AVX Description - Broadcast double-precision (64-bit) floating-point value a to all elements of dst. Operation - FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0 __attribute__((aligned(32))):編譯器將以32字節(注意是byte不是bit)對齊的方式分配一個變量 32byte=256bit [attribute參考資料](http://huenlil.pixnet.net/blog/post/26078382-%5B%E8%BD%89%5Dgnu-c-__attribute__-%E6%A9%9F%E5%88%B6%E7%B0%A1%E4%BB%8B) load:從 memory 到 register store:從 register 到 memory move:從 register 到 register ## 信賴區間 將time取95%的信賴區間 Lower Endpoint = average − 2 * σ Upper Endpoint = average + 2 * σ ``` struct timespec start = {0, 0}; struct timespec end = {0, 0}; if (argc < 2) return -1; int N = atoi(argv[1]); int i, loop = 1000; double avg[loop]; double time; // Baseline for(i = 0; i < loop; i++) { clock_gettime(CLOCK_ID, &start); compute_pi_baseline(N); clock_gettime(CLOCK_ID, &end); avg[i]=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC; } calculate_SD(avg,loop); do { clock_gettime(CLOCK_ID, &start); compute_pi_baseline(N); clock_gettime(CLOCK_ID, &end); time=(end.tv_sec - start.tv_sec) +(end.tv_nsec - start.tv_nsec)/ONE_SEC; }while(time>outcome[0] || time<outcome[1]); printf("%lf,", (double) time); ``` 計算標準差的function ``` static double outcome[2]; void calculate_SD(double *a,int loop) { double average,pow_sum,SD,sum; for(int j=0;j<loop;j++) { sum+=a[j]; } average=sum/loop; for(int i=0;i<loop;i++) { a[i]=pow(a[i]-average,2); pow_sum+=a[i]; } SD=sqrt(pow_sum/loop); outcome[0]=average + 2.0 * SD; outcome[1]=average - 2.0 * 
SD; } } ``` 因為有運算到平方(pow)以及開根號(sqrt),因此要加入 `#include <math.h>` 但是之後出現了Error ``` cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DBASELINE -o time_test_baseline cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2 cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4 cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVX -o time_test_avx cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll cc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime /tmp/ccESiGZA.o: In function `calculate_SD': benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow' benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt' collect2: error: ld returned 1 exit status Makefile:9: recipe for target 'default' failed make: *** [default] Error 1 ``` 查了一下資料發現因為`#include <math.h>`所以在Makefile裡要加上-lm ``` cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DBASELINE -o time_test_baseline cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2 cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4 cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVX -o time_test_avx cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll cc -O0 -std=gnu99 -Wall -fopenmp -mavx -lm computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime /tmp/ccWy0NQf.o: In function `calculate_SD': benchmark_clock_gettime.c:(.text+0xd6): undefined reference to `pow' benchmark_clock_gettime.c:(.text+0x129): undefined reference to `sqrt' collect2: error: ld returned 1 exit status Makefile:9: recipe for target 'default' failed make: *** [default] Error 1 ``` 但是還是有Error,之後發現`-lm`要加在整段最後面,但是我還是不知道確切的原因 (補充:連結器會依命令列由左至右解析符號,函式庫必須放在引用它的原始檔/目的檔之後;若`-lm`寫在前面,處理到它時還沒有任何對`pow`、`sqrt`的未解析引用,libm就不會被納入連結) ``` default: computepi.o $(CC) 
$(CFLAGS) computepi.o time_test.c -DBASELINE -o time_test_baseline $(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2 $(CC) $(CFLAGS) computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4 $(CC) $(CFLAGS) computepi.o time_test.c -DAVX -o time_test_avx $(CC) $(CFLAGS) computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll $(CC) $(CFLAGS) computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime -lm ``` loop=10 ![](https://i.imgur.com/rWcpIxB.png) loop=100 ![](https://i.imgur.com/1r0C4Jl.png) loop=1000 ![](https://i.imgur.com/1WyU89Y.png) 和loop=10以及loop=100相比震盪幅度有明顯減低,但是openmp4特別奇怪,在某個時候突然暴增,這個原因還需要再查,目前我無法解釋