Try   HackMD

2016q3 Homework1 (compute-pi)

執行環境

Linux 4.4.0-31-generic #50-Ubuntu
x86_64 x86_64 x86_64 GNU/Linux

L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 3072K

先編譯看看

$ make check

使用 clock_gettime() 來測量不同實做的執行時間
需要等待

benchmark_clock_gettime
time ./time_test_baseline
N = 400000000 , pi = 3.141593
4.37user 0.00system 0:04.37elapsed 99%CPU (0avgtext+0avgdata 1696maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
6.22user 0.00system 0:03.11elapsed 199%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
8.66user 0.00system 0:02.18elapsed 396%CPU (0avgtext+0avgdata 1824maxresident)k
0inputs+0outputs (0major+91minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
1.90user 0.00system 0:01.90elapsed 100%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.73user 0.00system 0:01.73elapsed 99%CPU (0avgtext+0avgdata 1768maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps

接著

$ make gencsv

會將數據輸出至 result_clock_gettime.csv

接下來就是將我們在phonebook所學的繪圖用上的時間啦!!!

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →


Pi 的計算方式

Baseline 版本

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

  • 證明:
    微積分公式

  • 這段程式碼使用黎曼積分

    Image Not Showing Possible Reasons
    • The image file may be corrupted
    • The server hosting the image is unavailable
    • The image path is incorrect
    • The image format is not supported
    Learn More →
    (快速勾起大家高中回憶)
    下面的N表示切成幾塊,如果切越多快就越接近曲線下的面積

double compute_pi_baseline(size_t N) { double pi = 0.0; double dt = 1.0 / N; // dt = (b-a)/N, b = 1, a = 0 for (size_t i = 0; i < N; i++) { double x = (double) i / N; // x = ti = a+(b-a)*i/N = i/N pi += dt / (1.0 + x * x); // integrate 1/(1+x^2), i = 0....N } return pi * 4.0; //因為算出來是pi/4 所以要*4倍 }

OpenMP

  • 跟baseline一樣的積分式求得pi

  • threads間的資料沒有先後運算的差別

  • 程式碼:
    因為這裡使用的積分公式全為加法,可以分區積分後再加總

    011x2+1dt=01/21x2+1dt+1/211x2+1dt

double compute_pi_openmp(size_t N, int threads) { double pi = 0.0; double dt = 1.0 / N; double x; #pragma omp parallel num_threads(threads) { #pragma omp for private(x) reduction(+:pi) for (size_t i = 0; i < N; i++) { x = (double) i / N; pi += dt / (1.0 + x * x); } } return pi * 4.0; }

AVX SIMD

AVX (Advanced Vector Extensions) 是 Intel 一套用來作 Single Instruction Multiple Data 的指令集

  • 128 位 SIMD 暫存器 xmm0 - xmm15 擴展為 256 位的 ymm0 - ymm15 暫存器
  • 支持 256 位的向量運算,由原來 128 位擴展為 256 位
  • 指令可支持最多 4 個 operand,實現目標操作數無需損毀原來的內容

AVX SIMD + Unroll looping

為何 AVX SIMD 的版本沒辦法達到預期 4x 效能提昇呢?

因為浮點 scalar 除法和 vector 除法的延遲,導致沒辦法有效率的執行。考慮到填充 vector instruction 的 pipeline 未能充分使用,我們可以預先展開迴圈,以便使用更多的 scalar 暫存器,從而減少資料相依性。

for (int i = 0; i <= dt - 4; i += 4) {
    ymm3 = _mm256_set1_pd(i * delta);
    ymm3 = _mm256_add_pd(ymm3, ymm2);
    ymm3 = _mm256_mul_pd(ymm3, ymm3);
    ymm3 = _mm256_add_pd(ymm0, ymm3);
    ymm3 = _mm256_div_pd(ymm1, ymm3);
    ymm4 = _mm256_add_pd(ymm4, ymm3);
}

可以看到 ymm3 有很強的相依性,而且看起來無法在縮得更小了。

效能分析圖

我們將數據的範圍調大

gencsv: default
	for i in `seq 100 5000 800000`; do \
		printf "%d," $$i;\
		./benchmark_clock_gettime $$i; \
	done > result_clock_gettime.csv	

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →


Leibniz 版本

Baseline

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

  • 程式:
double compute_pi_Leibniz(size_t N) { double pi = 0.0; double x; for (size_t i = 0; i < N; i++) { x = pow(-1,i) / (2*i +1); pi += x; } return pi * 4.0; }

AVX SIMD

double compute_pi_leibizavx(size_t N) { double pi = 0.0; register __m256d ymm0, ymm1, ymm2, ymm3, ymm4; ymm0 = _mm256_set_pd(1.0,-1.0,1.0,-1.0); //for pow(-1,i) ymm1 = _mm256_set1_pd(2.0); ymm2 = _mm256_set1_pd(1.0); ymm4 = _mm256_setzero_pd(); // sum of pi for (int i = 0; i <= N - 4; i += 4) { ymm3 = _mm256_set_pd(i, i+1.0, i+2.0, i+3.0); //i i+1 i+2 i+3 ymm3 = _mm256_mul_pd(ymm1, ymm3); //2*i 2*(i+1)... ymm3 = _mm256_add_pd(ymm3,ymm2); //2*i+1 2*(i+1)+1 ... ymm3 = _mm256_div_pd(ymm0,ymm3); //(-1)^i/(2*i+1) ... ymm4 = _mm256_add_pd(ymm3,ymm4); //sum } double tmp[4] __attribute__((aligned(32))); _mm256_store_pd(tmp, ymm4); // move packed float64 values to 256-bit aligned memory location pi += tmp[0] + tmp[1] + tmp[2] + tmp[3]; return pi * 4.0; }

效能分析圖

參考資料