Linux 4.4.0-31-generic #50-Ubuntu
x86_64 x86_64 x86_64 GNU/Linux
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 3072K
$ make check
使用 clock_gettime() 來測量不同實作的執行時間
需要等待
benchmark_clock_gettime
time ./time_test_baseline
N = 400000000 , pi = 3.141593
4.37user 0.00system 0:04.37elapsed 99%CPU (0avgtext+0avgdata 1696maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
6.22user 0.00system 0:03.11elapsed 199%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
8.66user 0.00system 0:02.18elapsed 396%CPU (0avgtext+0avgdata 1824maxresident)k
0inputs+0outputs (0major+91minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
1.90user 0.00system 0:01.90elapsed 100%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.73user 0.00system 0:01.73elapsed 99%CPU (0avgtext+0avgdata 1768maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
接著
$ make gencsv
會將數據輸出至 result_clock_gettime.csv
接下來就是把我們在 phonebook 作業所學的繪圖技巧派上用場的時候啦!!!
證明:根據微積分公式
$$\int_0^1 \frac{1}{1+x^2}\,dx = \arctan(1) - \arctan(0) = \frac{\pi}{4}$$
這段程式碼使用黎曼積分(矩形法)來近似上述定積分
/*
 * Approximate pi by a Riemann sum of 1/(1+x^2) over [0, 1].
 *
 * The definite integral of 1/(1+x^2) on [0,1] equals pi/4, so the
 * accumulated sum is scaled by 4 before returning.
 *
 * N: number of rectangles (more rectangles -> better approximation).
 * Returns the approximation of pi.
 */
double compute_pi_baseline(size_t N)
{
    const double width = 1.0 / N; /* rectangle width: (b - a) / N with a = 0, b = 1 */
    double quarter_pi = 0.0;
    for (size_t k = 0; k < N; k++) {
        double t = (double) k / N;           /* sample point t_k = a + (b-a)*k/N = k/N */
        quarter_pi += width / (1.0 + t * t); /* integrand 1/(1+t^2) */
    }
    return 4.0 * quarter_pi; /* the sum approximates pi/4 */
}
跟baseline一樣的積分式求得pi
threads間的資料沒有先後運算的差別
程式碼:
因為這裡使用的積分公式全為加法,可以分區積分後再加總
/*
 * Approximate pi with the same Riemann sum as compute_pi_baseline,
 * splitting the N independent iterations across `threads` OpenMP threads.
 * The partial sums are combined with an OpenMP reduction, so the result
 * is the same as the serial version (up to floating-point summation order).
 *
 * Fix: the original declared a function-scope `double x` shared by all
 * threads and relied solely on the `private(x)` clause to avoid a data
 * race. Declaring x inside the loop body makes it thread-private by
 * construction and removes the error-prone clause.
 *
 * N:       number of rectangles.
 * threads: number of OpenMP threads to use.
 * Returns the approximation of pi.
 */
double compute_pi_openmp(size_t N, int threads)
{
    double pi = 0.0;
    double dt = 1.0 / N;
    #pragma omp parallel num_threads(threads)
    {
        #pragma omp for reduction(+:pi)
        for (size_t i = 0; i < N; i++) {
            double x = (double) i / N;  /* loop-local: private per thread */
            pi += dt / (1.0 + x * x);
        }
    }
    return pi * 4.0;
}
AVX (Advanced Vector Extensions) 是 Intel 一套用來作 Single Instruction Multiple Data 的指令集
為何 AVX SIMD 的版本沒辦法達到預期 4x 效能提昇呢?
因為浮點 scalar 除法和 vector 除法的延遲,導致沒辦法有效率的執行。考慮到填充 vector instruction 的 pipeline 未能充分使用,我們可以預先展開迴圈,以便使用更多的 scalar 暫存器,從而減少資料相依性。
for (int i = 0; i <= dt - 4; i += 4) {
ymm3 = _mm256_set1_pd(i * delta);
ymm3 = _mm256_add_pd(ymm3, ymm2);
ymm3 = _mm256_mul_pd(ymm3, ymm3);
ymm3 = _mm256_add_pd(ymm0, ymm3);
ymm3 = _mm256_div_pd(ymm1, ymm3);
ymm4 = _mm256_add_pd(ymm4, ymm3);
}
可以看到 ymm3 有很強的資料相依性,而且看起來無法再縮得更小了。
我們將數據的範圍調大
gencsv: default
for i in `seq 100 5000 800000`; do \
printf "%d," $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
/*
 * Approximate pi with the Leibniz series:
 *   pi/4 = sum_{i=0..N-1} (-1)^i / (2i + 1)
 *
 * Fix: the original computed the alternating sign with pow(-1, i) — a
 * transcendental libm call on every iteration just to flip a sign. A
 * parity test on i is exact and far cheaper, with identical results.
 *
 * N: number of series terms (convergence is slow: error ~ 1/N).
 * Returns the approximation of pi.
 */
double compute_pi_Leibniz(size_t N)
{
    double pi = 0.0;
    for (size_t i = 0; i < N; i++) {
        double sign = (i & 1) ? -1.0 : 1.0; /* (-1)^i */
        pi += sign / (double) (2 * i + 1);
    }
    return pi * 4.0;
}
double compute_pi_leibizavx(size_t N)
{
double pi = 0.0;
register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
ymm0 = _mm256_set_pd(1.0,-1.0,1.0,-1.0); //for pow(-1,i)
ymm1 = _mm256_set1_pd(2.0);
ymm2 = _mm256_set1_pd(1.0);
ymm4 = _mm256_setzero_pd(); // sum of pi
for (int i = 0; i <= N - 4; i += 4) {
ymm3 = _mm256_set_pd(i, i+1.0, i+2.0, i+3.0); //i i+1 i+2 i+3
ymm3 = _mm256_mul_pd(ymm1, ymm3); //2*i 2*(i+1)...
ymm3 = _mm256_add_pd(ymm3,ymm2); //2*i+1 2*(i+1)+1 ...
ymm3 = _mm256_div_pd(ymm0,ymm3); //(-1)^i/(2*i+1) ...
ymm4 = _mm256_add_pd(ymm3,ymm4); //sum
}
double tmp[4] __attribute__((aligned(32)));
_mm256_store_pd(tmp, ymm4); // move packed float64 values to 256-bit aligned memory location
pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
return pi * 4.0;
}