Linux 4.4.0-31-generic #50-Ubuntu
x86_64 x86_64 x86_64 GNU/Linux
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 3072K
$ make check
使用 clock_gettime() 來測量不同實作的執行時間
需要等待
benchmark_clock_gettime
time ./time_test_baseline
N = 400000000 , pi = 3.141593
4.37user 0.00system 0:04.37elapsed 99%CPU (0avgtext+0avgdata 1696maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
6.22user 0.00system 0:03.11elapsed 199%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
8.66user 0.00system 0:02.18elapsed 396%CPU (0avgtext+0avgdata 1824maxresident)k
0inputs+0outputs (0major+91minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
1.90user 0.00system 0:01.90elapsed 100%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.73user 0.00system 0:01.73elapsed 99%CPU (0avgtext+0avgdata 1768maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
接著
$ make gencsv
會將數據輸出至 result_clock_gettime.csv
接下來就是把我們在 phonebook 作業所學的繪圖技巧派上用場的時候啦!!!
證明:根據微積分公式
$$\int_0^1 \frac{1}{1+x^2}\,dx = \arctan(1) - \arctan(0) = \frac{\pi}{4}$$
這段程式碼使用黎曼積分(矩形法)來近似上述定積分
/*
 * Approximate pi by a Riemann sum of 1/(1+x^2) over [0, 1].
 *
 * The definite integral of 1/(1+x^2) on [0,1] equals pi/4, so the
 * accumulated sum is scaled by 4 before returning.
 *
 * N: number of rectangles (more rectangles -> better approximation).
 * Returns the approximation of pi.
 */
double compute_pi_baseline(size_t N)
{
    const double width = 1.0 / N; /* rectangle width: (b - a) / N with a = 0, b = 1 */
    double quarter_pi = 0.0;
    for (size_t k = 0; k < N; k++) {
        double t = (double) k / N;           /* sample point t_k = a + (b-a)*k/N = k/N */
        quarter_pi += width / (1.0 + t * t); /* integrand 1/(1+t^2) */
    }
    return 4.0 * quarter_pi; /* the sum approximates pi/4 */
}
跟baseline一樣的積分式求得pi
threads間的資料沒有先後運算的差別
程式碼:
因為這裡使用的積分公式全為加法,可以分區積分後再加總
/*
 * Approximate pi with the same Riemann sum as compute_pi_baseline,
 * splitting the N independent iterations across `threads` OpenMP threads.
 * The partial sums are combined with an OpenMP reduction, so the result
 * is the same as the serial version (up to floating-point summation order).
 *
 * Fix: the original declared a function-scope `double x` shared by all
 * threads and relied solely on the `private(x)` clause to avoid a data
 * race. Declaring x inside the loop body makes it thread-private by
 * construction and removes the error-prone clause.
 *
 * N:       number of rectangles.
 * threads: number of OpenMP threads to use.
 * Returns the approximation of pi.
 */
double compute_pi_openmp(size_t N, int threads)
{
    double pi = 0.0;
    double dt = 1.0 / N;
    #pragma omp parallel num_threads(threads)
    {
        #pragma omp for reduction(+:pi)
        for (size_t i = 0; i < N; i++) {
            double x = (double) i / N;  /* loop-local: private per thread */
            pi += dt / (1.0 + x * x);
        }
    }
    return pi * 4.0;
}
AVX (Advanced Vector Extensions) 是 Intel 一套用來作 Single Instruction Multiple Data 的指令集
為何 AVX SIMD 的版本沒辦法達到預期 4x 效能提昇呢?
因為浮點 scalar 除法和 vector 除法的延遲,導致沒辦法有效率的執行。考慮到填充 vector instruction 的 pipeline 未能充分使用,我們可以預先展開迴圈,以便使用更多的 scalar 暫存器,從而減少資料相依性。
for (int i = 0; i <= dt - 4; i += 4) {
ymm3 = _mm256_set1_pd(i * delta);
ymm3 = _mm256_add_pd(ymm3, ymm2);
ymm3 = _mm256_mul_pd(ymm3, ymm3);
ymm3 = _mm256_add_pd(ymm0, ymm3);
ymm3 = _mm256_div_pd(ymm1, ymm3);
ymm4 = _mm256_add_pd(ymm4, ymm3);
}
可以看到 ymm3 有很強的資料相依性,而且看起來無法再縮得更小了。
我們將數據的範圍調大
gencsv: default
for i in `seq 100 5000 800000`; do \
printf "%d," $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
/*
 * Approximate pi with the Leibniz series:
 *   pi/4 = sum_{i=0..N-1} (-1)^i / (2i + 1)
 *
 * Fix: the original computed the alternating sign with pow(-1, i) — a
 * transcendental libm call on every iteration just to flip a sign. A
 * parity test on i is exact and far cheaper, with identical results.
 *
 * N: number of series terms (convergence is slow: error ~ 1/N).
 * Returns the approximation of pi.
 */
double compute_pi_Leibniz(size_t N)
{
    double pi = 0.0;
    for (size_t i = 0; i < N; i++) {
        double sign = (i & 1) ? -1.0 : 1.0; /* (-1)^i */
        pi += sign / (double) (2 * i + 1);
    }
    return pi * 4.0;
}
double compute_pi_leibizavx(size_t N)
{
double pi = 0.0;
register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
ymm0 = _mm256_set_pd(1.0,-1.0,1.0,-1.0); //for pow(-1,i)
ymm1 = _mm256_set1_pd(2.0);
ymm2 = _mm256_set1_pd(1.0);
ymm4 = _mm256_setzero_pd(); // sum of pi
for (int i = 0; i <= N - 4; i += 4) {
ymm3 = _mm256_set_pd(i, i+1.0, i+2.0, i+3.0); //i i+1 i+2 i+3
ymm3 = _mm256_mul_pd(ymm1, ymm3); //2*i 2*(i+1)...
ymm3 = _mm256_add_pd(ymm3,ymm2); //2*i+1 2*(i+1)+1 ...
ymm3 = _mm256_div_pd(ymm0,ymm3); //(-1)^i/(2*i+1) ...
ymm4 = _mm256_add_pd(ymm3,ymm4); //sum
}
double tmp[4] __attribute__((aligned(32)));
_mm256_store_pd(tmp, ymm4); // move packed float64 values to 256-bit aligned memory location
pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
return pi * 4.0;
}