contributed by < illusion030
>
rayleigh0407
>illusion030@illusion030-X550LD:~/Desktop/2017sysprog/compute-pi$ lscpu
Architecture: x86_64
CPU 作業模式: 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
每核心執行緒數:2
每通訊端核心數:2
Socket(s): 1
NUMA 節點: 1
供應商識別號: GenuineIntel
CPU 家族: 6
型號: 69
Model name: Intel(R) Core(TM) i5-4200U CPU @ 1.60GHz
製程: 1
CPU MHz: 1479.754
CPU max MHz: 2600.0000
CPU min MHz: 800.0000
BogoMIPS: 4588.93
虛擬: VT-x
L1d 快取: 32K
L1i 快取: 32K
L2 快取: 256K
L3 快取: 3072K
NUMA node0 CPU(s): 0-3
$ git clone https://github.com/illusion030/compute-pi
$ cd compute-pi
$ make check
time ./time_test_baseline
N = 400000000 , pi = 3.141593
4.47user 0.00system 0:04.47elapsed 99%CPU (0avgtext+0avgdata 1812maxresident)k
0inputs+0outputs (0major+84minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
4.91user 0.00system 0:02.46elapsed 199%CPU (0avgtext+0avgdata 1796maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
9.35user 0.00system 0:02.46elapsed 380%CPU (0avgtext+0avgdata 1812maxresident)k
0inputs+0outputs (0major+89minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
1.27user 0.00system 0:01.27elapsed 99%CPU (0avgtext+0avgdata 1792maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.22user 0.00system 0:01.23elapsed 99%CPU (0avgtext+0avgdata 1704maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
Makefile 裡統計資料的程式碼
gencsv: default
for i in `seq 100 5000 25000`; do \
printf "%d," $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
for 迴圈裡是指從 100 開始,每次 +5000,加到 25000,所以會畫 100、5100、10100、15100、20100 總共五次
請修改 X label 排版,參考美圖欣賞第三張
參考廖健富提供的詳盡分析
error rate = | Approximate Value − Exact Value | / |Exact Value|
用 N=100:5000:50000 畫出 perfomance 跟 error rate 的比較
發現 anx+unroll 的 error rate 會浮動
再看細一點
用 baseline 的 error rate 小於 0.0001
double compute_pi_leibniz(size_t N)
{
double pi = 0.0;
double one = 1.0;
for(size_t i = 0; i < N; i++){
pi += one / ( 2 * i + 1);
one = -one;
}
return pi * 4;
}
time ./time_test_leibniz
N = 400000000 , pi = 3.141593
2.18user 0.00system 0:02.18elapsed 99%CPU (0avgtext+0avgdata 1716maxresident)k
0inputs+0outputs (0major+82minor)pagefaults 0swaps
時間比 baseline 還快
double compute_pi_leibniz_avx(size_t N)
{
double pi = 0.0;
register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5;
ymm0 = _mm256_set1_pd(2.0);
ymm1 = _mm256_set1_pd(1.0);
ymm2 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
ymm4 = _mm256_setzero_pd();
ymm5 = _mm256_set_pd(0.0, 1.0, 2.0, 3.0);
for(size_t i = 0; i <= N - 4; i += 4) {
ymm3 = _mm256_set1_pd(i);
ymm3 = _mm256_add_pd(ymm3, ymm5);
ymm3 = _mm256_mul_pd(ymm3, ymm0);
ymm3 = _mm256_add_pd(ymm3, ymm1);
ymm3 = _mm256_div_pd(ymm2, ymm3);
ymm4 = _mm256_add_pd(ymm4, ymm3);
}
double tmp[4] __attribute__((aligned(32)));
_mm256_store_pd(tmp, ymm4);
pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
return pi * 4.0;
}
time ./time_test_leibniz_avx
N = 400000000 , pi = 3.141593
1.22user 0.00system 0:01.22elapsed 100%CPU (0avgtext+0avgdata 1720maxresident)k
0inputs+0outputs (0major+82minor)pagefaults 0swaps
速度增快了