github contributed by <Diana Ho
>
d0651
sys
Ubuntu 14.04 LTS
/* Time a computation with clock(): take one reading before and one after. */
clock_t begin = clock();
compute_pi; /* NOTE(review): presumably a placeholder for the compute_pi_* call being timed */
clock_t end = clock();
/* The tick difference divided by CLOCKS_PER_SEC gives the duration in seconds. */
double time = (double)(end - begin) / CLOCKS_PER_SEC;
/*
 * Time value split into whole seconds plus a nanosecond remainder;
 * this is the type clock_gettime() fills in.
 */
struct timespec {
    time_t tv_sec;  /* seconds */
    long tv_nsec;   /* nanoseconds */
};
/* Time a computation with clock_gettime(): one reading before, one after. */
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
compute_pi; /* NOTE(review): presumably a placeholder for the compute_pi_* call being timed */
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
/*
 * Convert each part to double BEFORE dividing: tv_nsec is a long, so
 * dividing the nanosecond difference by the integer 1000000000 would
 * always truncate to 0 and throw away the sub-second part.
 */
double time = (double)(end.tv_sec - start.tv_sec)
              + (double)(end.tv_nsec - start.tv_nsec) / 1e9;
預期執行 $ make plot
後,可透過 gnuplot 產生效能分析比較圖表
$ make check
make: *** [check] Error 132
$ make gencsv
Illegal instruction (core dumped)
Illegal instruction (core dumped)
Illegal instruction (core dumped)
Illegal instruction (core dumped)
Illegal instruction (core dumped)
make: *** [gencsv] Error 132
$ time ./time_test_avxunroll
不合法的命令 (core dumped)
real 0m1.219s
user 0m0.000s
sys 0m0.000s
參考
如果在跑 make check 時出現 Illegal instruction (core dumped) 的回應,代表這台電腦的 CPU 不支援 AVX SIMD 指令集
不知道是什麼原因造成無法順利執行
舊電腦的開發環境:
$ cat /etc/issue
$ cat /proc/version
gcc -c -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.c -o computepi.o
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DBASELINE -o time_test_baseline
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVX -o time_test_avx
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
time ./time_test_baseline
N = 400000000 , pi = 3.141593
4.90user 0.00system 0:04.90elapsed 99%CPU (0avgtext+0avgdata 1760maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
5.04user 0.00system 0:02.52elapsed 199%CPU (0avgtext+0avgdata 1780maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
5.54user 0.00system 0:01.49elapsed 370%CPU (0avgtext+0avgdata 1784maxresident)k
0inputs+0outputs (0major+95minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
1.57user 0.00system 0:01.58elapsed 99%CPU (0avgtext+0avgdata 1768maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.70user 0.00system 0:01.70elapsed 99%CPU (0avgtext+0avgdata 1688maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DBASELINE -o time_test_baseline
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_2 -o time_test_openmp_2
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DOPENMP_4 -o time_test_openmp_4
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVX -o time_test_avx
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o time_test.c -DAVXUNROLL -o time_test_avxunroll
gcc -O0 -std=gnu99 -Wall -fopenmp -mavx computepi.o benchmark_clock_gettime.c -o benchmark_clock_gettime
for i in `seq 100 5000 25000`; do \
printf "%d," $i;\
./benchmark_clock_gettime $i; \
done > result_clock_gettime.csv
Introduction to Intel® Advanced Vector Extensions
The hardware supporting Intel® AVX (and FMA) consists of the 16 256-bit YMM registers YMM0-YMM15 and a 32-bit control/status register called MXCSR.
在 runtime.gp 中編輯繪圖語法
在 Makefile 中定義製圖的 rule
plot: default
gnuplot runtime.gp
$ make plot
產生runtime.png
$ eog runtime.png
看到圖像原來程式碼輸出的圖形(25倍時間),明顯看出資料分佈抖動的很厲害
clock 疊代了25次,所以應將程式改為取25次運算時間之平均
/*
 * Time `loop` back-to-back runs of compute_pi_baseline(N) and print the
 * mean time per run in seconds.
 */
clock_gettime(CLOCK_ID, &start);
for (i = 0; i < loop; i++) {
    compute_pi_baseline(N);
}
clock_gettime(CLOCK_ID, &end);
/*
 * Both differences are converted to double before any division so the
 * nanosecond part is not truncated if ONE_SEC is an integer constant,
 * and the whole elapsed time (not just part of it) is divided by `loop`.
 */
printf("%lf", ((double)(end.tv_sec - start.tv_sec)
               + (double)(end.tv_nsec - start.tv_nsec) / ONE_SEC) / loop);
用信賴區間消去極端值
再將X軸取logscale
由樣本資料定義一段數值區間,宣稱有多少信心以估計母體的參數包含於此區間內。
該數值區間上、下限稱為信賴界限(confidence limit)。
用以估計的信心程度稱為信賴(心)水準(confidence level)。
clock()
clock_gettime()
data 飄動, 嘗試增加 Makefile 定義 data 的資料數量:
for i in `seq 100 100 25000`; do \
printf "%d " $$i;\
./benchmark_clock_gettime $$i; \
done > result_clock_gettime.csv
The higher the error rate, the less reliable the connection or data transfer will be.
<math.h>
中的M_PI與computepi中的計算結果對比
/*
 * Approximate pi as 4 times the left Riemann sum of 1/(1+x^2) over [0, 1],
 * using N slices of equal width.
 *
 * N: number of slices; with N == 0 the loop does not run and 0.0 is returned.
 */
double compute_pi_baseline(size_t N)
{
    const double width = 1.0 / N;   /* slice width: (b - a) / N with a = 0, b = 1 */
    double sum = 0.0;
    size_t k = 0;

    while (k < N) {
        const double xk = (double) k / N;   /* left edge of slice k: a + (b - a) * k / N */
        sum += width / (1.0 + xk * xk);
        k++;
    }

    return sum * 4.0;
}
/*
 * Approximate pi with the first N terms of the Leibniz series:
 *   pi / 4 = 1 - 1/3 + 1/5 - 1/7 + ...
 *
 * N: number of terms to sum; N == 0 returns 0.0.
 *
 * The index is a size_t to match N (no signed/unsigned comparison) and the
 * denominator is built in double, so large N cannot overflow an int.
 */
double compute_pi_leibniz(size_t N)
{
    double sum = 0.0;

    for (size_t i = 0; i < N; i++) {
        const double sign = (i % 2) ? -1.0 : 1.0;   /* terms alternate +, -, +, ... */
        sum += sign / (2.0 * (double) i + 1.0);
    }

    return sum * 4.0;
}
…
double compute_pi_euler(size_t N)
{
double pi = 0.0;
for(int i=0; i<N; i++)
pi += (double)(1 / pow(i, 2));
pi *= 6;
return sqrt(pi);
}
若呼叫 <math.h> 中的函式(例如 pow()、sqrt()),需要花的時間更長
…
…
/*
 * Monte Carlo estimate of pi: draw N points (x, y) with each coordinate
 * scaled into [0, 1] by rand() / RAND_MAX, count those with
 * x^2 + y^2 <= 1, and print 4 * count / N.
 *
 * NOTE(review): N is not declared in this block; presumably it is a macro
 * or global defined elsewhere in the file.
 */
int main(void)
{
    size_t count = 0;   /* # of points in the 1st quadrant of the unit circle */

    for (size_t i = 0; i < N; i++) {
        const double x = (double) rand() / RAND_MAX;
        const double y = (double) rand() / RAND_MAX;

        if (x * x + y * y <= 1) {
            count++;
        }
    }

    /* Divide by the same N that bounds the loop. */
    printf("pi = %lf \n", (double) count / N * 4);
    return 0;
}
…
SIMD
SIMD = Single Instruction Multiple Data,主要是讓一個指令同時對多筆資料做相同的運算:把原本要逐一處理的數值打包進向量暫存器,一次平行計算完後再統整成最後結果,加速執行速度
#pragma
)