每寫一份功課又是學習Makefile的時候=w=
-DBASELINE
等等,其中的-D
指的是#define BASELINE 1
-D
就好了!!make check
結果:
time ./time_test_baseline
N = 400000000 , pi = 3.141593
7.59user 0.00system 0:07.59elapsed 100%CPU (0avgtext+0avgdata 1560maxresident)k
8inputs+0outputs (0major+82minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 400000000 , pi = 3.141593
8.03user 0.00system 0:04.01elapsed 200%CPU (0avgtext+0avgdata 1572maxresident)k
0inputs+0outputs (0major+84minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 400000000 , pi = 3.141593
8.32user 0.00system 0:02.16elapsed 384%CPU (0avgtext+0avgdata 1676maxresident)k
0inputs+0outputs (0major+89minor)pagefaults 0swaps
time ./time_test_avx
N = 400000000 , pi = 3.141593
3.30user 0.00system 0:03.30elapsed 100%CPU (0avgtext+0avgdata 1608maxresident)k
0inputs+0outputs (0major+83minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 400000000 , pi = 3.141593
1.73user 0.00system 0:01.73elapsed 100%CPU (0avgtext+0avgdata 1608maxresident)k
0inputs+0outputs (0major+82minor)pagefaults 0swaps
make gencsv
==> 輸出至表單$$i
for i in `seq 100 5000 25000`; do \
printf "%d," $i;\
./benchmark_clock_gettime $i; \
done > result_clock_gettime.csv
原來除了寫進txt檔之外,還可以寫進libre office的表單裡,好酷!
for(i=100; i<25000; i+=5000)
{
printf("%d",i);
benchmake_clock_gettime(i);
}
然後把結果輸出至.csv檔
細節的語法待會查 先highlight
在Makefile中,在執行各檔時前面多加了time
直接執行time ./time_test_baseline
N = 400000000 , pi = 3.141593
real 0m7.805s
user 0m7.808s
sys 0m0.000s
顯示了三種時間:real,user和sys
Real refers to actual elapsed time; User and Sys refer to CPU time used only by the process.
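單執行緒且幾乎沒有等待 I/O 時,real ≈ user + sys;但多執行緒時 user 是所有執行緒 CPU 時間的總和,所以可能大於 real(例如上面 openmp_2 的結果:8.03user,但 elapsed 只有 4.01 秒)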
/**
 * Approximate pi with a left Riemann sum of 1/(1+x^2) over [0, 1].
 *
 * The interval is cut into N slices of equal width; the integral equals
 * pi/4, so the accumulated area is scaled by 4 before returning.
 *
 * @param N number of slices (sample points x = k/N for k = 0 .. N-1)
 * @return the approximation of pi; 0.0 when N is 0 because no slice is added
 */
double compute_pi_baseline(size_t N)
{
    const double step = 1.0 / N; /* slice width: (1 - 0) / N */
    double area = 0.0;
    size_t k = 0;

    while (k < N) {
        const double xk = (double) k / N; /* left edge of slice k */
        area += step / (1.0 + xk * xk);
        ++k;
    }
    return area * 4.0;
}
time_test.c
中分別傳入2個和4個threads的實驗
/**
 * Approximate pi with the same left Riemann sum of 1/(1+x^2) over [0, 1],
 * with the loop iterations shared among OpenMP threads.
 *
 * Each thread accumulates its own partial area; the reduction clause adds
 * the partial areas together when the loop finishes.
 *
 * @param N       number of slices
 * @param threads number of threads requested through num_threads()
 * @return the approximation of pi
 */
double compute_pi_openmp(size_t N, int threads)
{
    const double step = 1.0 / N;
    double area = 0.0;

    /* The sample point is declared inside the loop, so it is private to
     * each iteration without needing a private() clause. */
    #pragma omp parallel for num_threads(threads) reduction(+:area)
    for (size_t k = 0; k < N; k++) {
        const double xk = (double) k / N;
        area += step / (1.0 + xk * xk);
    }
    return area * 4.0;
}
甚麼是AVX?
prototype:
配上註解與下方的連結終於稍微看懂了~
__attribute__
機制,可以用來設置函數屬性(Function Attribute)、變數屬性(Variable Attribute)和類型屬性(Type Attribute)。aligned
則規定變數或結構的最小對齊格式,以 Byte 為單位。不知道為甚麼沒有highlight@@
/**
 * Approximate pi with a left Riemann sum of 1/(1+x^2) over [0, 1],
 * evaluating four sample points per iteration with 256-bit AVX vectors.
 *
 * Samples that do not fill a whole vector (N % 4 of them) are handled by a
 * scalar loop at the end, so every N gives a complete sum.
 *
 * The target attribute lets the compiler emit AVX instructions for this
 * function even when the file is not built with -mavx; the CPU must still
 * support AVX at run time.
 *
 * @param N number of slices
 * @return the approximation of pi; 0.0 when N is 0 because no slice is added
 */
__attribute__((target("avx")))
double compute_pi_avx(size_t N)
{
    const double dt = 1.0 / N;
    const __m256d ones = _mm256_set1_pd(1.0);
    const __m256d widths = _mm256_set1_pd(dt);
    /* per-lane offsets from the first sample of a group: 3*dt, 2*dt, dt, 0 */
    const __m256d lane_offsets = _mm256_set_pd(dt * 3, dt * 2, dt * 1, 0.0);
    __m256d partial = _mm256_setzero_pd(); /* four running partial sums */
    size_t i = 0;

    /* "N - i >= 4" cannot wrap around because i never exceeds N; the former
     * "i <= N - 4" underflowed for N < 4. */
    for (; N - i >= 4; i += 4) {
        __m256d x = _mm256_add_pd(_mm256_set1_pd(i * dt), lane_offsets);
        __m256d denom = _mm256_add_pd(ones, _mm256_mul_pd(x, x)); /* 1 + x^2 */
        partial = _mm256_add_pd(partial, _mm256_div_pd(widths, denom));
    }

    /* _mm256_store_pd needs a 32-byte aligned destination */
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, partial);
    double pi = tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail: the remaining N % 4 samples */
    for (; i < N; i++) {
        const double x = i * dt;
        pi += dt / (1.0 + x * x);
    }
    return pi * 4.0;
}
__m256d
:256-bit 的向量型別,用來存放 double,因此一次可以放 256/64 = 4 個 double。
aligned(32) 是因為 _mm256_store_pd 要求目的位址必須對齊 32 bytes(256 bits);而 double 有 8 個 byte,宣告 double tmp[4] ==> 8*4 = 32(byte),大小剛好可以放下一整個 __m256d
double compute_pi_Leibniz(size_t N)
{
double pi = 0.0;
double x;
for (size_t i = 0; i < N; i++) {
x = pow(-1,i) / (2*i +1);
pi += x;
}
return pi * 4.0;
}
一直出現錯誤資訊:
應該是link的問題,但是即使加了-lm
還是沒用@@(常見原因是連結順序:-lm 要放在 .c/.o 檔的後面,例如 gcc computepi.c -o computepi -lm,放在前面時 linker 可能不會把 libm 連結進來)
computepi.c:(.text+0xd80): 未定義參考到 pow
collect2: error: ld returned 1 exit status
make: *** [default] Error 1
computepi.c 有included <math.h>嗎? 或許可以試試,我花很多時間處理這個…SarahYuHanCheng
int tmp = (i%2) ? -1 : 1;
x = (double)tmp / (2*i +1) ; //記得轉型
pi += x;
看了一些資料在搭配原有的程式碼,很快就可以寫出來了~ 覺得開心=w=
/**
 * Approximate pi with the first N terms of the Leibniz series
 *   pi/4 = 1 - 1/3 + 1/5 - 1/7 + ...
 * adding four terms per iteration with 256-bit AVX vectors.
 *
 * Terms that do not fill a whole vector (N % 4 of them) are added by a
 * scalar loop at the end, so every N gives a complete partial sum.
 *
 * The target attribute lets the compiler emit AVX instructions for this
 * function even when the file is not built with -mavx; the CPU must still
 * support AVX at run time.
 *
 * @param N number of series terms to add
 * @return four times the partial sum; 0.0 when N is 0
 */
__attribute__((target("avx")))
double compute_pi_leibizavx(size_t N)
{
    /* _mm256_set_pd takes its arguments from the highest lane to the lowest,
     * so these signs pair up with the indices i, i+1, i+2, i+3 built in the
     * loop: + for the even indices, - for the odd ones. */
    const __m256d signs = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
    const __m256d twos = _mm256_set1_pd(2.0);
    const __m256d ones = _mm256_set1_pd(1.0);
    __m256d partial = _mm256_setzero_pd(); /* four running partial sums */
    size_t i = 0;

    /* "N - i >= 4" cannot wrap around because i never exceeds N; the former
     * "i <= N - 4" underflowed for N < 4. */
    for (; N - i >= 4; i += 4) {
        const double base = (double) i;
        __m256d k = _mm256_set_pd(base, base + 1.0, base + 2.0, base + 3.0);
        __m256d denom = _mm256_add_pd(_mm256_mul_pd(twos, k), ones); /* 2k + 1 */
        partial = _mm256_add_pd(_mm256_div_pd(signs, denom), partial);
    }

    /* _mm256_store_pd needs a 32-byte aligned destination */
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, partial);
    double pi = tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail: the remaining N % 4 terms */
    for (; i < N; i++) {
        const double sign = (i % 2) ? -1.0 : 1.0;
        pi += sign / (2.0 * (double) i + 1.0);
    }
    return pi * 4.0;
}
/**
 * Estimate pi by Monte Carlo sampling: draw N points uniformly in the unit
 * square and count how many fall inside the quarter circle x^2 + y^2 <= 1.
 * The hit ratio approximates pi/4.
 *
 * The generator is re-seeded from the current time on every call, so the
 * result differs between runs (calls within the same second share a seed).
 *
 * @param N number of random points to draw
 * @return the estimate of pi; 0.0 when N is 0 (no samples, avoids 0/0)
 */
double compute_pi_MonteCarlo(size_t N)
{
    size_t hits = 0; /* same width as N, so it cannot overflow */

    srand((unsigned) time(NULL));
    if (N == 0) {
        return 0.0;
    }

    for (size_t i = 0; i < N; i++) {
        const double x = (double) rand() / RAND_MAX;
        const double y = (double) rand() / RAND_MAX;
        if (x * x + y * y <= 1) {
            hits++;
        }
    }
    return 4.0 * hits / N;
}
using a:b
: 以第a column當x座標,第b column當y座標。
製圖時產生的warning:
warning: Skipping data file with no valid points
查了一段時間,玄機就在副檔名.csv
和gnuplot他吃數據的方式啊!!
csv : comma separated value
所以資料之間是以 逗號 區隔,因此要在gnuplot script裡加一行set datafile separator ","
才讀的到資料
抖的超厲害!
取樣從N=1000~N=250000,間隔1000
隔一天跑出來的結果差好多@@
omp_4沒那麼劇烈了,還不知道為甚麼會這樣
新增Leibniz
Time
OpenMP
AVX
system embedded
HW1-3