contributed by<yanang
><donbader
>
yanang
donbader
github-yanang & github-donbader / youtube
比照 B03: compute-pi 要求,特別留意測量時間的機制 (Wall-clock time vs. CPU time)
$ make gencsv
以外的輸出機制,預期執行 $ make plot
後,可透過 gnuplot 產生前述多種實做的效能分析比較圖表當 error rate 收斂時 –- run 的次數
當 error rate 收斂時 –- run 的時間
double compute_pi_baseline(size_t N)
{
double pi = 0.0;
double dt = 1.0 / N; // dt = (b-a)/N, b = 1, a = 0
for (size_t i = 0; i < N; i++) {
double x = (double) i / N; // x = ti = a+(b-a)*i/N = i/N
pi += dt / (1.0 + x * x); // integrate 1/(1+x^2), i = 0....N
}
return pi * 4.0;
}
double compute_pi_openmp(size_t N, int threads)
{
double pi = 0.0;
double dt = 1.0 / N;
double x;
#pragma omp parallel for num_threads(threads) private(x) reduction(+:pi)
for (size_t i = 0; i < N; i++) {
x = (double) i / N;
pi += dt / (1.0 + x * x);
}
return pi * 4.0;
}
比較 thread 數量
double compute_pi_omp_simd(size_t N, int threads)
{
double pi = 0.0;
double dt = 1.0 / N;
double x;
#pragma omp parallel for simd num_threads(threads) private(x) reduction(+:pi)
for (size_t i = 0; i < N; ++i) {
x = (double) i / N;
pi += dt / (1.0 + x * x);
}
return pi * 4.0;
}
比較 thread 數量
double compute_pi_avx(size_t N)
{
double pi = 0.0;
double dt = 1.0 / N;
register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
ymm0 = _mm256_set1_pd(1.0);
ymm1 = _mm256_set1_pd(dt);
ymm2 = _mm256_set_pd(dt * 3, dt * 2, dt * 1, 0.0);
ymm4 = _mm256_setzero_pd(); // sum of pi
for (int i = 0; i <= N - 4; i += 4) {
ymm3 = _mm256_set1_pd(i * dt); // i*dt, i*dt, i*dt, i*dt
ymm3 = _mm256_add_pd(ymm3, ymm2); // x = i*dt+3*dt, i*dt+2*dt, i*dt+dt, i*dt+0.0
ymm3 = _mm256_mul_pd(ymm3, ymm3); // x^2 = (i*dt+3*dt)^2, (i*dt+2*dt)^2, ...
ymm3 = _mm256_add_pd(ymm0, ymm3); // 1+x^2 = 1+(i*dt+3*dt)^2, 1+(i*dt+2*dt)^2, ...
ymm3 = _mm256_div_pd(ymm1, ymm3); // dt/(1+x^2)
ymm4 = _mm256_add_pd(ymm4, ymm3); // pi += dt/(1+x^2)
}
double tmp[4] __attribute__((aligned(32)));
_mm256_store_pd(tmp, ymm4); // move packed float64 values to 256-bit aligned memory location
pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
return pi * 4.0;
}
gnuplot with lines
gnuplot smooth bezier
Linux man page :
The clock() function returns an approximation of processor time used by the program
linux man page :
The times() function, which explicitly returns (separate) information about the caller and its children, may be preferable
linux man page :
times() returns the number of clock ticks that have elapsed since an arbitrary point in the past. The return value may overflow the possible range of type clock_t
struct tms {
clock_t tms_utime; /* user time */
clock_t tms_stime; /* system time */
clock_t tms_cutime; /* user time of children */
clock_t tms_cstime; /* system time of children */
}
The functions clock_gettime() retrieve and set the time of the specified clock clk_id
當 clock_gettime() clock_id 指定為 CLOCK_REALTIME 時,它與 gettimeofday 完全一樣
只不過它返回的是奈秒,而 gettimeofday 返回的是微秒
為什麼 clock_gettime() 結果飄忽不定?
function的執行時間單位以 nanoseconds 來決定,執行時間很短時並無法排除 Linux 背景處理其他事情,導致額外的 cache miss ,所以需要很多針對某一筆參數作累積再平均,才能避免飄忽不定
繪圖方面可以使用以下方法去除雜訊:
為什麼 time() 和 gettimeofday() 不適合拿來作 benchmark ?
time() 和 gettimeofday() 未必是單調遞增的。
NTP 會使得 wall-clock 加速或減速以對齊(drift)到正確的時間,當它在對齊時,抓到的時間並不能完全符合真實的時間流逝。
在 MathJax 語法中可使用 &= 對齊式子,不過 hackmd 似乎不支援 yanag
除了 &= 前後還須加上 begin{align} end{align} yanag
pf :
pf :
by using Euler's infinite product for sine
Nilakantha 比 Leibniz 有更快的計算精準率,N=3 時 Leibniz(pi) = 2.8952… 而 Nilakantha(pi) = 3.1452… wiki
據 function 每多計算一位可增加14為有效小數
方法一 : N = N+1 每次做 compute_pi(N) 都判斷 error rate 是否符合要求
// judge function
int judge(int n) {
double pi = compute_pi_formula(formula_id, n);
double error = abs(pi - M_PI);
double error_diff = error - max_error;
return error_diff > 0 ? 1 : error_diff < 0 ? -1 : 0;
}
int n = 0;
while(judge(n) > 0 && ++n);
方法二 : N = Nx10 ,達到 error rate 後退回前一個區間後,用 binary search 尋找答案
int binary_search(int lower, int upper, int (*judge)(int))
{
int mid, decision;
while (lower < upper) {
mid = (upper + lower) / 2;
decision = (*judge)(mid);
if (decision > 0)
lower = mid + 1;
else if (decision < 0)
upper = mid - 1;
else
return mid;
}
return lower;
}
#define abs(num) (num) > 0 ? (num) : -(num)
int convergence(double max_error, int formula_id)
{
if (formula_id > MAX_FORMULA_ID || formula_id < 0) {
return -1;
}
// judge function
int judge(int n) {
double pi = compute_pi_formula(formula_id, n);
double error = abs(pi - M_PI);
double error_diff = error - max_error;
return error_diff > 0 ? 1 : error_diff < 0 ? -1 : 0;
}
long long N = 1;
while (judge(N) > 0)
N *= 10;
if (N == 1) return 1;
return binary_search(N / 10 + 1, N, judge) + 1;
}
set datafile sep ','
plot 'result_clock_gettime.csv' using 1:2 with linespoints \
linewidth 2 title 'baseline', \
plot 'result_clock_gettime.csv' using 1:2 smooth bezier \
linewidth 2 title 'baseline', \
plot 'result_clock_gettime.csv' \
using 2:xtic(1) with histogram title 'baseline', \
Pi algorithms
GPGPU + SIMD
First OpenCL Program
ierosodin 的共筆
Diana Ho 的共筆
why is clcock_gettime so erratic?
gettimeofday() should never be used to measure time
Why is threading increasing execution time ?
Richard's blog
5:00 Chudnovsky 證明
error rate: gmp math (大數運算庫)
改善 error rate計算