tundergod
hw1
2016q3
contributed by <tundergod>
Linux 版本: Ubuntu 16.04 LTS
硬體資訊:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 2
Core(s) per socket: 2
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 69
Model name: Intel(R) Core(TM) i5-4200U CPU @ 1.60GHz
Stepping: 1
CPU MHz: 1661.660
CPU max MHz: 2600.0000
CPU min MHz: 800.0000
BogoMIPS: 4589.54
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 3072K
NUMA node0 CPU(s): 0-3
computepi.c、computepi.h 是主程式
time_test.c 定義了N的大小
benchmark_clock_gettime 利用clock_gettime()計算程式執行時間
Makefile 中使用了-Dmacro依次執行不同的程式
time ./time_test_baseline
N = 1000000000 , pi = 3.141593
11.01user 0.00system 0:11.01elapsed 99%CPU (0avgtext+0avgdata 1812maxresident)k
0inputs+0outputs (0major+85minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 1000000000 , pi = 3.141593
12.22user 0.02system 0:06.14elapsed 199%CPU (0avgtext+0avgdata 1724maxresident)k
0inputs+0outputs (0major+89minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 1000000000 , pi = 3.141593
24.18user 0.00system 0:06.13elapsed 394%CPU (0avgtext+0avgdata 1808maxresident)k
0inputs+0outputs (0major+93minor)pagefaults 0swaps
time ./time_test_avx
N = 1000000000 , pi = 3.141593
3.16user 0.00system 0:03.16elapsed 99%CPU (0avgtext+0avgdata 1932maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 1000000000 , pi = 3.141593
3.06user 0.00system 0:03.06elapsed 99%CPU (0avgtext+0avgdata 1784maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
不是很看得懂上面所顯示數據的後半部分,如果是手動輸入的時間只會有real time,user time,system time,如下:
time ./time_test_baseline
N = 1000000000 , pi = 3.141593
real 0m11.085s
user 0m11.076s
sys 0m0.008s
references:
計算pi的公式:
/*
 * Approximate pi by integrating 1 / (1 + x^2) over [0, 1] with a
 * left-endpoint Riemann sum of N equal-width slices. The exact integral
 * is pi / 4, so the accumulated area is multiplied by 4 before returning.
 *
 * N: number of slices. When N is 0 the loop body never runs and the
 *    function returns 0.0.
 */
double compute_pi_baseline(size_t N)
{
    const double width = 1.0 / N;   /* (b - a) / N with a = 0, b = 1 */
    double area = 0.0;
    size_t k = 0;

    while (k < N) {
        const double t = (double) k / N;   /* left edge of slice k */
        area += width / (1.0 + t * t);
        k++;
    }
    return area * 4.0;
}
/*
 * Estimate pi with the Leibniz series
 *     pi / 4 = 1 - 1/3 + 1/5 - 1/7 + ...
 * by summing a fixed number of terms in index order and printing four
 * times the partial sum.
 */
int main()
{
    const double terms = 500000000;
    double sum = 0.0;
    size_t k = 0;

    while (k < terms) {
        /* even-indexed terms are added, odd-indexed terms are subtracted */
        int sign;
        if (k % 2 == 0) {
            sign = 1;
        } else {
            sign = -1;
        }
        sum += sign / (2.0 * (double) k + 1.0);
        k++;
    }

    printf("pi = %lf\n", sum * 4);
    return 0;
}
/*
 * Estimate pi by Monte Carlo sampling: draw points in the unit square
 * with rand(), count those satisfying x^2 + y^2 <= 1, and print
 * 4 * hits / samples. No srand() call is made in this function.
 */
int main()
{
    const double samples = 500000000;
    int hits = 0;   /* points inside the first-quadrant unit circle */

    for (size_t k = 0; k < samples; ++k) {
        /* x is drawn before y so rand() values are consumed in a fixed order */
        const double px = (double) rand() / RAND_MAX;
        const double py = (double) rand() / RAND_MAX;
        const double dist2 = px * px + py * py;

        hits += (dist2 <= 1);   /* the comparison yields 1 for a hit, 0 otherwise */
    }

    printf("pi = %lf \n", (double) hits / samples * 4);
    return 0;
}
result of monte carlo: pi = 3.141539
time ./time_test_baseline
N = 500000000 , pi = 3.141593
5.76user 0.00system 0:05.76elapsed 99%CPU (0avgtext+0avgdata 1772maxresident)k
0inputs+0outputs (0major+84minor)pagefaults 0swaps
time ./time_test_openmp_2
N = 500000000 , pi = 3.141593
6.16user 0.02system 0:03.13elapsed 197%CPU (0avgtext+0avgdata 1824maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_openmp_4
N = 500000000 , pi = 3.141593
11.43user 0.02system 0:03.16elapsed 362%CPU (0avgtext+0avgdata 1828maxresident)k
0inputs+0outputs (0major+94minor)pagefaults 0swaps
time ./time_test_avx
N = 500000000 , pi = 3.141593
1.77user 0.00system 0:01.77elapsed 99%CPU (0avgtext+0avgdata 1940maxresident)k
0inputs+0outputs (0major+90minor)pagefaults 0swaps
time ./time_test_avxunroll
N = 500000000 , pi = 3.141593
1.69user 0.00system 0:01.69elapsed 99%CPU (0avgtext+0avgdata 1724maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
time ./time_test_leibniz
N = 500000000 , pi = 3.141593
2.88user 0.00system 0:02.88elapsed 99%CPU (0avgtext+0avgdata 1724maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
time ./time_test_leibniz_avx
N = 500000000 , pi = 3.141593
1.85user 0.00system 0:01.85elapsed 99%CPU (0avgtext+0avgdata 1764maxresident)k
0inputs+0outputs (0major+86minor)pagefaults 0swaps
time ./time_test_leibniz_avx_unroll
N = 500000000 , pi = 3.141593
1.54user 0.00system 0:01.54elapsed 99%CPU (0avgtext+0avgdata 1720maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_montecarlo
N = 500000000 , pi = 3.141539
15.24user 0.04system 0:15.29elapsed 99%CPU (0avgtext+0avgdata 1784maxresident)k
0inputs+0outputs (0major+84minor)pagefaults 0swaps
time ./time_test_montecarlo_openmp_2
N = 500000000 , pi = 3.141519
142.99user 186.48system 2:47.89elapsed 196%CPU (0avgtext+0avgdata 1816maxresident)k
0inputs+0outputs (0major+87minor)pagefaults 0swaps
time ./time_test_montecarlo_openmp_4
N = 500000000 , pi = 3.141506
129.12user 254.31system 2:07.48elapsed 300%CPU (0avgtext+0avgdata 1748maxresident)k
0inputs+0outputs (0major+92minor)pagefaults 0swaps
leibniz算式用avx和avx+unrolling做有明顯的效能增強,但是montecarlo利用openmp做卻變得非常緩慢,而且花費最多的反而是system time.
time ./time_test_montecarlo_openmp_4
N = 500000000 , pi = 3.141526
real 2m3.363s
user 2m1.964s
sys 3m58.884s
程式花費的時間分布:system time > real time = user time
/*
 * Monte Carlo sampling loop run under OpenMP with `threads` threads.
 * Each iteration draws x and y with rand() and increments z when
 * x^2 + y^2 <= 1. x, y, z, N and threads are declared outside this fragment.
 *
 * NOTE(review): rand() keeps hidden state shared by all threads; the very
 * high system time measured for this loop is presumably contention on that
 * state. A per-thread generator such as rand_r() with a private seed is the
 * usual remedy; confirm before changing.
 */
#pragma omp parallel num_threads(threads)
{
/* x and y are private per thread; the per-thread z values are summed by the reduction */
#pragma omp for private(x,y) reduction(+:z)
for(size_t i=0; i<N; i++) {
x = (double)rand()/RAND_MAX;
y = (double)rand()/RAND_MAX;
/* count the sample when it satisfies x^2 + y^2 <= 1 */
if (x*x+y*y <= 1) z++;
}
}
數據圖表1(baseline):
數據圖表2(leibniz):
數據圖表3(monte carlo method):
以上數據的sequences爲'seq 100 5000 100100'
pi found online = 3.14159265358979323846
pi define in c = 3.14159265358979311599
可以看到monte carlo method一直都在跳動(隨機分布問題),而baseline和leibniz非常有趣(奇怪)的會以完全相同的誤差進行.並且精準值接近小數6位止(已以趨近水平線的方式延展),在去掉logscale的第4張圖片會更加清楚的看到大約在N=6000的地方兩個演算法都以非常快的速度趨向最大精準值,而在6000之後對於pi精準小數的計算的效能非常差.
Lim Wen Sheng
老師的共筆上說wall-clock time是從xtime上更新的,xtime以微秒microsecond(µs)計算,但是在一些網站看它是以秒來計算
/*
 * Time value expressed as whole seconds plus a nanosecond part.
 * NOTE(review): this looks like the standard struct timespec used by
 * clock_gettime(), reproduced here for reference; confirm against <time.h>.
 */
struct timespec {
time_t tv_sec; /* seconds */
long tv_nsec; /* nanoseconds */
};
/*
 * Resource-usage record: user and system CPU time, memory sizes, page
 * faults, swaps, block I/O, IPC messages, signals and context switches.
 * NOTE(review): this looks like the struct rusage from getrusage(2),
 * reproduced here for reference; confirm against <sys/resource.h>.
 */
struct rusage {
struct timeval ru_utime; /* user CPU time used */
struct timeval ru_stime; /* system CPU time used */
long ru_maxrss; /* maximum resident set size */
long ru_ixrss; /* integral shared memory size */
long ru_idrss; /* integral unshared data size */
long ru_isrss; /* integral unshared stack size */
long ru_minflt; /* page reclaims (soft page faults) */
long ru_majflt; /* page faults (hard page faults) */
long ru_nswap; /* swaps */
long ru_inblock; /* block input operations */
long ru_oublock; /* block output operations */
long ru_msgsnd; /* IPC messages sent */
long ru_msgrcv; /* IPC messages received */
long ru_nsignals; /* signals received */
long ru_nvcsw; /* voluntary context switches */
long ru_nivcsw; /* involuntary context switches */
};
The CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID clocks are realized on
many platforms using timers from the CPUs (TSC on i386, AR.ITC on Itanium).
These registers may differ between CPUs and as a consequence these clocks may
return bogus results if a process is migrated to another CPU.