contributed by <`beieve7028`>
Goal: rebuild the program with `make PROFILE=1`, learn to use gprof, and improve the function implementations in `math-toolkit.h`. (A sketch of what `PROFILE=1` presumably does follows the hardware listing below.)

Development environment (`lscpu`):

```
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    2
Core(s) per socket:    4
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 94
Stepping:              3
CPU MHz:               3491.640
BogoMIPS:              6815.99
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              8192K
NUMA node0 CPU(s):     0-7
```
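For reference, `make PROFILE=1` presumably works through a Makefile conditional that adds gprof's `-pg` instrumentation flag. This is a hypothetical sketch, not the project's actual Makefile:

```make
# Hypothetical sketch: passing PROFILE=1 on the make command line
# compiles and links with -pg, which emits gprof instrumentation.
ifeq ($(strip $(PROFILE)),1)
PROF_FLAGS = -pg
endif
CFLAGS  += $(PROF_FLAGS)
LDFLAGS += $(PROF_FLAGS)
```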
Execution time (baseline):

```
$ git checkout master && make clean && make && ./raytracing
Execution time of raytracing() : 2.168869 sec
```
gprof analysis:

```
$ git checkout master && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls   s/call   s/call  name
 30.02      0.72     0.72 69646433     0.00     0.00  dot_product
 15.01      1.08     0.36 56956357     0.00     0.00  subtract_vector
 10.84      1.34     0.26 13861875     0.00     0.00  rayRectangularIntersection
  8.13      1.54     0.20 31410180     0.00     0.00  multiply_vector
  7.09      1.71     0.17 13861875     0.00     0.00  raySphereIntersection
  4.59      1.82     0.11 17836094     0.00     0.00  add_vector
  4.17      1.92     0.10  4620625     0.00     0.00  ray_hit_object
  3.96      2.01     0.10 17821809     0.00     0.00  cross_product
  3.75      2.10     0.09 10598450     0.00     0.00  normalize
  2.08      2.15     0.05  2110576     0.00     0.00  compute_specular_diffuse
  1.67      2.19     0.04  4221152     0.00     0.00  multiply_vectors
  1.67      2.23     0.04  2110576     0.00     0.00  localColor
  1.25      2.26     0.03  1048576     0.00     0.00  ray_color
  1.25      2.29     0.03        1     0.03     2.39  raytracing
  0.83      2.31     0.02  2558386     0.00     0.00  idx_stack_empty
  0.83      2.33     0.02  1241598     0.00     0.00  refraction
  0.83      2.35     0.02  1204003     0.00     0.00  idx_stack_push
  0.42      2.36     0.01  3838091     0.00     0.00  length
```
The top-ranked functions account for a large share of the runtime, so let's look at how they are implemented.
math-toolkit.h:

```c
static inline
void add_vector(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}
```
All of them are short fixed-count loops, so I applied the loop unrolling technique mentioned in the course notes:

math-toolkit.h:
```c
static inline
void add_vector(const double *a, const double *b, double *out)
{
    out[0] = a[0] + b[0];
    out[1] = a[1] + b[1];
    out[2] = a[2] + b[2];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    out[0] = a[0] - b[0];
    out[1] = a[1] - b[1];
    out[2] = a[2] - b[2];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    out[0] = a[0] * b[0];
    out[1] = a[1] * b[1];
    out[2] = a[2] * b[2];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    out[0] = a[0] * b;
    out[1] = a[1] * b;
    out[2] = a[2] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
}
```
The unrolled version improves the execution time noticeably, saving about 30% of the computation time:

```
$ git checkout loop-unrooling && make clean && make && ./raytracing
Execution time of raytracing() : 1.516588 sec
```
Profiling again with gprof:

```
$ git checkout loop-unrooling && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls   s/call   s/call  name
 20.01      0.33     0.33 69646433     0.00     0.00  dot_product
 15.16      0.58     0.25 56956357     0.00     0.00  subtract_vector
 10.31      0.75     0.17 13861875     0.00     0.00  raySphereIntersection
 10.01      0.92     0.17 13861875     0.00     0.00  rayRectangularIntersection
  9.10      1.07     0.15 10598450     0.00     0.00  normalize
  6.67      1.18     0.11 17821809     0.00     0.00  cross_product
  4.85      1.26     0.08 31410180     0.00     0.00  multiply_vector
  4.25      1.33     0.07  4620625     0.00     0.00  ray_hit_object
  3.34      1.38     0.06  2110576     0.00     0.00  localColor
  3.03      1.43     0.05 17836094     0.00     0.00  add_vector
  3.03      1.48     0.05  3838091     0.00     0.00  length
  3.03      1.53     0.05  1241598     0.00     0.00  refraction
  2.43      1.57     0.04  4221152     0.00     0.00  multiply_vectors
  1.82      1.60     0.03  2110576     0.00     0.00  compute_specular_diffuse
  1.21      1.62     0.02  2520791     0.00     0.00  idx_stack_top
  0.61      1.63     0.01  2558386     0.00     0.00  idx_stack_empty
  0.61      1.64     0.01  1048576     0.00     0.00  ray_color
  0.61      1.65     0.01        1     0.01     1.65  raytracing
```
Note that the ranking of the hot functions has shifted as well.
The video also mentions the compiler's inlining technique, and the GCC manual notes:

> GCC does not inline any functions when not optimizing unless you specify the ‘always_inline’ attribute for the function.

Since the Makefile specifies the optimization options as:

```
CFLAGS = \
    -std=gnu99 -Wall -O0 -g
```

compiler optimization is disabled, so the `inline` hint on the functions:

```c
static inline
```

is ignored. In this case `__attribute__((always_inline))` must be used to force inlining (see the sketch after the measurement below). With forced inlining, the execution time improves by about 10%:
```
$ git checkout inline && make clean && make && ./raytracing
Execution time of raytracing() : 1.967007 sec
```
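A minimal sketch of how the attribute can be applied in `math-toolkit.h`, using `add_vector` as the example (the same pattern applies to the other vector helpers):

```c
/* Force inlining even at -O0: without optimization, GCC ignores a
 * plain "inline" hint unless always_inline is specified. */
static inline __attribute__((always_inline))
void add_vector(const double *a, const double *b, double *out)
{
    out[0] = a[0] + b[0];
    out[1] = a[1] + b[1];
    out[2] = a[2] + b[2];
}
```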
Searching online, I found an article that organizes OpenMP usage quite clearly, including the errors certain usages can trigger, e.g. multiple threads writing to the same variable. It suggests the `private(var)` clause, which gives each thread an independent copy of a variable so that no unexpected errors (race conditions) occur. Another article notes that the `#pragma omp atomic` directive can likewise prevent a variable from being written by several threads at once, as can `#pragma omp for reduction(+:sum)`, where each thread computes its own partial result and the results are combined at the end (both approaches are sketched below).
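A minimal illustrative sketch of the two approaches on a simple shared-sum loop (my own example, not code from this project); compile with `gcc -fopenmp`:

```c
#include <stdio.h>

int main(void)
{
    double data[1000];
    for (int i = 0; i < 1000; i++)
        data[i] = i * 0.5;

    /* Option 1: serialize each update of the shared variable. */
    double sum_atomic = 0.0;
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        #pragma omp atomic
        sum_atomic += data[i];
    }

    /* Option 2: each thread accumulates a private partial sum,
     * which OpenMP combines when the loop ends. */
    double sum_reduction = 0.0;
    #pragma omp parallel for reduction(+:sum_reduction)
    for (int i = 0; i < 1000; i++)
        sum_reduction += data[i];

    printf("%f %f\n", sum_atomic, sum_reduction);
    return 0;
}
```

The reduction form usually scales better, since `atomic` forces every update through a serialized read-modify-write.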
Based on the above, I decided to add `#pragma omp for reduction(+:sum)`-style parallel directives to the `math-toolkit.h` header, though only one function in that header actually has this kind of race-condition problem:
Original version:

```c
static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}
```

With the reduction directive added:

```c
static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(3) reduction(+:dp)
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}
```
This ran into a problem:

```
math-toolkit.h: In function ‘dot_product’:
math-toolkit.h:70:0: warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
 #pragma omp parallel for num_threads(3) reduction(+:dp)
```
The fix is to add `-fopenmp` in the Makefile:

```
CFLAGS = \
    -std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = \
    -lm -fopenmp
```
With that change it compiles and runs. However, the execution time actually increases, and the code below is the likely culprit: the cost of spinning up a parallel region on every call to these three-iteration loops far outweighs any benefit from parallel computation (a microbenchmark isolating this overhead is sketched after the num_threads=1 listing below). With OpenMP instructed to split each loop across 3 threads, the execution time balloons from the 2.168869 sec baseline to 104.624172 sec.

num_threads=3:

```
Execution time of raytracing() : 104.624172 sec
```
```c
static inline
void add_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(3) reduction(+:dp)
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}
```
num_threads=1:

```
Execution time of raytracing() : 30.393997 sec
```
```c
static inline
void add_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(1) reduction(+:dp)
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}
```
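To see the per-call overhead in isolation, here is a hypothetical microbenchmark (my own sketch, not project code; the function names and iteration count are made up) comparing the plain helper against the OpenMP-annotated one. Compile with `gcc -O0 -fopenmp`:

```c
#include <stdio.h>
#include <omp.h>

#define ITERS 1000000

/* Plain three-element addition. */
static void add_vector_serial(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

/* Same loop, but every call creates and joins a thread team. */
static void add_vector_omp(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

int main(void)
{
    double a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, out[3];

    double t0 = omp_get_wtime();
    for (int i = 0; i < ITERS; i++)
        add_vector_serial(a, b, out);
    double t1 = omp_get_wtime();
    for (int i = 0; i < ITERS; i++)
        add_vector_omp(a, b, out);
    double t2 = omp_get_wtime();

    printf("serial: %f sec, openmp: %f sec\n", t1 - t0, t2 - t1);
    return 0;
}
```

On typical hardware the OpenMP variant is orders of magnitude slower here, which matches the slowdown observed above.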
The parallelization target therefore has to be a much larger unit of computation. Tracing the callers of `dot_product` upward led to the `raytracing` function in raytracing.c, which offered a promising spot to optimize. I added an OpenMP parallel directive to the outer pixel loop there, using `private` to give each thread its own copy of the variables declared outside the parallel region and thus avoid race conditions. This cut the time to 0.625264 sec, though that still trails the 0.407022 sec obtained by simply compiling with `-Ofast`.
```c
#pragma omp parallel for private(stk) private(d) private(object_color)
for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) {
        double r = 0, g = 0, b = 0;
        /* MSAA */
        for (int s = 0; s < SAMPLES; s++) {
            idx_stack_init(&stk);
            rayConstruction(d, u, v, w,
                            i * factor + s / factor,
                            j * factor + s % factor,
                            view,
                            width * factor, height * factor);
            if (ray_color(view->vrp, 0.0, d, &stk, rectangulars, spheres,
                          lights, object_color,
                          MAX_REFLECTION_BOUNCES)) {
                r += object_color[0];
                g += object_color[1];
                b += object_color[2];
            } else {
                r += background_color[0];
                g += background_color[1];
                b += background_color[2];
            }
            pixels[((i + (j * width)) * 3) + 0] = r * 255 / SAMPLES;
            pixels[((i + (j * width)) * 3) + 1] = g * 255 / SAMPLES;
            pixels[((i + (j * width)) * 3) + 2] = b * 255 / SAMPLES;
        }
    }
}
```
Adding `__attribute__((always_inline))` on top of the OpenMP change brings the time down to 0.413672 sec, very close to the 0.407022 sec achieved by compiling with `-Ofast`.