Try   HackMD

2016q3 Homework1 (raytracing)

contributed by <beieve7028>


預期目標

  • 在 GitHub 上 fork raytracing,並思考如何改善程式效能
  • 以 make PROFILE=1 重新編譯程式碼,並且學習 gprof
  • 以 gprof 指出效能瓶頸,並且著手改寫檔案 math-toolkit.h 在內的函式實做
  • 可善用 POSIX Thread, OpenMP, software pipelining, 以及 loop unrolling 一類的技巧來加速程式運作

實驗環境

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    2
Core(s) per socket:    4
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 94
Stepping:              3
CPU MHz:               3491.640
BogoMIPS:              6815.99
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              8192K
NUMA node0 CPU(s):     0-7

gprof分析(baseline)

執行時間:

$ git checkout master && make clean && make && ./raytracing
Execution time of raytracing() : 2.168869 sec

gprof分析:

$ git checkout master && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 30.02      0.72     0.72 69646433     0.00     0.00  dot_product
 15.01      1.08     0.36 56956357     0.00     0.00  subtract_vector
 10.84      1.34     0.26 13861875     0.00     0.00  rayRectangularIntersection
  8.13      1.54     0.20 31410180     0.00     0.00  multiply_vector
  7.09      1.71     0.17 13861875     0.00     0.00  raySphereIntersection
  4.59      1.82     0.11 17836094     0.00     0.00  add_vector
  4.17      1.92     0.10  4620625     0.00     0.00  ray_hit_object
  3.96      2.01     0.10 17821809     0.00     0.00  cross_product
  3.75      2.10     0.09 10598450     0.00     0.00  normalize
  2.08      2.15     0.05  2110576     0.00     0.00  compute_specular_diffuse
  1.67      2.19     0.04  4221152     0.00     0.00  multiply_vectors
  1.67      2.23     0.04  2110576     0.00     0.00  localColor
  1.25      2.26     0.03  1048576     0.00     0.00  ray_color
  1.25      2.29     0.03        1     0.03     2.39  raytracing
  0.83      2.31     0.02  2558386     0.00     0.00  idx_stack_empty
  0.83      2.33     0.02  1241598     0.00     0.00  refraction
  0.83      2.35     0.02  1204003     0.00     0.00  idx_stack_push
  0.42      2.36     0.01  3838091     0.00     0.00  length

可以見到前幾名佔了很大的比例,因此去查看函式內容

math-toolkit.h:

static inline void add_vector(const double *a, const double *b, double *out) { for (int i = 0; i < 3; i++) out[i] = a[i] + b[i]; } static inline void subtract_vector(const double *a, const double *b, double *out) { for (int i = 0; i < 3; i++) out[i] = a[i] - b[i]; } static inline void multiply_vectors(const double *a, const double *b, double *out) { for (int i = 0; i < 3; i++) out[i] = a[i] * b[i]; } static inline void multiply_vector(const double *a, double b, double *out) { for (int i = 0; i < 3; i++) out[i] = a[i] * b; } static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; }

可以見到都是迴圈的計算,於是採用解說所提到的loop unrolling技巧進行優化:
math-toolkit.h

static inline void add_vector(const double *a, const double *b, double *out) { out[0] = a[0] + b[0]; out[1] = a[1] + b[1]; out[2] = a[2] + b[2]; } static inline void subtract_vector(const double *a, const double *b, double *out) { out[0] = a[0] - b[0]; out[1] = a[1] - b[1]; out[2] = a[2] - b[2]; } static inline void multiply_vectors(const double *a, const double *b, double *out) { out[0] = a[0] * b[0]; out[1] = a[1] * b[1]; out[2] = a[2] * b[2]; } static inline void multiply_vector(const double *a, double b, double *out) { out[0] = a[0] * b; out[1] = a[1] * b; out[2] = a[2] * b; } static inline double dot_product(const double *v1, const double *v2) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; }

經過優化後的執行時間有明顯改善,節省了30%的計算時間。

$ git checkout loop-unrooling  && make clean && make && ./raytracing
Execution time of raytracing() : 1.516588 sec

這時再以gprof分析:

$ git checkout loop-unrooling  && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 20.01      0.33     0.33 69646433     0.00     0.00  dot_product
 15.16      0.58     0.25 56956357     0.00     0.00  subtract_vector
 10.31      0.75     0.17 13861875     0.00     0.00  raySphereIntersection
 10.01      0.92     0.17 13861875     0.00     0.00  rayRectangularIntersection
  9.10      1.07     0.15 10598450     0.00     0.00  normalize
  6.67      1.18     0.11 17821809     0.00     0.00  cross_product
  4.85      1.26     0.08 31410180     0.00     0.00  multiply_vector
  4.25      1.33     0.07  4620625     0.00     0.00  ray_hit_object
  3.34      1.38     0.06  2110576     0.00     0.00  localColor
  3.03      1.43     0.05 17836094     0.00     0.00  add_vector
  3.03      1.48     0.05  3838091     0.00     0.00  length
  3.03      1.53     0.05  1241598     0.00     0.00  refraction
  2.43      1.57     0.04  4221152     0.00     0.00  multiply_vectors
  1.82      1.60     0.03  2110576     0.00     0.00  compute_specular_diffuse
  1.21      1.62     0.02  2520791     0.00     0.00  idx_stack_top
  0.61      1.63     0.01  2558386     0.00     0.00  idx_stack_empty
  0.61      1.64     0.01  1048576     0.00     0.00  ray_color
  0.61      1.65     0.01        1     0.01     1.65  raytracing

也會發現計算所耗費開銷的排名也有變動。


Inline

影片中有提到編譯器的inline技巧,而在gcc當中有提到
GCC does not inline any functions when not optimizing unless you specify the ‘always_inline’ attribute for the function.
由於在Makefile裡面我們在編譯指定最佳化的選項為:

CFLAGS = \
	-std=gnu99 -Wall -O0 -g

表示不啟用編譯器的最佳化操作,因此在程式碼當中inline建議會被忽略:

static inline

這時就要使用

__attribute__((always_inline))

來表示要啟用inline,可以見到使用了inline技巧後執行的時間也有改善了10%:

$ git checkout inline && make clean && make && ./raytracing
Execution time of raytracing() : 1.967007 sec

OpenMP

上網有找到這篇把openMP的用法整理蠻清楚的,當中也有提到OpenMP用法某些情況會導致的錯誤,例如多个執行序寫同一個變數時,這篇有提到使用private(var)的指令讓變數互有獨立副本以免發生不預期的錯誤(race conditions),而在這篇當中有提到也可使用#pragma omp atomic的指示來避免變個被多個執行同時寫入,或是指示#pragma omp for reduction( +:sum)這種各自運算最後才整合的方式。


基於上面的說明,我決定在math-toolkit.h這個標頭檔裡面使用#pragma omp for reduction( +:sum)為基礎的平行指示,不過在該標頭檔當中只有一個方法會有race conditions的問題:

static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; }
static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; #pragma omp parallel for num_threads(3) reduction( +:dp) for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; }

出了一點問題:

math-toolkit.h: In function ‘dot_product’:
math-toolkit.h:70:0: warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
     #pragma omp parallel for num_threads(3) reduction( +:dp)

於是在Makefile裡修改為:

CFLAGS = \ -std=gnu99 -Wall -O0 -g -fopenmp LDFLAGS = \ -lm -fopenmp

就可以順利執行了,不過在執行後會發現執行的時間反而增加,推測是以下的cod所導致,為了使用平行計算而增加的成本不足以抵銷平行計算所帶來的效益,如以下的code而言,我指示openmp分為3個threads去計算,在執行時間則從baseline的2.168869 sec增加到104.624172 sec

num_threads=3:Execution time of raytracing() : 104.624172 sec

static inline void add_vector(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(3) for (int i = 0; i < 3; i++) out[i] = a[i] + b[i]; } static inline void subtract_vector(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(3) for (int i = 0; i < 3; i++) out[i] = a[i] - b[i]; } static inline void multiply_vectors(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(3) for (int i = 0; i < 3; i++) out[i] = a[i] * b[i]; } static inline void multiply_vector(const double *a, double b, double *out) { #pragma omp parallel for num_threads(3) for (int i = 0; i < 3; i++) out[i] = a[i] * b; } static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; #pragma omp parallel for num_threads(3) reduction(+:dp) for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; }

num_threads=1:Execution time of raytracing() : 30.393997 sec

static inline void add_vector(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(1) for (int i = 0; i < 3; i++) out[i] = a[i] + b[i]; } static inline void subtract_vector(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(1) for (int i = 0; i < 3; i++) out[i] = a[i] - b[i]; } static inline void multiply_vectors(const double *a, const double *b, double *out) { #pragma omp parallel for num_threads(1) for (int i = 0; i < 3; i++) out[i] = a[i] * b[i]; } static inline void multiply_vector(const double *a, double b, double *out) { #pragma omp parallel for num_threads(1) for (int i = 0; i < 3; i++) out[i] = a[i] * b; } static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; #pragma omp parallel for num_threads(1) reduction(+:dp) for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; }

因此現在要平行計算的目標必須放置計算較大的單元區塊,於是從dot_product搜尋呼叫函式,在raytracing.c檔案內的raytracing函式發現一個可以嘗試優化的地方,於是將該處程式增加openMP的平行計算指令,並且以private對平行區塊外的變數進行副本拷貝,以免發生race conditions的錯誤,經過計算後時間縮至0.625264 sec,不過相較於採用-Ofast進行編譯所得的0.407022 sec還有一段差距。

#pragma omp parallel for private(stk), private(d) private(object_color) for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { double r = 0, g = 0, b = 0; /* MSAA */ for (int s = 0; s < SAMPLES; s++) { idx_stack_init(&stk); rayConstruction(d, u, v, w, i * factor + s / factor, j * factor + s % factor, view, width * factor, height * factor); if (ray_color(view->vrp, 0.0, d, &stk, rectangulars, spheres, lights, object_color, MAX_REFLECTION_BOUNCES)) { r += object_color[0]; g += object_color[1]; b += object_color[2]; } else { r += background_color[0]; g += background_color[1]; b += background_color[2]; } pixels[((i + (j * width)) * 3) + 0] = r * 255 / SAMPLES; pixels[((i + (j * width)) * 3) + 1] = g * 255 / SAMPLES; pixels[((i + (j * width)) * 3) + 2] = b * 255 / SAMPLES; } } }

若是以openMP加上__attribute__((always_inline))
指示,則能達到0.413672 sec,與編譯最佳化的-Ofast計算所得的0.407022 sec相當接近。


參考資料