2016q3 Homework1 (raytracing)

contributed by <beieve7028>

預期目標

在 GitHub 上 fork raytracing，並思考如何改善程式效能
以 make PROFILE=1 重新編譯程式碼，並且學習 gprof
以 gprof 指出效能瓶頸，並且著手改寫檔案 math-toolkit.h 在內的函式實做
可善用 POSIX Thread, OpenMP, software pipelining, 以及 loop unrolling 一類的技巧來加速程式運作

實驗環境

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    2
Core(s) per socket:    4
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 94
Stepping:              3
CPU MHz:               3491.640
BogoMIPS:              6815.99
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              8192K
NUMA node0 CPU(s):     0-7

gprof分析(baseline)

執行時間：

$ git checkout master && make clean && make && ./raytracing
Execution time of raytracing() : 2.168869 sec

gprof分析：

$ git checkout master && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 30.02      0.72     0.72 69646433     0.00     0.00  dot_product
 15.01      1.08     0.36 56956357     0.00     0.00  subtract_vector
 10.84      1.34     0.26 13861875     0.00     0.00  rayRectangularIntersection
  8.13      1.54     0.20 31410180     0.00     0.00  multiply_vector
  7.09      1.71     0.17 13861875     0.00     0.00  raySphereIntersection
  4.59      1.82     0.11 17836094     0.00     0.00  add_vector
  4.17      1.92     0.10  4620625     0.00     0.00  ray_hit_object
  3.96      2.01     0.10 17821809     0.00     0.00  cross_product
  3.75      2.10     0.09 10598450     0.00     0.00  normalize
  2.08      2.15     0.05  2110576     0.00     0.00  compute_specular_diffuse
  1.67      2.19     0.04  4221152     0.00     0.00  multiply_vectors
  1.67      2.23     0.04  2110576     0.00     0.00  localColor
  1.25      2.26     0.03  1048576     0.00     0.00  ray_color
  1.25      2.29     0.03        1     0.03     2.39  raytracing
  0.83      2.31     0.02  2558386     0.00     0.00  idx_stack_empty
  0.83      2.33     0.02  1241598     0.00     0.00  refraction
  0.83      2.35     0.02  1204003     0.00     0.00  idx_stack_push
  0.42      2.36     0.01  3838091     0.00     0.00  length

可以見到前幾名佔了很大的比例，因此去查看函式內容

math-toolkit.h:




































static inline
void add_vector(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}

可以見到都是迴圈的計算，於是採用解說所提到的loop unrolling技巧進行優化：
math-toolkit.h





































static inline
void add_vector(const double *a, const double *b, double *out)
{
        out[0] = a[0] + b[0];
        out[1] = a[1] + b[1];
        out[2] = a[2] + b[2];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
        out[0] = a[0] - b[0];
        out[1] = a[1] - b[1];
        out[2] = a[2] - b[2];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
        out[0] = a[0] * b[0];
        out[1] = a[1] * b[1];
        out[2] = a[2] * b[2];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
        out[0] = a[0] * b;
        out[1] = a[1] * b;
        out[2] = a[2] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
}

經過優化後的執行時間有明顯改善，節省了30%的計算時間。

$ git checkout loop-unrooling  && make clean && make && ./raytracing
Execution time of raytracing() : 1.516588 sec

這時再以gprof分析：

$ git checkout loop-unrooling  && make clean && make PROFILE=1 && ./raytracing && gprof ./raytracing | less
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 20.01      0.33     0.33 69646433     0.00     0.00  dot_product
 15.16      0.58     0.25 56956357     0.00     0.00  subtract_vector
 10.31      0.75     0.17 13861875     0.00     0.00  raySphereIntersection
 10.01      0.92     0.17 13861875     0.00     0.00  rayRectangularIntersection
  9.10      1.07     0.15 10598450     0.00     0.00  normalize
  6.67      1.18     0.11 17821809     0.00     0.00  cross_product
  4.85      1.26     0.08 31410180     0.00     0.00  multiply_vector
  4.25      1.33     0.07  4620625     0.00     0.00  ray_hit_object
  3.34      1.38     0.06  2110576     0.00     0.00  localColor
  3.03      1.43     0.05 17836094     0.00     0.00  add_vector
  3.03      1.48     0.05  3838091     0.00     0.00  length
  3.03      1.53     0.05  1241598     0.00     0.00  refraction
  2.43      1.57     0.04  4221152     0.00     0.00  multiply_vectors
  1.82      1.60     0.03  2110576     0.00     0.00  compute_specular_diffuse
  1.21      1.62     0.02  2520791     0.00     0.00  idx_stack_top
  0.61      1.63     0.01  2558386     0.00     0.00  idx_stack_empty
  0.61      1.64     0.01  1048576     0.00     0.00  ray_color
  0.61      1.65     0.01        1     0.01     1.65  raytracing

也會發現計算所耗費開銷的排名也有變動。

Inline

在影片中有提到編譯器的inline技巧，而在gcc當中有提到
GCC does not inline any functions when not optimizing unless you specify the ‘always_inline’ attribute for the function.
由於在Makefile裡面我們在編譯指定最佳化的選項為：

CFLAGS = \
	-std=gnu99 -Wall -O0 -g

表示不啟用編譯器的最佳化操作，因此在程式碼當中inline建議會被忽略：

static inline

這時就要使用

__attribute__((always_inline))

來表示要啟用inline，可以見到使用了inline技巧後執行的時間也有改善了10%：

$ git checkout inline && make clean && make && ./raytracing
Execution time of raytracing() : 1.967007 sec

OpenMP

上網有找到這篇把openMP的用法整理蠻清楚的，當中也有提到OpenMP用法某些情況會導致的錯誤，例如多个執行序寫同一個變數時，這篇有提到使用private(var)的指令讓變數互有獨立副本以免發生不預期的錯誤(race conditions)，而在這篇當中有提到也可使用#pragma omp atomic的指示來避免變個被多個執行同時寫入，或是指示#pragma omp for reduction( +:sum)這種各自運算最後才整合的方式。

基於上面的說明，我決定在math-toolkit.h這個標頭檔裡面使用#pragma omp for reduction( +:sum)為基礎的平行指示，不過在該標頭檔當中只有一個方法會有race conditions的問題：








static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}









static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(3) reduction( +:dp) 
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}

出了一點問題：

math-toolkit.h: In function ‘dot_product’:
math-toolkit.h:70:0: warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
     #pragma omp parallel for num_threads(3) reduction( +:dp)

於是在Makefile裡修改為:




CFLAGS = \
	-std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = \
	-lm -fopenmp

就可以順利執行了，不過在執行後會發現執行的時間反而增加，推測是以下的cod所導致，為了使用平行計算而增加的成本不足以抵銷平行計算所帶來的效益，如以下的code而言，我指示openmp分為3個threads去計算，在執行時間則從baseline的2.168869 sec增加到104.624172 sec。

num_threads=3:Execution time of raytracing() : 104.624172 sec









































static inline
void add_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    #pragma omp parallel for num_threads(3)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(3) reduction(+:dp)
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}

num_threads=1:Execution time of raytracing() : 30.393997 sec









































static inline
void add_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] + b[i];
}

static inline
void subtract_vector(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] - b[i];
}

static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b[i];
}

static inline
void multiply_vector(const double *a, double b, double *out)
{
    #pragma omp parallel for num_threads(1)
    for (int i = 0; i < 3; i++)
        out[i] = a[i] * b;
}

static inline
double dot_product(const double *v1, const double *v2)
{
    double dp = 0.0;
    #pragma omp parallel for num_threads(1) reduction(+:dp)
    for (int i = 0; i < 3; i++)
        dp += v1[i] * v2[i];
    return dp;
}

因此現在要平行計算的目標必須放置計算較大的單元區塊，於是從dot_product搜尋呼叫函式，在raytracing.c檔案內的raytracing函式發現一個可以嘗試優化的地方，於是將該處程式增加openMP的平行計算指令，並且以private對平行區塊外的變數進行副本拷貝，以免發生race conditions的錯誤，經過計算後時間縮至0.625264 sec，不過相較於採用-Ofast進行編譯所得的0.407022 sec還有一段差距。





























    #pragma omp parallel for private(stk), private(d) private(object_color)
    for (int j = 0; j < height; j++) {
        for (int i = 0; i < width; i++) {
            double r = 0, g = 0, b = 0;
            /* MSAA */
            for (int s = 0; s < SAMPLES; s++) {
                idx_stack_init(&stk);
                rayConstruction(d, u, v, w,
                                i * factor + s / factor,
                                j * factor + s % factor,
                                view,
                                width * factor, height * factor);
                if (ray_color(view->vrp, 0.0, d, &stk, rectangulars, spheres,
                              lights, object_color,
                              MAX_REFLECTION_BOUNCES)) {
                    r += object_color[0];
                    g += object_color[1];
                    b += object_color[2];
                } else {
                    r += background_color[0];
                    g += background_color[1];
                    b += background_color[2];
                }
                pixels[((i + (j * width)) * 3) + 0] = r * 255 / SAMPLES;
                pixels[((i + (j * width)) * 3) + 1] = g * 255 / SAMPLES;
                pixels[((i + (j * width)) * 3) + 2] = b * 255 / SAMPLES;
            }
        }
    }

若是以openMP加上__attribute__((always_inline))
指示，則能達到0.413672 sec，與編譯最佳化的-Ofast計算所得的0.407022 sec相當接近。

參考資料

A02: raytracing - HackMD