contributed by < hunng
>
heathcliffYang
經過$ make
$ ./raytracing
後
# Rendering scene
Done!
Execution time of raytracing() : 3.313914 sec
輸出圖檔(利用 convert 轉換為 png)
利用 $ make check
驗證結果
check 的內容為再執行一次後,比較正確輸出與剛剛程式的輸出
check: $(EXEC)
@./$(EXEC) && diff -u baseline.ppm out.ppm || (echo Fail; exit)
@echo "Verified OK"
# Rendering scene
Done!
Execution time of raytracing() : 3.328869 sec
Verified OK
-pg
利用 $ make PROFILE=1
增加 compiler 的 flag -pg
(用於 gprof 分析)
note from GNU make
ifeq ($(strip $(PROFILE)),1)
PROF_FLAGS = -pg
CFLAGS += $(PROF_FLAGS)
LDFLAGS += $(PROF_FLAGS)
endif
-pg
-> w/ -pg
)
$ make check
# Rendering scene
Done!
Execution time of raytracing() : 6.734041 sec
Verified OK
在開啟 -Ofast
後 ( with-pg
)
$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 0.841584 sec
pip install gprof2dot
)gprof ./raytracing | gprof2dot | dot -Tpng -o callgraph.png
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
21.36 0.57 0.57 56956357 0.00 0.00 subtract_vector
20.99 1.13 0.56 69646433 0.00 0.00 dot_product
9.56 1.39 0.26 31410180 0.00 0.00 multiply_vector
8.99 1.63 0.24 17836094 0.00 0.00 add_vector
7.87 1.84 0.21 10598450 0.00 0.00 normalize
6.56 2.01 0.18 13861875 0.00 0.00 rayRectangularIntersection
6.37 2.18 0.17 4620625 0.00 0.00 ray_hit_object
6.18 2.35 0.17 13861875 0.00 0.00 raySphereIntersection
5.25 2.49 0.14 17821809 0.00 0.00 cross_product
2.25 2.55 0.06 1048576 0.00 0.00 ray_color
2.06 2.60 0.06 4221152 0.00 0.00 multiply_vectors
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
16.58 0.29 0.29 69646433 0.00 0.00 dot_product
15.72 0.57 0.28 56956357 0.00 0.00 subtract_vector
9.15 0.73 0.16 17821809 0.00 0.00 cross_product
8.29 0.87 0.15 13861875 0.00 0.00 rayRectangularIntersection
7.43 1.00 0.13 17836094 0.00 0.00 add_vector
7.43 1.13 0.13 4620625 0.00 0.00 ray_hit_object
6.29 1.24 0.11 31410180 0.00 0.00 multiply_vector
5.72 1.34 0.10 10598450 0.00 0.00 normalize
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
34.46 0.72 0.72 13861875 0.00 0.00 rayRectangularIntersection
23.45 1.21 0.49 13861875 0.00 0.00 raySphereIntersection
10.53 1.43 0.22 2110576 0.00 0.00 compute_specular_diffuse
9.57 1.63 0.20 1048576 0.00 0.00 ray_color
7.18 1.78 0.15 4620625 0.00 0.00 ray_hit_object
5.74 1.90 0.12 2110576 0.00 0.00 localColor
2.39 1.95 0.05 1241598 0.00 0.00 refraction
2.39 2.00 0.05 1048576 0.00 0.00 rayConstruction
1.44 2.03 0.03 1241598 0.00 0.00 reflection
1.44 2.06 0.03 1 0.03 2.09 raytracing
先在在 makefile 中添加套用 omp 的 flags
CFLAGS = -std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = -lm -fopenmp
需要注意假如只有在 CFLAGS 添加程式在進行連結時會連結不到 openmp 的程式碼,會有找不到函式的情況發生
觀察 main.c 後發現此程式中的計算都集中在以下程式碼中
raytracing(pixels, background, rectangulars, spheres, lights, &view, ROWS, COLS);
而在 raytracing.c 中又以其中的 for 迴圈佔最大的部份
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
double r = 0, g = 0, b = 0;
/* MSAA */
for (int s = 0; s < SAMPLES; s++) {
idx_stack_init(&stk);
rayConstruction(d, u, v, w,
i * factor + s / factor,
j * factor + s % factor,
view,
width * factor, height * factor);
if (ray_color(view->vrp, 0.0, d, &stk, rectangulars, spheres,
lights, object_color,
MAX_REFLECTION_BOUNCES)) {
r += object_color[0];
g += object_color[1];
b += object_color[2];
} else {
r += background_color[0];
g += background_color[1];
b += background_color[2];
}
pixels[((i + (j * width)) * 3) + 0] = r * 255 / SAMPLES;
pixels[((i + (j * width)) * 3) + 1] = g * 255 / SAMPLES;
pixels[((i + (j * width)) * 3) + 2] = b * 255 / SAMPLES;
}
}
}
第一個想法便是先拆解最外層的迴圈 for (int j = 0; j < height; j++)
利用 openmp 已定義的一些函式改寫如下
int my_rank = omp_get_thread_num();
int thread_count = omp_get_num_threads();
int range = height / thread_count;
int start = my_rank * range;
int end = start + range;
for (int j = start; j < end; j++) { /* same as before */ }
$ make check
# Rendering scene
Done!
Execution time of raytracing() : 1.648855 sec
Verified OK
接續分解 for (int i = 0; i < width; i++)
跟上述做法差不多
if(thread_count < 512) {
range = height / thread_count;
start = my_rank * range;
end = start + range;
i_start = 0;
i_end = 512;
}
else {
layer = thread_count / 512;
range = 1;
start = (my_rank - my_rank%layer)/layer;
end = start + range;
i_range = 512 / layer;
i_start = my_rank % layer * i_range;
i_end = i_start + i_range;
}
但實際使用 1024 以上的 thread 數卻沒有提昇的效果,甚至還比當初的慢,原因有:
比較圖
將 openmp 的方法利用 pthread 寫寫看
首先因為 pthread 在叫用程式時只能輸入一個參數,所以將原程式的輸入全部改寫入一個 struct
typedef struct __INPUT_NEED {
uint8_t *pixels;
color background_color;
rectangular_node rectangulars;
sphere_node spheres;
light_node lights;
const viewpoint *view;
int width;
int height;
int rank;
int pthread_count;
} inputneed;
並增加另一個程式,方便處理每個 thread 的工作範圍及叫用原 raytracing 程式
void *raytracingWS(inputneed *input)
{
printf("i am %d thread\n", input->rank);
int my_rank = input->rank;
int range = input->height / input->pthread_count;
int start = my_rank * range;
int end = start + range;
raytracing(input->pixels, input->background_color,
input->rectangulars, input->spheres,
input->lights, input->view,
input->width, input->height,
start, end);
return NULL;
}
原先想說利用一個複製的 input 便能將所有 thread 的所需滿足
只需要在每次 pthread_create 前更改其內部的 rank,就能讓每次的輸入不同
但因為無法保證 main 能在 thread 能取出其 rank 後才修改
所以會有某些 rank 被重複做了,有些反而沒做到 –> 輸出錯誤
參考ierosodin改寫後
#ifdef PTR
/* prepared pthread needed information */
int pthread_count = 4;
pthread_t *thread_handles;
thread_handles = malloc(pthread_count * sizeof(pthread_t));
inputneed **need = malloc(pthread_count * sizeof(inputneed *));
for (int i = 0; i < pthread_count; i++) {
need[i] = malloc(sizeof(inputneed));
need[i]->pthread_count = pthread_count;
need[i]->pixels = pixels;
need[i]->background_color[0] = background[0];
need[i]->background_color[1] = background[1];
need[i]->background_color[2] = background[2];
need[i]->rectangulars = rectangulars;
need[i]->spheres = spheres;
need[i]->lights = lights;
need[i]->view = &view;
need[i]->width = 512;
need[i]->height = 512;
need[i]->rank = i;
}
#endif
此方法雖然能順利解決讓所有 thread 拿到不同範圍的問題,
但也使用了過多的空間儲存相同的資料(除了 rank 以外的所有資料都是),
需在思考: 有沒有能重複利用的方法