Try   HackMD

2016q3 Homework 1 (raytracing)

tags: sysprog21 JongSyuGithub

contributed by <JonSyuGithub>
github: https://github.com/JonSyuGithub/raytracing

Environment

  • OS: Ubuntu 16.04 LTS
  • CPU: i5-457
  • Memory 12GB ($ cat /proc/meminfo)
  • Update & Install
$ sudo apt-get update
$ sudo apt-get install graphviz
$ sudo apt-get install imagemagick

Objective

  • try reducing execution time

Original Version

  • make & run
$ make
$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 2.403777 sec
  • make(+gopro) & run
$ make clean
$ make PROFILE=1
$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 5.059268 sec

gopro計算函數執行指令數量,造成程式執行時間比原先程式未計算指令數量多了約2.6s JonSyuGithub

$ gprof ./raytracing | less
Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 26.50      0.49     0.49 69646433     0.00     0.00  dot_product
 21.09      0.88     0.39 56956357     0.00     0.00  subtract_vector
  9.74      1.06     0.18 13861875     0.00     0.00  rayRectangularIntersection
  8.11      1.21     0.15 17836094     0.00     0.00  add_vector
  6.49      1.33     0.12 13861875     0.00     0.00  raySphereIntersection
  5.41      1.43     0.10 10598450     0.00     0.00  normalize
  4.87      1.52     0.09 31410180     0.00     0.00  multiply_vector
  4.87      1.61     0.09 17821809     0.00     0.00  cross_product
  3.79      1.68     0.07  4620625     0.00     0.00  ray_hit_object
  2.16      1.72     0.04  4221152     0.00     0.00  multiply_vectors
  2.16      1.76     0.04  1048576     0.00     0.00  ray_color
  1.62      1.79     0.03        1     0.03     1.85  raytracing
  1.08      1.81     0.02  2520791     0.00     0.00  idx_stack_top
  1.08      1.83     0.02  2110576     0.00     0.00  compute_specular_diffuse

dot_product、subtract_vecotr呼叫次數最多 JonSyuGithub

  • make(+gopro+modified Makefile) & run
# Rendering scene
Done!
Execution time of raytracing() : 0.623926 sec

修改Makefile的CFLAGS = -std=gnu99 -Wall -Ofast -g經由compiler最佳化,發現執行時間從4.956461 sec降到0.623926 sec,接著嘗試不透過compiler最佳化自己修改程式碼,執行時間可以降到多低! JonSyuGithub

static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; for (int i = 0; i < 3; i++) dp += v1[i] * v2[i]; return dp; } static inline void subtract_vector(const double *a, const double *b, double *out) { for (int i = 0; i < 3; i++) out[i] = a[i] - b[i]; }

Optimal Version

Loop unrolling

static inline double dot_product(const double *v1, const double *v2) { double dp = 0.0; dp = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; return dp; } static inline void subtract_vector(const double *a, const double *b, double *out) { out[0] = a[0] - b[0]; out[1] = a[1] - b[1]; out[2] = a[2] - b[2]; }
# Rendering scene
Done!
Execution time of raytracing() : 4.652799 sec

程式執行時間5.059268 sec降到4.639686 sec,僅減少約0.4 sec JonSyuGithub

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 15.29      0.22     0.22 56956357     0.00     0.00  subtract_vector
 14.94      0.44     0.22 69646433     0.00     0.00  dot_product
 13.90      0.64     0.20 31410180     0.00     0.00  multiply_vector
 11.12      0.80     0.16 10598450     0.00     0.00  normalize
  9.73      0.94     0.14 13861875     0.00     0.00  rayRectangularIntersection
  7.64      1.05     0.11 17836094     0.00     0.00  add_vector
  7.30      1.15     0.11 17821809     0.00     0.00  cross_product

可以發現dot_product執行次數比例從26.5%降到14.94%、substract_vecotr執行次數比例從21.09%降到15.25%,因為沒有for loop不需要進行predict,參考下方unrolling後的組合語言並無jmp
JonSyuGithub

$ objdump -d ~/2016_Fall_Embedded/raytracing/raytracing.o

00000000000001eb <subtract_vector>:
     1eb:	55                   	push   %rbp
     1ec:	48 89 e5             	mov    %rsp,%rbp
     1ef:	48 83 ec 18          	sub    $0x18,%rsp
     1f3:	e8 00 00 00 00       	callq  1f8 <subtract_vector+0xd>
     1f8:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
     1fc:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
     200:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
     204:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
     208:	f2 0f 10 00          	movsd  (%rax),%xmm0
     20c:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
     210:	f2 0f 10 08          	movsd  (%rax),%xmm1
     214:	f2 0f 5c c1          	subsd  %xmm1,%xmm0
     218:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     21c:	f2 0f 11 00          	movsd  %xmm0,(%rax)
     220:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     224:	48 83 c0 08          	add    $0x8,%rax
     228:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
     22c:	48 83 c2 08          	add    $0x8,%rdx
     230:	f2 0f 10 02          	movsd  (%rdx),%xmm0
     234:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
     238:	48 83 c2 08          	add    $0x8,%rdx
     23c:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
     240:	f2 0f 5c c1          	subsd  %xmm1,%xmm0
     244:	f2 0f 11 00          	movsd  %xmm0,(%rax)
     248:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     24c:	48 83 c0 10          	add    $0x10,%rax
     250:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
     254:	48 83 c2 10          	add    $0x10,%rdx
     258:	f2 0f 10 02          	movsd  (%rdx),%xmm0
     25c:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
     260:	48 83 c2 10          	add    $0x10,%rdx
     264:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
     268:	f2 0f 5c c1          	subsd  %xmm1,%xmm0
     26c:	f2 0f 11 00          	movsd  %xmm0,(%rax)
     270:	90                   	nop
     271:	c9                   	leaveq 
     272:	c3                   	retq
0000000000000436 <dot_product>:
     436:	55                   	push   %rbp
     437:	48 89 e5             	mov    %rsp,%rbp
     43a:	48 83 ec 20          	sub    $0x20,%rsp
     43e:	e8 00 00 00 00       	callq  443 <dot_product+0xd>
     443:	48 89 7d e8          	mov    %rdi,-0x18(%rbp)
     447:	48 89 75 e0          	mov    %rsi,-0x20(%rbp)
     44b:	66 0f ef c0          	pxor   %xmm0,%xmm0
     44f:	f2 0f 11 45 f8       	movsd  %xmm0,-0x8(%rbp)
     454:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     458:	f2 0f 10 08          	movsd  (%rax),%xmm1
     45c:	48 8b 45 e0          	mov    -0x20(%rbp),%rax
     460:	f2 0f 10 00          	movsd  (%rax),%xmm0
     464:	f2 0f 59 c8          	mulsd  %xmm0,%xmm1
     468:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     46c:	48 83 c0 08          	add    $0x8,%rax
     470:	f2 0f 10 10          	movsd  (%rax),%xmm2
     474:	48 8b 45 e0          	mov    -0x20(%rbp),%rax
     478:	48 83 c0 08          	add    $0x8,%rax
     47c:	f2 0f 10 00          	movsd  (%rax),%xmm0
     480:	f2 0f 59 c2          	mulsd  %xmm2,%xmm0
     484:	f2 0f 58 c8          	addsd  %xmm0,%xmm1
     488:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
     48c:	48 83 c0 10          	add    $0x10,%rax
     490:	f2 0f 10 10          	movsd  (%rax),%xmm2
     494:	48 8b 45 e0          	mov    -0x20(%rbp),%rax
     498:	48 83 c0 10          	add    $0x10,%rax
     49c:	f2 0f 10 00          	movsd  (%rax),%xmm0
     4a0:	f2 0f 59 c2          	mulsd  %xmm2,%xmm0
     4a4:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
     4a8:	f2 0f 11 45 f8       	movsd  %xmm0,-0x8(%rbp)
     4ad:	f2 0f 10 45 f8       	movsd  -0x8(%rbp),%xmm0
     4b2:	c9                   	leaveq 
     4b3:	c3                   	retq

References

謝謝同學提供disassemble該關鍵字,能夠查詢編譯後的組合語言 JobSyuGithub