contributed by <kevinbird61>
kevin@kevin-X450JF:[~/workspace/raytracing]$ make PROFILE=0
cc -std=gnu99 -Wall -O0 -g -c -o objects.o objects.c
cc -std=gnu99 -Wall -O0 -g -c -o raytracing.o raytracing.c
cc -std=gnu99 -Wall -O0 -g -c -o main.o main.c
cc -o raytracing objects.o raytracing.o main.o -lm
kevin@kevin-X450JF:[~/workspace/raytracing]$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 2.625426 sec
kevin@kevin-X450JF:[~/workspace/raytracing]$ make PROFILE=1
cc -std=gnu99 -Wall -O0 -g -pg -c -o objects.o objects.c
cc -std=gnu99 -Wall -O0 -g -pg -c -o raytracing.o raytracing.c
cc -std=gnu99 -Wall -O0 -g -pg -c -o main.o main.c
cc -o raytracing objects.o raytracing.o main.o -lm -pg
kevin@kevin-X450JF:[~/workspace/raytracing]$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 5.403366 sec
gprof ./raytracing | less
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
22.61 0.47 0.47 69646433 0.00 0.00 dot_product
17.80 0.84 0.37 56956357 0.00 0.00 subtract_vector
9.62 1.04 0.20 31410180 0.00 0.00 multiply_vector
8.66 1.22 0.18 13861875 0.00 0.00 raySphereIntersection
8.18 1.39 0.17 13861875 0.00 0.00 rayRectangularIntersection
7.70 1.55 0.16 10598450 0.00 0.00 normalize
7.46 1.71 0.16 17836094 0.00 0.00 add_vector
6.73 1.85 0.14 4620625 0.00 0.00 ray_hit_object
...
可以看到 dot_product 的呼叫次數最多(69646433 次),佔用的時間比例也最高,因此從這裡開始改起。
kevinbird61 Sun, Jun 28, 2015 9:59 PM
/*
 * Dot product of two 3-component vectors.
 *
 * v1, v2: pointers to arrays of doubles; only elements 0..2 are read.
 * Returns v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2], added onto a 0.0 seed.
 */
static inline
double dot_product(const double *v1, const double *v2)
{
    /* Sum the per-component products left to right (x, then y, then z). */
    const double sum = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];

    /* Keep the explicit zero seed so the result matches the accumulator form. */
    return 0.0 + sum;
}
# Rendering scene
Done!
Execution time of raytracing() : 4.941005 sec
執行時間由 5.403366 sec 降到 4.941005 sec
gprof ./raytracing | less
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
20.27 0.31 0.31 56956357 0.00 0.00 subtract_vector
13.08 0.51 0.20 31410180 0.00 0.00 multiply_vector
11.77 0.69 0.18 69646433 0.00 0.00 dot_product
10.46 0.85 0.16 10598450 0.00 0.00 normalize
8.50 0.98 0.13 13861875 0.00 0.00 raySphereIntersection
7.19 1.09 0.11 17836094 0.00 0.00 add_vector
...
0000000000401375 <dot_product>:
static inline
double dot_product(const double *v1, const double *v2)
{
401375: 55 push %rbp
401376: 48 89 e5 mov %rsp,%rbp
401379: 48 89 7d e8 mov %rdi,-0x18(%rbp)
40137d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
double dp = 0.0;
401381: 66 0f ef c0 pxor %xmm0,%xmm0
401385: f2 0f 11 45 f8 movsd %xmm0,-0x8(%rbp)
/*dp = dp + (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);*/
for (int i = 0; i < 3; i++)
40138a: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%rbp)
401391: eb 46 jmp 4013d9 <dot_product+0x64>
dp += v1[i] * v2[i];
401393: 8b 45 f4 mov -0xc(%rbp),%eax
...
0000000000401375 <dot_product>:
static inline
double dot_product(const double *v1, const double *v2)
{
401375: 55 push %rbp
401376: 48 89 e5 mov %rsp,%rbp
401379: 48 89 7d e8 mov %rdi,-0x18(%rbp)
40137d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
double dp = 0.0;
401381: 66 0f ef c0 pxor %xmm0,%xmm0
401385: f2 0f 11 45 f8 movsd %xmm0,-0x8(%rbp)
dp = dp + (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
40138a: 48 8b 45 e8 mov -0x18(%rbp),%rax
40138e: f2 0f 10 08 movsd (%rax),%xmm1
401392: 48 8b 45 e0 mov -0x20(%rbp),%rax
401396: f2 0f 10 00 movsd (%rax),%xmm0
40139a: f2 0f 59 c8 mulsd %xmm0,%xmm1
40139e: 48 8b 45 e8 mov -0x18(%rbp),%rax
4013a2: 48 83 c0 08 add $0x8,%rax
4013a6: f2 0f 10 10 movsd (%rax),%xmm2
4013aa: 48 8b 45 e0 mov -0x20(%rbp),%rax
4013ae: 48 83 c0 08 add $0x8,%rax
4013b2: f2 0f 10 00 movsd (%rax),%xmm0
4013b6: f2 0f 59 c2 mulsd %xmm2,%xmm0
4013ba: f2 0f 58 c8 addsd %xmm0,%xmm1
4013be: 48 8b 45 e8 mov -0x18(%rbp),%rax
4013c2: 48 83 c0 10 add $0x10,%rax
4013c6: f2 0f 10 10 movsd (%rax),%xmm2
...
=> 就運算上,乘法與加法的次數並不會減少,省下的是每次呼叫 dot_product 時迴圈所產生的 jmp
=> 觀察產生的assembly大小:
kevin@kevin-QX-350-Series:[~/workspace/raytracing]$ ls -l
總計 1276
-rw-rw-r-- 1 kevin kevin 156 9月 27 11:38 AUTHORS
-rw-rw-r-- 1 kevin kevin 786447 9月 27 11:38 baseline.ppm
-rw-rw-r-- 1 kevin kevin 167972 9月 27 12:37 disassembly.dump
-rw-rw-r-- 1 kevin kevin 167992 9月 27 12:34 disassembly_opt.dump
...
(新電腦環境下的執行情形,已加上先前loop unrolling):
kevin@kevin-QX-350-Series:[~/workspace/raytracing]$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 2.467720 sec
(加上 __attribute__((always_inline)) 後):
kevin@kevin-QX-350-Series:[~/workspace/raytracing]$ ./raytracing
# Rendering scene
Done!
Execution time of raytracing() : 2.397503 sec
減少0.070217秒的執行時間
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls s/call s/call name
40.25 0.87 0.87 13861875 0.00 0.00 rayRectangularIntersection
16.05 1.21 0.35 13861875 0.00 0.00 raySphereIntersection
15.35 1.54 0.33 2110576 0.00 0.00 compute_specular_diffuse
6.98 1.69 0.15 2110576 0.00 0.00 localColor
6.51 1.83 0.14 1048576 0.00 0.00 ray_color
6.51 1.97 0.14 4620625 0.00 0.00 ray_hit_object
2.79 2.03 0.06 1048576 0.00 0.00 rayConstruction
1.86 2.07 0.04 1 0.04 2.15 raytracing
1.40 2.10 0.03 1241598 0.00 0.00 reflection
...