contributed by <Quexint
>
給定一段高數值計算的程式專案,在無損答案精度下,求計算時間最短。
v0
原始版本# make
Execution time of raytracing() : 6.357352 sec
23.57 1.06 1.06 69646433 0.00 0.00 dot_product
11.56 1.58 0.52 56956357 0.00 0.00 subtract_vector
10.45 2.05 0.47 13861875 0.00 0.00 rayRectangularIntersection
v1
優化 dot_product
return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
# make
Execution time of raytracing() : 5.628290 sec
18.55 0.76 0.76 56956357 0.00 0.00 subtract_vector
13.67 1.32 0.56 10598450 0.00 0.00 normalize
10.98 1.77 0.45 69646433 0.00 0.00 dot_product
10.13 2.19 0.42 13861875 0.00 0.00 rayRectangularIntersection
v2x
用 CBLAS 優化 dot_product
(X)sudo pacman -S cblas
Makefile
-lcblas
math-toolkit.h
#include <cblas.h>
#define dot_product(x, y) cblas_ddot(3, x, 1, y, 1)
Execution time of raytracing() : 5.377129 sec
v3
優化 subtract_vector
out[0] = a[0] - b[0];
out[1] = a[1] - b[1];
out[2] = a[2] - b[2];
# make
Execution time of raytracing() : 4.737710 sec
12.40 0.45 0.45 69646433 0.00 0.00 dot_product
11.71 0.87 0.42 56956357 0.00 0.00 subtract_vector
11.29 1.27 0.41 13861875 0.00 0.00 rayRectangularIntersection
10.59 1.65 0.38 10598450 0.00 0.00 normalize
v4
優化 add_vector
out[0] = a[0] + b[0];
out[1] = a[1] + b[1];
out[2] = a[2] + b[2];
# make
Execution time of raytracing() : 4.456836 sec
16.10 0.51 0.51 10598450 0.00 0.00 normalize
14.05 0.96 0.45 69646433 0.00 0.00 dot_product
10.73 1.30 0.34 13861875 0.00 0.00 rayRectangularIntersection
9.47 1.60 0.30 56956357 0.00 0.00 subtract_vector
7.42 1.83 0.24 31410180 0.00 0.00 multiply_vector
6.94 2.05 0.22 4620625 0.00 0.00 ray_hit_object
6.31 2.25 0.20 17821809 0.00 0.00 cross_product
v5x
優化 normalize
(X)static inline float Q_rsqrt( float number )
{
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = number * 0.5F;
y = number;
i = * ( long * ) &y;
i = 0x5f3759df - ( i >> 1 );
y = * ( float * ) &i;
y = y * ( threehalfs - ( x2 * y * y ) );
return y;
}
static inline
void normalize(double *v)
{
double d = Q_rsqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
assert(d != 0.0 && "Error calculating normal");
v[0] *= d;
v[1] *= d;
v[2] *= d;
}
# make
Execution time of raytracing() : 2.345482 sec <- 超快
static inline
void normalize(double *v)
{
double d = 1.0/sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
assert(d != 0.0 && "Error calculating normal");
v[0] *= d;
v[1] *= d;
v[2] *= d;
}
# make
Execution time of raytracing() : 4.386514 sec
static inline double InvSqrt(double number)
{
long long i;
double x2, y;
const double threehalfs = 1.5F;
x2 = number * 0.5F;
y = number;
i = * ( long long * ) &y;
i = 0x5fe6ec85e7de30da - ( i >> 1 );
y = * ( double * ) &i;
y = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration
y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, this can be removed
return y;
}
# make
Execution time of raytracing() : 4.575374 sec
v6
優化 multiply_vectors
out[0] = a[0] * b[0];
out[1] = a[1] * b[1];
out[2] = a[2] * b[2];
# make
Execution time of raytracing() : 4.356295 sec
13.03 0.44 0.44 31410180 0.00 0.00 multiply_vector
12.28 0.85 0.41 10598450 0.00 0.00 normalize
11.68 1.24 0.39 56956357 0.00 0.00 subtract_vector
11.09 1.61 0.37 13861875 0.00 0.00 rayRectangularIntersection
10.79 1.97 0.36 69646433 0.00 0.00 dot_product
v7
優化 multiply_vector
out[0] = a[0] * b;
out[1] = a[1] * b;
out[2] = a[2] * b;
# make
Execution time of raytracing() : 3.968329 sec
13.21 0.40 0.40 10598450 0.00 0.00 normalize
12.88 0.79 0.39 13861875 0.00 0.00 rayRectangularIntersection
12.71 1.18 0.39 69646433 0.00 0.00 dot_product
10.73 1.50 0.33 13861875 0.00 0.00 raySphereIntersection
v8
OpenMP 平行化 (X)sudo pacman -S openmp
Makefile
CFLAGS = \
-std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = \
-lm -lomp
raytracing.c
int HxW = height * width;
int i, j;
#pragma omp parallel for
for(int itr = 0; itr < HxW; itr++) {
i = itr % width;
j = itr / width;
# make
Execution time of raytracing() : 1.581772 sec
17.30 0.89 0.89 10703749 0.00 0.00 subtract_vector
11.53 1.48 0.59 2033242 0.00 0.00 normalize
11.04 2.04 0.57 13231699 0.00 0.00 dot_product
10.65 2.59 0.55 6265867 0.00 0.00 multiply_vector
8.31 3.01 0.43 2719594 0.00 0.00 rayRectangularIntersection
v9
Pthread 平行化Makefile
LDFLAGS = \
-lm -lpthread
raytracing.c
void *work(void *ptr)
{
entry *info = (entry *) ptr;
int i, j;
int HxW = (info->height) * (info->width);
idx_stack stk;
point3 d;
int factor = sqrt(SAMPLES);
for(int itr = (info->idx); itr < HxW; itr+=NUM_THREAD) {
j = itr / (info->width);
i = itr % (info->height);
double r = 0, g = 0, b = 0;
/* MSAA */
for (int s = 0; s < SAMPLES; s++) {
idx_stack_init(&stk);
rayConstruction(d, info->u, info->v, info->w,
i * factor + s / factor,
j * factor + s % factor,
info->view,
(info->width) * factor, (info->height) * factor);
if (ray_color(info->view->vrp, 0.0, d, &stk, info->rectangulars, info->spheres,
info->lights, info->object_color,
MAX_REFLECTION_BOUNCES)) {
r += (info->object_color[0]);
g += (info->object_color[1]);
b += (info->object_color[2]);
} else {
r += (info->background_color[0]);
g += (info->background_color[1]);
b += (info->background_color[2]);
}
info->pixels[((itr) * 3) + 0] = r * 255 / SAMPLES;
info->pixels[((itr) * 3) + 1] = g * 255 / SAMPLES;
info->pixels[((itr) * 3) + 2] = b * 255 / SAMPLES;
}
}
return NULL;
}
/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
rectangular_node rectangulars, sphere_node spheres,
light_node lights, const viewpoint *view,
int width, int height)
{
point3 u, v, w;
color object_color = { 0.0, 0.0, 0.0 };
/* calculate u, v, w */
calculateBasisVectors(u, v, w, view);
pthread_t *thread_array = (pthread_t *) malloc (sizeof(pthread_t) * NUM_THREAD);
entry *message = (entry*) malloc (sizeof(entry) * NUM_THREAD);
for(int i = 0; i < NUM_THREAD; i++) {
message[i].idx = i;
message[i].pixels = pixels;
multiply_vector(background_color, 1.0, message[i].background_color);
message[i].rectangulars = rectangulars;
message[i].spheres = spheres;
message[i].lights = lights;
message[i].view = view;
message[i].width = width;
message[i].height = height;
multiply_vector(u, 1.0, message[i].u);
multiply_vector(v, 1.0, message[i].v);
multiply_vector(w, 1.0, message[i].w);
multiply_vector(object_color, 1.0, message[i].object_color);
pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
pthread_join(thread_array[i], NULL);
}
}
# make
Execution time of raytracing() : 4.010901 sec
raytracing.c
const int NUM_THREAD = 4, NUM_WORK = 100;
pthread_mutex_t thread_mutex;
int jobs;
int max(int a, int b)
{
return a > b ? a : b;
}
void *work(void *ptr)
{
int S, E;
while(1) {
pthread_mutex_lock(&thread_mutex);
if(jobs != 0) {
E = jobs;
jobs = max(jobs - NUM_WORK, 0);
S = jobs;
} else S = E = -1;
pthread_mutex_unlock(&thread_mutex);
if(S == -1) break;
for(int itr = S; itr < E; itr++)
...
}
/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
rectangular_node rectangulars, sphere_node spheres,
light_node lights, const viewpoint *view,
int width, int height)
{
jobs = height * width;
Execution time of raytracing() : 3.977515 sec
for(int i = 0; i < NUM_THREAD; i++) {
pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
}
for(int i = 0; i < NUM_THREAD; i++)
pthread_join(thread_array[i], NULL);
Execution time of raytracing() : 1.002260 sec
Execution time of raytracing() : 0.306669 sec <- 如果再加上 -Ofast 超快
17.48 0.80 0.80 8437099 0.09 0.09 subtract_vector
12.23 1.36 0.56 1561304 0.36 0.36 normalize
11.80 1.90 0.54 10395837 0.05 0.05 dot_product
10.38 2.38 0.48 4785927 0.10 0.10 multiply_vector
v10x
AVX SIMD 加速 (X)新環境_T_v10 / 新環境_T_v9
)Makefile
CFLAGS = \
-std=gnu99 -Wall -O0 -g -mavx
math-toolkit.h
#include <immintrin.h>
static inline
void normalize(double *v)
{
double d = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
assert(d != 0.0 && "Error calculating normal");
double tmp[4] = {d, d, d, 0};
_mm256_store_pd(v, _mm256_div_pd(*(__m256d*)v, *(__m256d*)tmp));
}
static inline
void add_vector(const double *a, const double *b, double *out)
{
_mm256_store_pd(out, _mm256_add_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void subtract_vector(const double *a, const double *b, double *out)
{
_mm256_store_pd(out, _mm256_sub_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vector(const double *a, double b, double *out)
{
double tmp[4] = {b, b, b, 0};
_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)tmp));
}
(0.667676/0.626422) * 1.002260 = 1.068265399
v11x
再戰 OpenMP (X)math-toolkit.h
的小型函式math-toolkit.h
#pragma omp simd
for(int i=0; i < 3; i++)
out[i] = a[i] + b[i];
Execution time of raytracing() : 1.421563 sec
v12
強制開啟 inline
math-toolkit.h
__attribute__((always_inline))
void normalize(double *v)
__attribute__((always_inline))
double length(const double *v)
__attribute__((always_inline))
void add_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void subtract_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vectors(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vector(const double *a, double b, double *out)
__attribute__((always_inline))
void cross_product(const double *v1, const double *v2, double *out)
__attribute__((always_inline))
double dot_product(const double *v1, const double *v2)
__attribute__((always_inline))
void scalar_triple_product(const double *u, const double *v, const double *w,
double *out)
__attribute__((always_inline))
double scalar_triple(const double *u, const double *v, const double *w)
Execution time of raytracing() : 0.876538 sec
v13x
Linux Kernel Hack (X)madvise
/ posix_madvise
建議 Kernel 對於某塊記憶體如何存取。main.c
#include <sys/mman.h>
madvise(pixels, sizeof(unsigned char) * ROWS * COLS * 3, MADV_RANDOM);
MADV_RANDOM
, MADV_WILLNEED
都沒有提高速度。v14x
Lock-Free Programming (X) while((S = jobs) && __sync_bool_compare_and_swap(&jobs, jobs, jobs - NUM_WORK));
v15
用 -Ofast
及 asm
作弊-Ofast
可以將速度提高到 0.300528sec
,所以可以分析一下是怎麼變快的。objdump -d raytracing | vim -
對照有無開啟 -Ofast
add_vector
為例,行數就差好幾倍:0000000000401135 <add_vector>:
401135: 55 push %rbp
401136: 48 89 e5 mov %rsp,%rbp
401139: 48 89 7d f8 mov %rdi,-0x8(%rbp)
40113d: 48 89 75 f0 mov %rsi,-0x10(%rbp)
401141: 48 89 55 e8 mov %rdx,-0x18(%rbp)
401145: 48 8b 45 f8 mov -0x8(%rbp),%rax
401149: f2 0f 10 08 movsd (%rax),%xmm1
40114d: 48 8b 45 f0 mov -0x10(%rbp),%rax
401151: f2 0f 10 00 movsd (%rax),%xmm0
401155: f2 0f 58 c1 addsd %xmm1,%xmm0
401159: 48 8b 45 e8 mov -0x18(%rbp),%rax
40115d: f2 0f 11 00 movsd %xmm0,(%rax)
401161: 48 8b 45 e8 mov -0x18(%rbp),%rax
401165: 48 83 c0 08 add $0x8,%rax
401169: 48 8b 55 f8 mov -0x8(%rbp),%rdx
40116d: 48 83 c2 08 add $0x8,%rdx
401171: f2 0f 10 0a movsd (%rdx),%xmm1
401175: 48 8b 55 f0 mov -0x10(%rbp),%rdx
401179: 48 83 c2 08 add $0x8,%rdx
40117d: f2 0f 10 02 movsd (%rdx),%xmm0
401181: f2 0f 58 c1 addsd %xmm1,%xmm0
401185: f2 0f 11 00 movsd %xmm0,(%rax)
401189: 48 8b 45 e8 mov -0x18(%rbp),%rax
40118d: 48 83 c0 10 add $0x10,%rax
401191: 48 8b 55 f8 mov -0x8(%rbp),%rdx
401195: 48 83 c2 10 add $0x10,%rdx
401199: f2 0f 10 0a movsd (%rdx),%xmm1
40119d: 48 8b 55 f0 mov -0x10(%rbp),%rdx
4011a1: 48 83 c2 10 add $0x10,%rdx
4011a5: f2 0f 10 02 movsd (%rdx),%xmm0
4011a9: f2 0f 58 c1 addsd %xmm1,%xmm0
4011ad: f2 0f 11 00 movsd %xmm0,(%rax)
4011b1: 90 nop
4011b2: 5d pop %rbp
4011b3: c3 retq
0000000000403350 <add_vector>:
403350: f2 0f 10 07 movsd (%rdi),%xmm0
403354: f2 0f 58 06 addsd (%rsi),%xmm0
403358: f2 0f 11 02 movsd %xmm0,(%rdx)
40335c: f2 0f 10 47 08 movsd 0x8(%rdi),%xmm0
403361: f2 0f 58 46 08 addsd 0x8(%rsi),%xmm0
403366: f2 0f 11 42 08 movsd %xmm0,0x8(%rdx)
40336b: f2 0f 10 47 10 movsd 0x10(%rdi),%xmm0
403370: f2 0f 58 46 10 addsd 0x10(%rsi),%xmm0
403375: f2 0f 11 42 10 movsd %xmm0,0x10(%rdx)
40337a: c3 retq
40337b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
-Ofast
的組合語言就可以加速。static void normalize(double *v)
{
const double a = 1.0;
asm("movsd (%rdi),%xmm1;"
"movsd 0x8(%rdi),%xmm2;"
"movsd 0x10(%rdi),%xmm3;"
"mulsd %xmm1,%xmm1;"
"mulsd %xmm2,%xmm2;"
"mulsd %xmm3,%xmm3;"
"addsd %xmm2,%xmm1;"
"addsd %xmm3,%xmm1;"
"sqrtsd %xmm1,%xmm1;"
"divsd %xmm1,%xmm0;"
"movsd (%rdi),%xmm1;"
"movsd 0x8(%rdi),%xmm2;"
"movsd 0x10(%rdi),%xmm3;"
"mulsd %xmm0,%xmm1;"
"mulsd %xmm0,%xmm2;"
"mulsd %xmm0,%xmm3;"
"movsd %xmm1, (%rdi);"
"movsd %xmm2, 0x8(%rdi);"
"movsd %xmm3, 0x10(%rdi);"
);
}
static double length(const double *v)
{
asm("movsd (%rdi),%xmm1;"
"movsd 0x8(%rdi),%xmm0;"
"mulsd %xmm1,%xmm1;"
"mulsd %xmm0,%xmm0;"
"movsd 0x10(%rdi),%xmm2;"
"mulsd %xmm2,%xmm2;"
"addsd %xmm1,%xmm0;"
"addsd %xmm2,%xmm0;"
"sqrtsd %xmm0,%xmm0;");
}
static void add_vector(const double *a, const double *b, double *out)
{
asm("movsd (%rdi),%xmm0;"
"addsd (%rsi),%xmm0;"
"movsd %xmm0,(%rdx);"
"movsd 0x8(%rdi),%xmm0;"
"addsd 0x8(%rsi),%xmm0;"
"movsd %xmm0,0x8(%rdx);"
"movsd 0x10(%rdi),%xmm0;"
"addsd 0x10(%rsi),%xmm0;"
"movsd %xmm0,0x10(%rdx);");
}
static void subtract_vector(const double *a, const double *b, double *out)
{
asm("movsd (%rdi),%xmm0;"
"subsd (%rsi),%xmm0;"
"movsd %xmm0,(%rdx);"
"movsd 0x8(%rdi),%xmm0;"
"subsd 0x8(%rsi),%xmm0;"
"movsd %xmm0,0x8(%rdx);"
"movsd 0x10(%rdi),%xmm0;"
"subsd 0x10(%rsi),%xmm0;"
"movsd %xmm0,0x10(%rdx);");
}
static void multiply_vectors(const double *a, const double *b, double *out)
{
asm("movsd (%rdi),%xmm0;"
"mulsd (%rsi),%xmm0;"
"movsd %xmm0,(%rdx);"
"movsd 0x8(%rdi),%xmm0;"
"mulsd 0x8(%rsi),%xmm0;"
"movsd %xmm0,0x8(%rdx);"
"movsd 0x10(%rdi),%xmm0;"
"mulsd 0x10(%rsi),%xmm0;"
"movsd %xmm0,0x10(%rdx);");
}
static void multiply_vector(const double *a, double b, double *out)
{
asm("movsd (%rdi),%xmm1;"
"mulsd %xmm0,%xmm1;"
"movsd %xmm1,(%rsi);"
"movsd 0x8(%rdi),%xmm1;"
"mulsd %xmm0,%xmm1;"
"movsd %xmm1,0x8(%rsi);"
"mulsd 0x10(%rdi),%xmm0;"
"movsd %xmm0,0x10(%rsi);");
}
static void cross_product(const double *v1, const double *v2, double *out)
{
asm("movsd 0x8(%rdi),%xmm0;"
"movsd 0x10(%rdi),%xmm1;"
"mulsd 0x10(%rsi),%xmm0;"
"mulsd 0x8(%rsi),%xmm1;"
"subsd %xmm1,%xmm0;"
"movsd %xmm0,(%rdx);"
"movsd 0x10(%rdi),%xmm0;"
"movsd (%rdi),%xmm1;"
"mulsd (%rsi),%xmm0;"
"mulsd 0x10(%rsi),%xmm1;"
"subsd %xmm1,%xmm0;"
"movsd %xmm0,0x8(%rdx);"
"movsd (%rdi),%xmm0;"
"movsd 0x8(%rdi),%xmm1;"
"mulsd 0x8(%rsi),%xmm0;"
"mulsd (%rsi),%xmm1;"
"subsd %xmm1,%xmm0;"
"movsd %xmm0,0x10(%rdx);");
}
static double dot_product(const double *v1, const double *v2)
{
asm("movsd (%rdi),%xmm0;"
"movsd 0x8(%rdi),%xmm1;"
"mulsd (%rsi),%xmm0;"
"mulsd 0x8(%rsi),%xmm1;"
"addsd %xmm1,%xmm0;"
"movsd 0x10(%rdi),%xmm1;"
"mulsd 0x10(%rsi),%xmm1;"
"addsd %xmm1,%xmm0;");
}
static void scalar_triple_product(const double *u, const double *v, const double *w,
double *out)
{
asm("movsd 0x8(%rsi),%xmm2;"
"movsd 0x10(%rsi),%xmm0;"
"mulsd 0x10(%rdx),%xmm2;"
"mulsd 0x8(%rdx),%xmm0;"
"subsd %xmm0,%xmm2;"
"movsd %xmm2,(%rcx);"
"movsd 0x10(%rsi),%xmm1;"
"movsd (%rsi),%xmm0;"
"mulsd (%rdx),%xmm1;"
"mulsd 0x10(%rdx),%xmm0;"
"subsd %xmm0,%xmm1;"
"movsd %xmm1,0x8(%rcx);"
"movsd (%rsi),%xmm0;"
"movsd 0x8(%rsi),%xmm3;"
"mulsd 0x8(%rdx),%xmm0;"
"mulsd (%rdx),%xmm3;"
"subsd %xmm3,%xmm0;"
"movsd %xmm0,0x10(%rcx);"
"mulsd (%rdi),%xmm2;"
"movsd %xmm2,(%rcx);"
"mulsd 0x8(%rdi),%xmm1;"
"movsd %xmm1,0x8(%rcx);"
"mulsd 0x10(%rdi),%xmm0;"
"movsd %xmm0,0x10(%rcx);");
}
static double scalar_triple(const double *u, const double *v, const double *w)
{
asm("movsd 0x8(%rdx),%xmm3;"
"movsd 0x10(%rdx),%xmm2;"
"movsd 0x10(%rdi),%xmm4;"
"movapd %xmm3,%xmm0;"
"movsd 0x8(%rdi),%xmm1;"
"movsd (%rdi),%xmm5;"
"movapd %xmm2,%xmm7;"
"movsd (%rdx),%xmm6;"
"mulsd %xmm1,%xmm7;"
"mulsd %xmm4,%xmm0;"
"mulsd %xmm5,%xmm2;"
"mulsd %xmm6,%xmm4;"
"mulsd %xmm6,%xmm1;"
"subsd %xmm7,%xmm0;"
"mulsd %xmm5,%xmm3;"
"subsd %xmm4,%xmm2;"
"mulsd (%rsi),%xmm0;"
"subsd %xmm3,%xmm1;"
"mulsd 0x8(%rsi),%xmm2;"
"mulsd 0x10(%rsi),%xmm1;"
"addsd %xmm2,%xmm0;"
"addsd %xmm1,%xmm0;");
}
Execution time of raytracing() : 0.696097 sec
-Ofast
的 0.3 秒,可能還要重構 raytracing.c
了。or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up