Try   HackMD

2016q3 Homework1 (raytracing)

contributed by <Quexint>

報告

大綱

環境

  • OS: Arch Linux 4.7.4-1-ARCH
  • CPU: Intel® Core2 Quad CPU @ 2.40GHz
  • MEM: 4G
  • GCC: 6.2.1

規格

給定一段高數值計算的程式專案,在無損答案精度下,求計算時間最短。

實驗

  • Loop Unrolling
    • dot_product
    • add_vector
    • substruct_vector
    • multiply_vector
    • multiply_vectors
  • 函式優化
    • CBLAS on dot_product
    • Inverse Square Root on normalize
  • 平行化
    • 平行化模型
      • Jobs/Worker
      • Lock-Free Programming
    • OMP on Raytracing
    • OMP on Math-Toolkit
    • Pthread on Raytraing
    • AVX SIMD on Math-Toolkie
  • 編譯器
    • Memory Advise
    • inline on Math-Toolkit
    • asm on Math-Toolkit

結果

開發記錄

v0 原始版本

  • 執行時間
# make
Execution time of raytracing() : 6.357352 sec
  • 評測
 23.57      1.06     1.06 69646433     0.00     0.00  dot_product
 11.56      1.58     0.52 56956357     0.00     0.00  subtract_vector
 10.45      2.05     0.47 13861875     0.00     0.00  rayRectangularIntersection

v1 優化 dot_product

  • 使用 Loop Unrolling 優化
return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
  • 執行時間
# make
Execution time of raytracing() : 5.628290 sec
  • 評測
 18.55      0.76     0.76 56956357     0.00     0.00  subtract_vector
 13.67      1.32     0.56 10598450     0.00     0.00  normalize
 10.98      1.77     0.45 69646433     0.00     0.00  dot_product
 10.13      2.19     0.42 13861875     0.00     0.00  rayRectangularIntersection

v2x 用 CBLAS 優化 dot_product (X)

  • 參考
  • 以前寫過一點數值的程式,有用過數值的函式庫CBLAS。
    • BLAS for Pascal, CBLAS for C
  • CBLAS 有 LEVEL1 (向量操作), 2(矩陣-向量操作), 3(矩陣-矩陣操作)
  • sudo pacman -S cblas
  • Makefile
-lcblas
  • math-toolkit.h
#include <cblas.h>
#define dot_product(x, y) cblas_ddot(3, x, 1, y, 1)
  • 執行時間
Execution time of raytracing() : 5.377129 sec
  • 沒有顯著的提升

v3 優化 subtract_vector

  • 使用 Loop Unrolling 優化
out[0] = a[0] - b[0];
out[1] = a[1] - b[1];
out[2] = a[2] - b[2];
  • 執行時間
# make
Execution time of raytracing() : 4.737710 sec
  • 評測
 12.40      0.45     0.45 69646433     0.00     0.00  dot_product
 11.71      0.87     0.42 56956357     0.00     0.00  subtract_vector
 11.29      1.27     0.41 13861875     0.00     0.00  rayRectangularIntersection
 10.59      1.65     0.38 10598450     0.00     0.00  normalize

v4 優化 add_vector

  • 使用 Loop Unrolling 優化
out[0] = a[0] + b[0];
out[1] = a[1] + b[1];
out[2] = a[2] + b[2];
  • 執行時間
# make
Execution time of raytracing() : 4.456836 sec
  • 評測
 16.10      0.51     0.51 10598450     0.00     0.00  normalize
 14.05      0.96     0.45 69646433     0.00     0.00  dot_product
 10.73      1.30     0.34 13861875     0.00     0.00  rayRectangularIntersection
  9.47      1.60     0.30 56956357     0.00     0.00  subtract_vector
  7.42      1.83     0.24 31410180     0.00     0.00  multiply_vector
  6.94      2.05     0.22  4620625     0.00     0.00  ray_hit_object
  6.31      2.25     0.20 17821809     0.00     0.00  cross_product

v5x 優化 normalize (X)

Trial 1: Fast Inverse Sqare Root

static inline float Q_rsqrt( float number )
{
    long i;
    float x2, y;
    const float threehalfs = 1.5F;

    x2 = number * 0.5F;
    y  = number;
    i  = * ( long * ) &y;
    i  = 0x5f3759df - ( i >> 1 );
    y  = * ( float * ) &i;
    y  = y * ( threehalfs - ( x2 * y * y ) );

    return y;
}

static inline
void normalize(double *v)
{
    double d = Q_rsqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    assert(d != 0.0 && "Error calculating normal");

    v[0] *= d;
    v[1] *= d;
    v[2] *= d;
}
  • 執行時間
# make
Execution time of raytracing() : 2.345482 sec <- 超快
  • 精度崩了。

image alt

Trial 2: 改成乘法

static inline
void normalize(double *v)
{
    double d = 1.0/sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    assert(d != 0.0 && "Error calculating normal");

    v[0] *= d;
    v[1] *= d;
    v[2] *= d;
}
  • 執行時間
# make
Execution time of raytracing() : 4.386514 sec

Trial 3: Fast Inverse Sqare Root (double version)

static inline double InvSqrt(double number)
{
	long long i;
	double x2, y;
	const double threehalfs = 1.5F;

	x2 = number * 0.5F;
	y = number;
	i = * ( long long * ) &y;
	i = 0x5fe6ec85e7de30da - ( i >> 1 );
	y = * ( double * ) &i;
	y = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration
	y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, this can be removed
	return y;
}
  • 執行時間
# make
Execution time of raytracing() : 4.575374 sec
  • 更慢,精度有準一點但還是不夠。

image alt

v6 優化 multiply_vectors

  • 使用 Loop Unrolling 優化
out[0] = a[0] * b[0];
out[1] = a[1] * b[1];
out[2] = a[2] * b[2];
  • 執行時間
# make
Execution time of raytracing() : 4.356295 sec
  • 評測
 13.03      0.44     0.44 31410180     0.00     0.00  multiply_vector
 12.28      0.85     0.41 10598450     0.00     0.00  normalize
 11.68      1.24     0.39 56956357     0.00     0.00  subtract_vector
 11.09      1.61     0.37 13861875     0.00     0.00  rayRectangularIntersection
 10.79      1.97     0.36 69646433     0.00     0.00  dot_product

v7 優化 multiply_vector

  • 使用 Loop Unrolling 優化
out[0] = a[0] * b;
out[1] = a[1] * b;
out[2] = a[2] * b;
  • 執行時間
# make
Execution time of raytracing() : 3.968329 sec
  • 評測
 13.21      0.40     0.40 10598450     0.00     0.00  normalize
 12.88      0.79     0.39 13861875     0.00     0.00  rayRectangularIntersection
 12.71      1.18     0.39 69646433     0.00     0.00  dot_product
 10.73      1.50     0.33 13861875     0.00     0.00  raySphereIntersection

v8 OpenMP 平行化 (X)

CFLAGS = \
	-std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = \
	-lm -lomp
  • raytracing.c
    int HxW = height * width;
    int i, j;
    #pragma omp parallel for
    for(int itr = 0; itr < HxW; itr++) {
        i = itr % width;
        j = itr / width;
  • 執行時間
# make
Execution time of raytracing() : 1.581772 sec
  • 評測
 17.30      0.89     0.89 10703749     0.00     0.00  subtract_vector
 11.53      1.48     0.59  2033242     0.00     0.00  normalize
 11.04      2.04     0.57 13231699     0.00     0.00  dot_product
 10.65      2.59     0.55  6265867     0.00     0.00  multiply_vector
  8.31      3.01     0.43  2719594     0.00     0.00  rayRectangularIntersection
  • 顏色崩了,待修
  • 找不出原因,改用 pthread 改寫。

v9 Pthread 平行化

LDFLAGS = \
	-lm -lpthread
  • raytracing.c
void *work(void *ptr)
{
    entry *info = (entry *) ptr;
    int i, j;
    int HxW = (info->height) * (info->width);
    idx_stack stk;
    point3 d;
    int factor = sqrt(SAMPLES);
   for(int itr = (info->idx); itr < HxW; itr+=NUM_THREAD) {
        j = itr / (info->width);
        i = itr % (info->height);

        double r = 0, g = 0, b = 0;
        /* MSAA */
        for (int s = 0; s < SAMPLES; s++) {
            idx_stack_init(&stk);
            rayConstruction(d, info->u, info->v, info->w,
                            i * factor + s / factor,
                            j * factor + s % factor,
                            info->view,
                            (info->width) * factor, (info->height) * factor);
            if (ray_color(info->view->vrp, 0.0, d, &stk, info->rectangulars, info->spheres,
                          info->lights, info->object_color,
                          MAX_REFLECTION_BOUNCES)) {
                r += (info->object_color[0]);
                g += (info->object_color[1]);
                b += (info->object_color[2]);
            } else {
                r += (info->background_color[0]);
                g += (info->background_color[1]);
                b += (info->background_color[2]);
            }
            info->pixels[((itr) * 3) + 0] = r * 255 / SAMPLES;
            info->pixels[((itr) * 3) + 1] = g * 255 / SAMPLES;
            info->pixels[((itr) * 3) + 2] = b * 255 / SAMPLES;
        }
    }
    return NULL;
}
/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
                rectangular_node rectangulars, sphere_node spheres,
                light_node lights, const viewpoint *view,
                int width, int height)
{
    point3 u, v, w;
    color object_color = { 0.0, 0.0, 0.0 };
    /* calculate u, v, w */
    calculateBasisVectors(u, v, w, view);
    pthread_t *thread_array = (pthread_t *) malloc (sizeof(pthread_t) * NUM_THREAD);
    entry *message = (entry*) malloc (sizeof(entry) * NUM_THREAD);
    for(int i = 0; i < NUM_THREAD; i++) {
        message[i].idx = i;
        message[i].pixels = pixels;
        multiply_vector(background_color, 1.0, message[i].background_color);
        message[i].rectangulars = rectangulars;
        message[i].spheres = spheres;
        message[i].lights = lights;
        message[i].view = view;
        message[i].width = width;
        message[i].height = height;
        multiply_vector(u, 1.0, message[i].u);
        multiply_vector(v, 1.0, message[i].v);
        multiply_vector(w, 1.0, message[i].w);
        multiply_vector(object_color, 1.0, message[i].object_color);
        pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
        pthread_join(thread_array[i], NULL);
    }
}
  • 執行時間
# make
Execution time of raytracing() : 4.010901 sec
  • 改用 job/worker?

分配工作

const int NUM_THREAD = 4, NUM_WORK = 100;
pthread_mutex_t thread_mutex;
int jobs;
int max(int a, int b)
{
    return a > b ? a : b;
}
void *work(void *ptr)
{
    int S, E;

    while(1) {
        pthread_mutex_lock(&thread_mutex);
        if(jobs != 0) {
            E = jobs;
            jobs = max(jobs - NUM_WORK, 0);
            S = jobs;
        } else S = E = -1;
        pthread_mutex_unlock(&thread_mutex);

        if(S == -1) break;
		for(int itr = S; itr < E; itr++)
		...
}

/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
                rectangular_node rectangulars, sphere_node spheres,
                light_node lights, const viewpoint *view,
                int width, int height)
{
    jobs = height * width;

  • 執行時間
Execution time of raytracing() : 3.977515 sec
  • 怎麼可能?
  • 改成先設置好每個執行緒再啟動
    for(int i = 0; i < NUM_THREAD; i++) {
        pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
    }
    for(int i = 0; i < NUM_THREAD; i++)
        pthread_join(thread_array[i], NULL);
  • 執行時間
Execution time of raytracing() : 1.002260 sec
Execution time of raytracing() : 0.306669 sec <- 如果再加上 -Ofast 超快
  • 評測
 17.48      0.80     0.80  8437099     0.09     0.09  subtract_vector
 12.23      1.36     0.56  1561304     0.36     0.36  normalize
 11.80      1.90     0.54 10395837     0.05     0.05  dot_product
 10.38      2.38     0.48  4785927     0.10     0.10  multiply_vector

v10x AVX SIMD 加速 (X)

CFLAGS = \
	-std=gnu99 -Wall -O0 -g -mavx
  • math-toolkit.h
#include <immintrin.h>
static inline
void normalize(double *v)
{
	double d = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
	assert(d != 0.0 && "Error calculating normal");
	double tmp[4] = {d, d, d, 0};
	_mm256_store_pd(v, _mm256_div_pd(*(__m256d*)v, *(__m256d*)tmp));
}
static inline
void add_vector(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_add_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void subtract_vector(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_sub_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vector(const double *a, double b, double *out)
{
	double tmp[4] = {b, b, b, 0};
	_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)tmp));
}
  • 執行時間
    • 變慢:(0.667676/0.626422) * 1.002260 = 1.068265399

v11x 再戰 OpenMP (X)

  • 嘗試將 OMP 套用在 math-toolkit.h 的小型函式
  • math-toolkit.h
#pragma omp simd
for(int i=0; i < 3; i++)
	out[i] = a[i] + b[i];
  • 執行時間
Execution time of raytracing() : 1.421563 sec

v12 強制開啟 inline

__attribute__((always_inline))
void normalize(double *v)
__attribute__((always_inline))
double length(const double *v)
__attribute__((always_inline))
void add_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void subtract_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vectors(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vector(const double *a, double b, double *out)
__attribute__((always_inline))
void cross_product(const double *v1, const double *v2, double *out)
__attribute__((always_inline))
double dot_product(const double *v1, const double *v2)
__attribute__((always_inline))
void scalar_triple_product(const double *u, const double *v, const double *w,
                           double *out)
__attribute__((always_inline))
double scalar_triple(const double *u, const double *v, const double *w)
  • 執行時間
Execution time of raytracing() : 0.876538 sec
  • 提高 7.252796798 倍

v13x Linux Kernel Hack (X)

  • 對於常用的 Memory,嘗試去加快存取的速度。
  • 使用 madvise / posix_madvise 建議 Kernel 對於某塊記憶體如何存取。
  • main.c
#include <sys/mman.h>
madvise(pixels, sizeof(unsigned char) * ROWS * COLS * 3, MADV_RANDOM);
  • 試過 MADV_RANDOM, MADV_WILLNEED 都沒有提高速度。

v14x Lock-Free Programming (X)

 while((S = jobs) && __sync_bool_compare_and_swap(&jobs, jobs, jobs - NUM_WORK)); 
  • 執行時間:約5秒多,慢快5倍…

v15-Ofastasm 作弊

0000000000401135 <add_vector>:
  401135:	55                   	push   %rbp
  401136:	48 89 e5             	mov    %rsp,%rbp
  401139:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
  40113d:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
  401141:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
  401145:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  401149:	f2 0f 10 08          	movsd  (%rax),%xmm1
  40114d:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  401151:	f2 0f 10 00          	movsd  (%rax),%xmm0
  401155:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  401159:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  40115d:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  401161:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  401165:	48 83 c0 08          	add    $0x8,%rax
  401169:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  40116d:	48 83 c2 08          	add    $0x8,%rdx
  401171:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
  401175:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
  401179:	48 83 c2 08          	add    $0x8,%rdx
  40117d:	f2 0f 10 02          	movsd  (%rdx),%xmm0
  401181:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  401185:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  401189:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  40118d:	48 83 c0 10          	add    $0x10,%rax
  401191:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  401195:	48 83 c2 10          	add    $0x10,%rdx
  401199:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
  40119d:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
  4011a1:	48 83 c2 10          	add    $0x10,%rdx
  4011a5:	f2 0f 10 02          	movsd  (%rdx),%xmm0
  4011a9:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  4011ad:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  4011b1:	90                   	nop
  4011b2:	5d                   	pop    %rbp
  4011b3:	c3                   	retq   
0000000000403350 <add_vector>:
  403350:	f2 0f 10 07          	movsd  (%rdi),%xmm0
  403354:	f2 0f 58 06          	addsd  (%rsi),%xmm0
  403358:	f2 0f 11 02          	movsd  %xmm0,(%rdx)
  40335c:	f2 0f 10 47 08       	movsd  0x8(%rdi),%xmm0
  403361:	f2 0f 58 46 08       	addsd  0x8(%rsi),%xmm0
  403366:	f2 0f 11 42 08       	movsd  %xmm0,0x8(%rdx)
  40336b:	f2 0f 10 47 10       	movsd  0x10(%rdi),%xmm0
  403370:	f2 0f 58 46 10       	addsd  0x10(%rsi),%xmm0
  403375:	f2 0f 11 42 10       	movsd  %xmm0,0x10(%rdx)
  40337a:	c3                   	retq   
  40337b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  • 所以直接複製 -Ofast 的組合語言就可以加速。
  • 寫組語有點痛苦,有些區塊有 jump 不能直接套。
  • 學到 FPU 跟 CPU 的組語不能混用…
static void normalize(double *v)
{
	const double a = 1.0;

	asm("movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm2;"
      "movsd  0x10(%rdi),%xmm3;"
      "mulsd  %xmm1,%xmm1;"
      "mulsd  %xmm2,%xmm2;"
      "mulsd  %xmm3,%xmm3;"
      "addsd  %xmm2,%xmm1;"
      "addsd  %xmm3,%xmm1;"
      "sqrtsd %xmm1,%xmm1;"
      "divsd  %xmm1,%xmm0;"
	  "movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm2;"
      "movsd  0x10(%rdi),%xmm3;"
      "mulsd  %xmm0,%xmm1;"
      "mulsd  %xmm0,%xmm2;"
      "mulsd  %xmm0,%xmm3;"
	  "movsd  %xmm1, (%rdi);"
      "movsd  %xmm2, 0x8(%rdi);"
      "movsd  %xmm3, 0x10(%rdi);"
				);
}

static double length(const double *v)
{
	asm("movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm0;"
      "mulsd  %xmm1,%xmm1;"
      "mulsd  %xmm0,%xmm0;"
      "movsd  0x10(%rdi),%xmm2;"
      "mulsd  %xmm2,%xmm2;"
      "addsd  %xmm1,%xmm0;"
      "addsd  %xmm2,%xmm0;"
      "sqrtsd %xmm0,%xmm0;");
}

static void add_vector(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "addsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "addsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "addsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void subtract_vector(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "subsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "subsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "subsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void multiply_vectors(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "mulsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "mulsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void multiply_vector(const double *a, double b, double *out)
{
	asm("movsd  (%rdi),%xmm1;"
      "mulsd  %xmm0,%xmm1;"
      "movsd  %xmm1,(%rsi);"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  %xmm0,%xmm1;"
      "movsd  %xmm1,0x8(%rsi);"
      "mulsd  0x10(%rdi),%xmm0;"
      "movsd  %xmm0,0x10(%rsi);");
}

static void cross_product(const double *v1, const double *v2, double *out)
{
	asm("movsd  0x8(%rdi),%xmm0;"
      "movsd  0x10(%rdi),%xmm1;"
      "mulsd  0x10(%rsi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "movsd  (%rdi),%xmm1;"
      "mulsd  (%rsi),%xmm0;"
      "mulsd  0x10(%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  (%rdi),%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  0x8(%rsi),%xmm0;"
      "mulsd  (%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static double dot_product(const double *v1, const double *v2)
{
	asm("movsd  (%rdi),%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  (%rsi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm1;"
      "addsd  %xmm1,%xmm0;"
      "movsd  0x10(%rdi),%xmm1;"
      "mulsd  0x10(%rsi),%xmm1;"
			"addsd  %xmm1,%xmm0;");
}

static void scalar_triple_product(const double *u, const double *v, const double *w,
                           double *out)
{
	asm("movsd  0x8(%rsi),%xmm2;"
      "movsd  0x10(%rsi),%xmm0;"
      "mulsd  0x10(%rdx),%xmm2;"
      "mulsd  0x8(%rdx),%xmm0;"
      "subsd  %xmm0,%xmm2;"
      "movsd  %xmm2,(%rcx);"
      "movsd  0x10(%rsi),%xmm1;"
      "movsd  (%rsi),%xmm0;"
      "mulsd  (%rdx),%xmm1;"
      "mulsd  0x10(%rdx),%xmm0;"
      "subsd  %xmm0,%xmm1;"
      "movsd  %xmm1,0x8(%rcx);"
      "movsd  (%rsi),%xmm0;"
      "movsd  0x8(%rsi),%xmm3;"
      "mulsd  0x8(%rdx),%xmm0;"
      "mulsd  (%rdx),%xmm3;"
      "subsd  %xmm3,%xmm0;"
      "movsd  %xmm0,0x10(%rcx);"
      "mulsd  (%rdi),%xmm2;"
      "movsd  %xmm2,(%rcx);"
      "mulsd  0x8(%rdi),%xmm1;"
      "movsd  %xmm1,0x8(%rcx);"
      "mulsd  0x10(%rdi),%xmm0;"
      "movsd  %xmm0,0x10(%rcx);");
}

static double scalar_triple(const double *u, const double *v, const double *w)
{
	asm("movsd  0x8(%rdx),%xmm3;"
      "movsd  0x10(%rdx),%xmm2;"
      "movsd  0x10(%rdi),%xmm4;"
      "movapd %xmm3,%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "movsd  (%rdi),%xmm5;"
      "movapd %xmm2,%xmm7;"
      "movsd  (%rdx),%xmm6;"
      "mulsd  %xmm1,%xmm7;"
      "mulsd  %xmm4,%xmm0;"
      "mulsd  %xmm5,%xmm2;"
      "mulsd  %xmm6,%xmm4;"
      "mulsd  %xmm6,%xmm1;"
      "subsd  %xmm7,%xmm0;"
      "mulsd  %xmm5,%xmm3;"
      "subsd  %xmm4,%xmm2;"
      "mulsd  (%rsi),%xmm0;"
      "subsd  %xmm3,%xmm1;"
      "mulsd  0x8(%rsi),%xmm2;"
      "mulsd  0x10(%rsi),%xmm1;"
      "addsd  %xmm2,%xmm0;"
      "addsd  %xmm1,%xmm0;");
}
  • 執行時間 (9.132853611倍)
Execution time of raytracing() : 0.696097 sec
  • 若要打敗 -Ofast 的 0.3 秒,可能還要重構 raytracing.c 了。