2016q3 Homework1 (raytracing)

contributed by <Quexint>

報告

大綱

環境

OS: Arch Linux 4.7.4-1-ARCH
CPU: Intel® Core™2 Quad CPU @ 2.40GHz
MEM: 4G
GCC: 6.2.1

規格

給定一段高數值計算的程式專案，在無損答案精度下，求計算時間最短。

實驗

Loop Unrolling
- dot_product
- add_vector
- substruct_vector
- multiply_vector
- multiply_vectors
函式優化
- CBLAS on dot_product
- Inverse Square Root on normalize
平行化
- 平行化模型
  - Jobs/Worker
  - Lock-Free Programming
- OMP on Raytracing
- OMP on Math-Toolkit
- Pthread on Raytraing
- AVX SIMD on Math-Toolkie
編譯器
- Memory Advise
- inline on Math-Toolkit
- asm on Math-Toolkit

結果

開發記錄

參考
- Optimization of Computer Programs in C

`v0` 原始版本

執行時間

# make
Execution time of raytracing() : 6.357352 sec

評測

 23.57      1.06     1.06 69646433     0.00     0.00  dot_product
 11.56      1.58     0.52 56956357     0.00     0.00  subtract_vector
 10.45      2.05     0.47 13861875     0.00     0.00  rayRectangularIntersection

`v1` 優化 `dot_product`

使用 Loop Unrolling 優化

return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];

執行時間

# make
Execution time of raytracing() : 5.628290 sec

評測

 18.55      0.76     0.76 56956357     0.00     0.00  subtract_vector
 13.67      1.32     0.56 10598450     0.00     0.00  normalize
 10.98      1.77     0.45 69646433     0.00     0.00  dot_product
 10.13      2.19     0.42 13861875     0.00     0.00  rayRectangularIntersection

`v2x` 用 CBLAS 優化 `dot_product` (X)

參考
- CBLAS Code
- BLAS Library
以前寫過一點數值的程式，有用過數值的函式庫CBLAS。
- BLAS for Pascal, CBLAS for C
CBLAS 有 LEVEL1 (向量操作), 2(矩陣-向量操作), 3(矩陣-矩陣操作)
sudo pacman -S cblas
Makefile

-lcblas

math-toolkit.h

#include <cblas.h>
#define dot_product(x, y) cblas_ddot(3, x, 1, y, 1)

執行時間

Execution time of raytracing() : 5.377129 sec

沒有顯著的提升

`v3` 優化 `subtract_vector`

使用 Loop Unrolling 優化

out[0] = a[0] - b[0];
out[1] = a[1] - b[1];
out[2] = a[2] - b[2];

執行時間

# make
Execution time of raytracing() : 4.737710 sec

評測

 12.40      0.45     0.45 69646433     0.00     0.00  dot_product
 11.71      0.87     0.42 56956357     0.00     0.00  subtract_vector
 11.29      1.27     0.41 13861875     0.00     0.00  rayRectangularIntersection
 10.59      1.65     0.38 10598450     0.00     0.00  normalize

`v4` 優化 `add_vector`

使用 Loop Unrolling 優化

out[0] = a[0] + b[0];
out[1] = a[1] + b[1];
out[2] = a[2] + b[2];

執行時間

# make
Execution time of raytracing() : 4.456836 sec

評測

 16.10      0.51     0.51 10598450     0.00     0.00  normalize
 14.05      0.96     0.45 69646433     0.00     0.00  dot_product
 10.73      1.30     0.34 13861875     0.00     0.00  rayRectangularIntersection
  9.47      1.60     0.30 56956357     0.00     0.00  subtract_vector
  7.42      1.83     0.24 31410180     0.00     0.00  multiply_vector
  6.94      2.05     0.22  4620625     0.00     0.00  ray_hit_object
  6.31      2.25     0.20 17821809     0.00     0.00  cross_product

`v5x` 優化 `normalize` (X)

分析: 強化 sqrt(x) 或是強化除法
參考

Trial 1: Fast Inverse Sqare Root

static inline float Q_rsqrt( float number )
{
    long i;
    float x2, y;
    const float threehalfs = 1.5F;

    x2 = number * 0.5F;
    y  = number;
    i  = * ( long * ) &y;
    i  = 0x5f3759df - ( i >> 1 );
    y  = * ( float * ) &i;
    y  = y * ( threehalfs - ( x2 * y * y ) );

    return y;
}

static inline
void normalize(double *v)
{
    double d = Q_rsqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    assert(d != 0.0 && "Error calculating normal");

    v[0] *= d;
    v[1] *= d;
    v[2] *= d;
}

執行時間

# make
Execution time of raytracing() : 2.345482 sec <- 超快

精度崩了。

Trial 2: 改成乘法

static inline
void normalize(double *v)
{
    double d = 1.0/sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    assert(d != 0.0 && "Error calculating normal");

    v[0] *= d;
    v[1] *= d;
    v[2] *= d;
}

執行時間

# make
Execution time of raytracing() : 4.386514 sec

Trial 3: Fast Inverse Sqare Root (double version)

static inline double InvSqrt(double number)
{
	long long i;
	double x2, y;
	const double threehalfs = 1.5F;

	x2 = number * 0.5F;
	y = number;
	i = * ( long long * ) &y;
	i = 0x5fe6ec85e7de30da - ( i >> 1 );
	y = * ( double * ) &i;
	y = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration
	y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, this can be removed
	return y;
}

執行時間

# make
Execution time of raytracing() : 4.575374 sec

更慢，精度有準一點但還是不夠。

`v6` 優化 `multiply_vectors`

使用 Loop Unrolling 優化

out[0] = a[0] * b[0];
out[1] = a[1] * b[1];
out[2] = a[2] * b[2];

執行時間

# make
Execution time of raytracing() : 4.356295 sec

評測

 13.03      0.44     0.44 31410180     0.00     0.00  multiply_vector
 12.28      0.85     0.41 10598450     0.00     0.00  normalize
 11.68      1.24     0.39 56956357     0.00     0.00  subtract_vector
 11.09      1.61     0.37 13861875     0.00     0.00  rayRectangularIntersection
 10.79      1.97     0.36 69646433     0.00     0.00  dot_product

`v7` 優化 `multiply_vector`

使用 Loop Unrolling 優化

out[0] = a[0] * b;
out[1] = a[1] * b;
out[2] = a[2] * b;

執行時間

# make
Execution time of raytracing() : 3.968329 sec

評測

 13.21      0.40     0.40 10598450     0.00     0.00  normalize
 12.88      0.79     0.39 13861875     0.00     0.00  rayRectangularIntersection
 12.71      1.18     0.39 69646433     0.00     0.00  dot_product
 10.73      1.50     0.33 13861875     0.00     0.00  raySphereIntersection

`v8` OpenMP 平行化 (X)

參考
- https://computing.llnl.gov/tutorials/openMP/
- http://bisqwit.iki.fi/story/howto/openmp/
sudo pacman -S openmp
Makefile

CFLAGS = \
	-std=gnu99 -Wall -O0 -g -fopenmp
LDFLAGS = \
	-lm -lomp

raytracing.c

    int HxW = height * width;
    int i, j;
    #pragma omp parallel for
    for(int itr = 0; itr < HxW; itr++) {
        i = itr % width;
        j = itr / width;

執行時間

# make
Execution time of raytracing() : 1.581772 sec

評測

 17.30      0.89     0.89 10703749     0.00     0.00  subtract_vector
 11.53      1.48     0.59  2033242     0.00     0.00  normalize
 11.04      2.04     0.57 13231699     0.00     0.00  dot_product
 10.65      2.59     0.55  6265867     0.00     0.00  multiply_vector
  8.31      3.01     0.43  2719594     0.00     0.00  rayRectangularIntersection

顏色崩了，待修
找不出原因，改用 pthread 改寫。

`v9` Pthread 平行化

參考
- Pthread Library
Makefile

LDFLAGS = \
	-lm -lpthread

raytracing.c

void *work(void *ptr)
{
    entry *info = (entry *) ptr;
    int i, j;
    int HxW = (info->height) * (info->width);
    idx_stack stk;
    point3 d;
    int factor = sqrt(SAMPLES);
   for(int itr = (info->idx); itr < HxW; itr+=NUM_THREAD) {
        j = itr / (info->width);
        i = itr % (info->height);

        double r = 0, g = 0, b = 0;
        /* MSAA */
        for (int s = 0; s < SAMPLES; s++) {
            idx_stack_init(&stk);
            rayConstruction(d, info->u, info->v, info->w,
                            i * factor + s / factor,
                            j * factor + s % factor,
                            info->view,
                            (info->width) * factor, (info->height) * factor);
            if (ray_color(info->view->vrp, 0.0, d, &stk, info->rectangulars, info->spheres,
                          info->lights, info->object_color,
                          MAX_REFLECTION_BOUNCES)) {
                r += (info->object_color[0]);
                g += (info->object_color[1]);
                b += (info->object_color[2]);
            } else {
                r += (info->background_color[0]);
                g += (info->background_color[1]);
                b += (info->background_color[2]);
            }
            info->pixels[((itr) * 3) + 0] = r * 255 / SAMPLES;
            info->pixels[((itr) * 3) + 1] = g * 255 / SAMPLES;
            info->pixels[((itr) * 3) + 2] = b * 255 / SAMPLES;
        }
    }
    return NULL;
}
/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
                rectangular_node rectangulars, sphere_node spheres,
                light_node lights, const viewpoint *view,
                int width, int height)
{
    point3 u, v, w;
    color object_color = { 0.0, 0.0, 0.0 };
    /* calculate u, v, w */
    calculateBasisVectors(u, v, w, view);
    pthread_t *thread_array = (pthread_t *) malloc (sizeof(pthread_t) * NUM_THREAD);
    entry *message = (entry*) malloc (sizeof(entry) * NUM_THREAD);
    for(int i = 0; i < NUM_THREAD; i++) {
        message[i].idx = i;
        message[i].pixels = pixels;
        multiply_vector(background_color, 1.0, message[i].background_color);
        message[i].rectangulars = rectangulars;
        message[i].spheres = spheres;
        message[i].lights = lights;
        message[i].view = view;
        message[i].width = width;
        message[i].height = height;
        multiply_vector(u, 1.0, message[i].u);
        multiply_vector(v, 1.0, message[i].v);
        multiply_vector(w, 1.0, message[i].w);
        multiply_vector(object_color, 1.0, message[i].object_color);
        pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
        pthread_join(thread_array[i], NULL);
    }
}

執行時間

# make
Execution time of raytracing() : 4.010901 sec

改用 job/worker?

分配工作

參考
- Mutex Lock Code Examples
raytracing.c

const int NUM_THREAD = 4, NUM_WORK = 100;
pthread_mutex_t thread_mutex;
int jobs;
int max(int a, int b)
{
    return a > b ? a : b;
}
void *work(void *ptr)
{
    int S, E;

    while(1) {
        pthread_mutex_lock(&thread_mutex);
        if(jobs != 0) {
            E = jobs;
            jobs = max(jobs - NUM_WORK, 0);
            S = jobs;
        } else S = E = -1;
        pthread_mutex_unlock(&thread_mutex);

        if(S == -1) break;
		for(int itr = S; itr < E; itr++)
		...
}

/* @param background_color this is not ambient light */
void raytracing(uint8_t *pixels, color background_color,
                rectangular_node rectangulars, sphere_node spheres,
                light_node lights, const viewpoint *view,
                int width, int height)
{
    jobs = height * width;

執行時間

Execution time of raytracing() : 3.977515 sec

怎麼可能？
改成先設置好每個執行緒再啟動

    for(int i = 0; i < NUM_THREAD; i++) {
        pthread_create(&thread_array[i], NULL, work, (void*)&message[i]);
    }
    for(int i = 0; i < NUM_THREAD; i++)
        pthread_join(thread_array[i], NULL);

執行時間

Execution time of raytracing() : 1.002260 sec
Execution time of raytracing() : 0.306669 sec <- 如果再加上 -Ofast 超快

評測

 17.48      0.80     0.80  8437099     0.09     0.09  subtract_vector
 12.23      1.36     0.56  1561304     0.36     0.36  normalize
 11.80      1.90     0.54 10395837     0.05     0.05  dot_product
 10.38      2.38     0.48  4785927     0.10     0.10  multiply_vector

`v10x` AVX SIMD 加速 (X)

參考
- http://wiki.csie.ncku.edu.tw/embedded/2015q3h1ext
- Intrinsics for Intel® Advanced Vector Extensions
前情提要
- 測試環境 CPU 不支援 AVX ，所以換了一個環境測，以下倍率為估算值(新環境_T_v10 / 新環境_T_v9)
Makefile

CFLAGS = \
	-std=gnu99 -Wall -O0 -g -mavx

math-toolkit.h

#include <immintrin.h>
static inline
void normalize(double *v)
{
	double d = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
	assert(d != 0.0 && "Error calculating normal");
	double tmp[4] = {d, d, d, 0};
	_mm256_store_pd(v, _mm256_div_pd(*(__m256d*)v, *(__m256d*)tmp));
}
static inline
void add_vector(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_add_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void subtract_vector(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_sub_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vectors(const double *a, const double *b, double *out)
{
	_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)b));
}
static inline
void multiply_vector(const double *a, double b, double *out)
{
	double tmp[4] = {b, b, b, 0};
	_mm256_store_pd(out, _mm256_mul_pd(*(__m256d*)a, *(__m256d*)tmp));
}

執行時間
- 變慢：(0.667676/0.626422) * 1.002260 = 1.068265399

`v11x` 再戰 OpenMP (X)

嘗試將 OMP 套用在 math-toolkit.h 的小型函式
math-toolkit.h

#pragma omp simd
for(int i=0; i < 3; i++)
	out[i] = a[i] + b[i];

執行時間

Execution time of raytracing() : 1.421563 sec

`v12` 強制開啟 `inline`

參考
- How do I force gcc to inline a function?
- force inline function in other translation unit
math-toolkit.h

__attribute__((always_inline))
void normalize(double *v)
__attribute__((always_inline))
double length(const double *v)
__attribute__((always_inline))
void add_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void subtract_vector(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vectors(const double *a, const double *b, double *out)
__attribute__((always_inline))
void multiply_vector(const double *a, double b, double *out)
__attribute__((always_inline))
void cross_product(const double *v1, const double *v2, double *out)
__attribute__((always_inline))
double dot_product(const double *v1, const double *v2)
__attribute__((always_inline))
void scalar_triple_product(const double *u, const double *v, const double *w,
                           double *out)
__attribute__((always_inline))
double scalar_triple(const double *u, const double *v, const double *w)

執行時間

Execution time of raytracing() : 0.876538 sec

提高 7.252796798 倍

`v13x` Linux Kernel Hack (X)

對於常用的 Memory，嘗試去加快存取的速度。
使用 madvise / posix_madvise 建議 Kernel 對於某塊記憶體如何存取。
main.c

#include <sys/mman.h>
madvise(pixels, sizeof(unsigned char) * ROWS * COLS * 3, MADV_RANDOM);

試過 MADV_RANDOM, MADV_WILLNEED 都沒有提高速度。

`v14x` Lock-Free Programming (X)

參考:
試圖拿掉保護 Jobs 的互斥鎖。
採用 GCC 內建的原子操作

 while((S = jobs) && __sync_bool_compare_and_swap(&jobs, jobs, jobs - NUM_WORK));

執行時間：約5秒多，慢快5倍…

`v15` 用 `-Ofast` 及 `asm` 作弊

參考
由於開 -Ofast 可以將速度提高到 0.300528sec，所以可以分析一下是怎麼變快的。
用 objdump -d raytracing | vim - 對照有無開啟 -Ofast
以 add_vector 為例，行數就差好幾倍：

0000000000401135 <add_vector>:
  401135:	55                   	push   %rbp
  401136:	48 89 e5             	mov    %rsp,%rbp
  401139:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
  40113d:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
  401141:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
  401145:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  401149:	f2 0f 10 08          	movsd  (%rax),%xmm1
  40114d:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  401151:	f2 0f 10 00          	movsd  (%rax),%xmm0
  401155:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  401159:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  40115d:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  401161:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  401165:	48 83 c0 08          	add    $0x8,%rax
  401169:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  40116d:	48 83 c2 08          	add    $0x8,%rdx
  401171:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
  401175:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
  401179:	48 83 c2 08          	add    $0x8,%rdx
  40117d:	f2 0f 10 02          	movsd  (%rdx),%xmm0
  401181:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  401185:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  401189:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  40118d:	48 83 c0 10          	add    $0x10,%rax
  401191:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  401195:	48 83 c2 10          	add    $0x10,%rdx
  401199:	f2 0f 10 0a          	movsd  (%rdx),%xmm1
  40119d:	48 8b 55 f0          	mov    -0x10(%rbp),%rdx
  4011a1:	48 83 c2 10          	add    $0x10,%rdx
  4011a5:	f2 0f 10 02          	movsd  (%rdx),%xmm0
  4011a9:	f2 0f 58 c1          	addsd  %xmm1,%xmm0
  4011ad:	f2 0f 11 00          	movsd  %xmm0,(%rax)
  4011b1:	90                   	nop
  4011b2:	5d                   	pop    %rbp
  4011b3:	c3                   	retq

0000000000403350 <add_vector>:
  403350:	f2 0f 10 07          	movsd  (%rdi),%xmm0
  403354:	f2 0f 58 06          	addsd  (%rsi),%xmm0
  403358:	f2 0f 11 02          	movsd  %xmm0,(%rdx)
  40335c:	f2 0f 10 47 08       	movsd  0x8(%rdi),%xmm0
  403361:	f2 0f 58 46 08       	addsd  0x8(%rsi),%xmm0
  403366:	f2 0f 11 42 08       	movsd  %xmm0,0x8(%rdx)
  40336b:	f2 0f 10 47 10       	movsd  0x10(%rdi),%xmm0
  403370:	f2 0f 58 46 10       	addsd  0x10(%rsi),%xmm0
  403375:	f2 0f 11 42 10       	movsd  %xmm0,0x10(%rdx)
  40337a:	c3                   	retq   
  40337b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

所以直接複製 -Ofast 的組合語言就可以加速。
寫組語有點痛苦，有些區塊有 jump 不能直接套。
學到 FPU 跟 CPU 的組語不能混用…

static void normalize(double *v)
{
	const double a = 1.0;

	asm("movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm2;"
      "movsd  0x10(%rdi),%xmm3;"
      "mulsd  %xmm1,%xmm1;"
      "mulsd  %xmm2,%xmm2;"
      "mulsd  %xmm3,%xmm3;"
      "addsd  %xmm2,%xmm1;"
      "addsd  %xmm3,%xmm1;"
      "sqrtsd %xmm1,%xmm1;"
      "divsd  %xmm1,%xmm0;"
	  "movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm2;"
      "movsd  0x10(%rdi),%xmm3;"
      "mulsd  %xmm0,%xmm1;"
      "mulsd  %xmm0,%xmm2;"
      "mulsd  %xmm0,%xmm3;"
	  "movsd  %xmm1, (%rdi);"
      "movsd  %xmm2, 0x8(%rdi);"
      "movsd  %xmm3, 0x10(%rdi);"
				);
}

static double length(const double *v)
{
	asm("movsd  (%rdi),%xmm1;"
      "movsd  0x8(%rdi),%xmm0;"
      "mulsd  %xmm1,%xmm1;"
      "mulsd  %xmm0,%xmm0;"
      "movsd  0x10(%rdi),%xmm2;"
      "mulsd  %xmm2,%xmm2;"
      "addsd  %xmm1,%xmm0;"
      "addsd  %xmm2,%xmm0;"
      "sqrtsd %xmm0,%xmm0;");
}

static void add_vector(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "addsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "addsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "addsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void subtract_vector(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "subsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "subsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "subsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void multiply_vectors(const double *a, const double *b, double *out)
{
	asm("movsd  (%rdi),%xmm0;"
      "mulsd  (%rsi),%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x8(%rdi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "mulsd  0x10(%rsi),%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static void multiply_vector(const double *a, double b, double *out)
{
	asm("movsd  (%rdi),%xmm1;"
      "mulsd  %xmm0,%xmm1;"
      "movsd  %xmm1,(%rsi);"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  %xmm0,%xmm1;"
      "movsd  %xmm1,0x8(%rsi);"
      "mulsd  0x10(%rdi),%xmm0;"
      "movsd  %xmm0,0x10(%rsi);");
}

static void cross_product(const double *v1, const double *v2, double *out)
{
	asm("movsd  0x8(%rdi),%xmm0;"
      "movsd  0x10(%rdi),%xmm1;"
      "mulsd  0x10(%rsi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,(%rdx);"
      "movsd  0x10(%rdi),%xmm0;"
      "movsd  (%rdi),%xmm1;"
      "mulsd  (%rsi),%xmm0;"
      "mulsd  0x10(%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,0x8(%rdx);"
      "movsd  (%rdi),%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  0x8(%rsi),%xmm0;"
      "mulsd  (%rsi),%xmm1;"
      "subsd  %xmm1,%xmm0;"
      "movsd  %xmm0,0x10(%rdx);");
}

static double dot_product(const double *v1, const double *v2)
{
	asm("movsd  (%rdi),%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "mulsd  (%rsi),%xmm0;"
      "mulsd  0x8(%rsi),%xmm1;"
      "addsd  %xmm1,%xmm0;"
      "movsd  0x10(%rdi),%xmm1;"
      "mulsd  0x10(%rsi),%xmm1;"
			"addsd  %xmm1,%xmm0;");
}

static void scalar_triple_product(const double *u, const double *v, const double *w,
                           double *out)
{
	asm("movsd  0x8(%rsi),%xmm2;"
      "movsd  0x10(%rsi),%xmm0;"
      "mulsd  0x10(%rdx),%xmm2;"
      "mulsd  0x8(%rdx),%xmm0;"
      "subsd  %xmm0,%xmm2;"
      "movsd  %xmm2,(%rcx);"
      "movsd  0x10(%rsi),%xmm1;"
      "movsd  (%rsi),%xmm0;"
      "mulsd  (%rdx),%xmm1;"
      "mulsd  0x10(%rdx),%xmm0;"
      "subsd  %xmm0,%xmm1;"
      "movsd  %xmm1,0x8(%rcx);"
      "movsd  (%rsi),%xmm0;"
      "movsd  0x8(%rsi),%xmm3;"
      "mulsd  0x8(%rdx),%xmm0;"
      "mulsd  (%rdx),%xmm3;"
      "subsd  %xmm3,%xmm0;"
      "movsd  %xmm0,0x10(%rcx);"
      "mulsd  (%rdi),%xmm2;"
      "movsd  %xmm2,(%rcx);"
      "mulsd  0x8(%rdi),%xmm1;"
      "movsd  %xmm1,0x8(%rcx);"
      "mulsd  0x10(%rdi),%xmm0;"
      "movsd  %xmm0,0x10(%rcx);");
}

static double scalar_triple(const double *u, const double *v, const double *w)
{
	asm("movsd  0x8(%rdx),%xmm3;"
      "movsd  0x10(%rdx),%xmm2;"
      "movsd  0x10(%rdi),%xmm4;"
      "movapd %xmm3,%xmm0;"
      "movsd  0x8(%rdi),%xmm1;"
      "movsd  (%rdi),%xmm5;"
      "movapd %xmm2,%xmm7;"
      "movsd  (%rdx),%xmm6;"
      "mulsd  %xmm1,%xmm7;"
      "mulsd  %xmm4,%xmm0;"
      "mulsd  %xmm5,%xmm2;"
      "mulsd  %xmm6,%xmm4;"
      "mulsd  %xmm6,%xmm1;"
      "subsd  %xmm7,%xmm0;"
      "mulsd  %xmm5,%xmm3;"
      "subsd  %xmm4,%xmm2;"
      "mulsd  (%rsi),%xmm0;"
      "subsd  %xmm3,%xmm1;"
      "mulsd  0x8(%rsi),%xmm2;"
      "mulsd  0x10(%rsi),%xmm1;"
      "addsd  %xmm2,%xmm0;"
      "addsd  %xmm1,%xmm0;");
}

執行時間 (9.132853611倍)

Execution time of raytracing() : 0.696097 sec

若要打敗 -Ofast 的 0.3 秒，可能還要重構 raytracing.c 了。

2016q3 Homework1 (raytracing)

報告

大綱

環境

規格

實驗

結果

開發記錄

v0 原始版本

v1 優化 dot_product

v2x 用 CBLAS 優化 dot_product (X)

v3 優化 subtract_vector

v4 優化 add_vector

v5x 優化 normalize (X)

Trial 1: Fast Inverse Sqare Root

Trial 2: 改成乘法

Trial 3: Fast Inverse Sqare Root (double version)

v6 優化 multiply_vectors

v7 優化 multiply_vector

v8 OpenMP 平行化 (X)

v9 Pthread 平行化

分配工作

v10x AVX SIMD 加速 (X)

v11x 再戰 OpenMP (X)

v12 強制開啟 inline

v13x Linux Kernel Hack (X)

v14x Lock-Free Programming (X)

v15 用 -Ofast 及 asm 作弊

`v0` 原始版本

`v1` 優化 `dot_product`

`v2x` 用 CBLAS 優化 `dot_product` (X)

`v3` 優化 `subtract_vector`

`v4` 優化 `add_vector`

`v5x` 優化 `normalize` (X)

`v6` 優化 `multiply_vectors`

`v7` 優化 `multiply_vector`

`v8` OpenMP 平行化 (X)

`v9` Pthread 平行化

`v10x` AVX SIMD 加速 (X)

`v11x` 再戰 OpenMP (X)

`v12` 強制開啟 `inline`

`v13x` Linux Kernel Hack (X)

`v14x` Lock-Free Programming (X)

`v15` 用 `-Ofast` 及 `asm` 作弊