sysprog
主講人: jserv / 課程討論區: 2016 年系統軟體課程
「真正的發現之旅不在於尋求新的風景,而是在於擁有新的眼光。」
“The real voyage of discovery consists not in seeking new landscapes, but in having new eyes.”
― 法國作家 Marcel Proust
[ source ]
(最大公因數; Greatest Common Divisor)
演算法:
示範:
GCD(48, 40)
48 - 40, 40 -> 8, 40
8 != 40, 於是重複上述動作: 40 - 8, 8 -> 32, 8
8 != 32, 於是重複上述動作: 32 - 8, 8 -> 24, 8
8 != 24, 於是重複上述動作: 24 - 8, 8 -> 16, 8
8 != 16, 於是重複上述動作: 16 - 8, 8 -> 8, 8
8 == 8 成立,於是 8 為 GCD
用 C 語言實現歐幾里得法: (v1)
int findGCD(int a, int b) {
while (1) {
if (a > b) a -= b;
else if (a < b) b -= a;
else return a;
}
}
以 Intel C compiler 產生的機械碼來說: (Pentium 4)
指令 | 數量 | 延遲 | cycle count |
---|---|---|---|
減法 | 5 | 1 | 5 |
比較 | 5 | 1 | 5 |
分支 | 14 | 1 | 14 |
其他 | 0 | 1 | 0 |
24 |
改成 mod 操作的 C 語言程式如下: (v2)
int findGCD(int a, int b) {
while (1) {
a %= b;
if (a == 0) return b;
if (a == 1) return 1;
b %= a;
if (b == 0) return a;
if (b == 1) return 1;
}
}
以 Intel C compiler 產生的機械碼來說:
指令 | 數量 | 延遲 | cycle count |
---|---|---|---|
mod (整數除法) | 2 | 68 | 136 |
比較 | 3 | 1 | 3 |
分支 | 3 | 1 | 3 |
其他 | 6 | 1 | 6 |
148 |
如果可排除整數除法和減法的負擔,得到以下實做: (v3)
int findGCD(int a, int b) {
while (1) {
if (a > (b * 4)) {
a %= b;
if (a == 0) return b;
if (a == 1) return 1;
} else if (a >= b) {
a -= b;
if (a == 0) return b;
if (a == 1) return 1;
}
if (b > (a * 4)) {
b %= a;
if (b == 0) return a;
if (b == 1) return 1;
} else if (b >= a) {
b -= a;
if (b == 0) return a;
if (b == 1) return 1;
}
}
}
可得到更快的實做:
v1: 14.56s
v2: 18.55s
v3: 12.14s
實驗人: 張家榮
平台:BeagleBone Black (ARMv7-A, 1 GHz)
實驗程式碼:github
採用組合語言撰寫 (GCD 的部份),由於 Cortex-A8 不支援 idiv (要在 ARM Cortex-A15 以上才有),所以 以 ARM 組合語言實做計算機組織課本上的除法器的空間優化版本。
Assembly
MOD:
@ Entry r0: low (remainder low) must be postive<input:Dividend>
@ r1: high (remainder high) any number
@ r2: Divisor (divisor) must be non-zero and positive<input:Divisor>
low .req r0;
high .req r1;
divisor .req r2;
mov high,#0
.rept 33 @ repeat 33 times
subs high,high,divisor @remainder = remainder - divisor
@it lo
addlo high, high, divisor
adcs low, low, low @ low<<1; ((high-divisor)>0) ? (new bit=1) : (new bit=0)
adc high,high,high @ divisor>>1 == high(64bit for high&low)<<1
.endr
lsr high, #1 @shift remainder to correct position(because last divide don’t know this is the last divide,it still prepare for the next divide<shift>)
@ Exit r0: Quotient (remainder low)
@ r1: remainder (remainder high)
mov pc,lr
Euclidean(thumb) V.S. mod(thumb) V.S. mod_minus(thumb)
Euclidean(arm) V.S. mod(arm) V.S. mod_minus(arm)
[ source ]
void naive_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x++){
for (int y = 0; y < h; y++) {
*(dst + x*h + y) = *(src + y * w + x);
}
}
}
接著參考 Programming trivia: 4x4 integer matrix transpose in SSE2 實作 Intel SSE2 加速的版本,由於先進處理器架構有著 automatic/speculative prefetcher 所以這裡我們故意寫得比較 CPU-unfriendly 一些:
void sse_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
__m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i*)(dst+(x*h)+y), I0);
_mm_storeu_si128((__m128i*)(dst+((x+1)*h)+y), I1);
_mm_storeu_si128((__m128i*)(dst+((x+2)*h)+y), I2);
_mm_storeu_si128((__m128i*)(dst+((x+3)*h)+y), I3);
}
}
}
我們來撰寫一個使用 SSE prefetch 指令的加速版本:
void sse_prefetch_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
#define** PFDIST 8**
_mm_prefetch(src + (y + PFDIST + 0) * w + x, _MM_HINT_T1);
_mm_prefetch(src + (y + PFDIST + 1) * w + x, _MM_HINT_T1);
_mm_prefetch(src + (y + PFDIST + 2) * w + x, _MM_HINT_T1);
_mm_prefetch(src + (y + PFDIST + 3) * w + x, _MM_HINT_T1);
__m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i*)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i*)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i*)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i*)(dst + ((x + 3) * h) + y), I3);
}
}
}
接著是測試程式:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
//for using x86-SSE2 intrinsics you have to include this
#include <xmmintrin.h>
// PUT naive_transpose, sse_transpose, sse_prefetch_transpose HERE
// ...
#define TEST_W 4096
#define TEST_H 4096
int main(void)
{
//verify the result of 4x4 matrix
{
int testin[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
int testout[16];
for(int y = 0; y < 4; y++){
for(int x = 0; x < 4; x++)
printf(" %2d", testin[y*4+x]);
printf("\n");
}
printf("\n");
sse_transpose(testin, testout, 4, 4);
for(int y = 0; y < 4; y++){
for(int x = 0; x < 4; x++)
printf(" %2d", testout[y*4+x]);
printf("\n");
}
}
{
struct timeval stime, etime;
int *src =(int*)malloc(sizeof(int)*TEST_W*TEST_H);
int *out0 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);
int *out1 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);
int *out2 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);
srand(time(NULL));
for(int y = 0; y < TEST_H; y++){
for(int x = 0; x < TEST_W; x++)
*(src + y*TEST_W + x) = rand();
}
gettimeofday(&stime, NULL);
sse_prefetch_transpose(src, out0, TEST_W, TEST_H);
printf("sse prefetch: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));
gettimeofday(&stime, NULL);
sse_transpose(src, out1, TEST_W, TEST_H);
gettimeofday(&etime, NULL);
printf("sse: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));
gettimeofday(&stime, NULL);
naive_transpose(src, out2, TEST_W, TEST_H);
gettimeofday(&etime, NULL);
printf("naive: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));
free(src);
free(out0);
free(out1);
free(out2);
}
return 0;
}
存成 test.c, 並且按照下列方式編譯
$ gcc -msse2 -o test test.c
在 Intel Core i7 3612QM 執行得到下列結果:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
sse prefetch: 57782 us
sse: 117184 us
naive: 228870 us
儘管現今 CPU 有著強大的 auto-prefetcher,但是並不是所有的演算都有著規律或是線型的記憶體存取模式, 如此透過優化 prefetching 依然是能夠有相當的效能增進.
在 EeePC 900 (Intel Celeron M 900 MHz) 仍有 22% 的效能差異:
sse prefetch: 676158 us
sse: 860861 us
naive: 1644733 us