2017q1 Homework3 (software-pipelining)

contributed by <Sean1127>

作業要求

論文

實驗

原始版

$ ./main
sse prefetch: 	 68693 us
sse: 		 132225 us
naive: 		 265718 us

naive



for (int x = 0; x < w; x++)
    for (int y = 0; y < h; y++)
        *(dst + x * h + y) = *(src + y * w + x);

最直覺的寫法
從第 1 列第 1 行向下執行，直到第 1 行完成再換下 1 列
因為是做矩陣轉置，所以不管先做行運算或列運算，都不能利用 space locality

src

0 1 2 3
dst

0

1

2

3

但是如果真的把迴圈順序交換



for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
        *(dst + x * h + y) = *(src + y * w + x);

時間其實是有差的，目前還沒想到是為什麼

原始版

$ perf stat -r 5 -e L1-dcache-load-misses,cache-misses,cache-references,instructions,cycles ./naive
time: 	 343626 us
time: 	 338534 us
time: 	 354738 us
time: 	 344098 us
time: 	 358309 us

Performance counter stats for './naive' (5 runs):
28,315,632      L1-dcache-load-misses
16,192,372      cache-misses
17,692,297      cache-references
1,523,833,604      instructions
1,345,559,038      cycles
0.546889550 seconds time elapsed

交換版

time: 	 365820 us
time: 	 389949 us
time: 	 368741 us
time: 	 365922 us
time: 	 368898 us

Performance counter stats for './naivee' (5 runs):

28,988,883      L1-dcache-load-misses
16,513,842      cache-misses
18,892,205      cache-references
1,524,141,580      instructions
1,506,438,915      cycles

0.569884695 seconds time elapsed

sse




















for (int x = 0; x < w; x += 4) {
    for (int y = 0; y < h; y += 4) {
        __m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
        __m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
        __m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
        __m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
        __m128i T0 = _mm_unpacklo_epi32(I0, I1);
        __m128i T1 = _mm_unpacklo_epi32(I2, I3);
        __m128i T2 = _mm_unpackhi_epi32(I0, I1);
        __m128i T3 = _mm_unpackhi_epi32(I2, I3);
        I0 = _mm_unpacklo_epi64(T0, T1);
        I1 = _mm_unpackhi_epi64(T0, T1);
        I2 = _mm_unpacklo_epi64(T2, T3);
        I3 = _mm_unpackhi_epi64(T2, T3);
        _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
        _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
        _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
        _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
    }
}

原理參考 Programming trivia: 4x4 integer matrix transpose in SSE2

sse_prefetch


























for (int x = 0; x < w; x += 4) {
    for (int y = 0; y < h; y += 4) {
#define PFDIST  8
        _mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1);
        _mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1);
        _mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1);
        _mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1);
        
        __m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
        __m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
        __m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
        __m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
        __m128i T0 = _mm_unpacklo_epi32(I0, I1);
        __m128i T1 = _mm_unpacklo_epi32(I2, I3);
        __m128i T2 = _mm_unpackhi_epi32(I0, I1);
        __m128i T3 = _mm_unpackhi_epi32(I2, I3);
        I0 = _mm_unpacklo_epi64(T0, T1);
        I1 = _mm_unpackhi_epi64(T0, T1);
        I2 = _mm_unpacklo_epi64(T2, T3);
        I3 = _mm_unpackhi_epi64(T2, T3);
        _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
        _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
        _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
        _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
    }
}

2017q1 Homework3 (software-pipelining)

作業要求

論文

實驗

原始版

naive

sse

sse_prefetch

tags: sysprog

tags: `sysprog`