contributed by <Sean1127
>
$ ./main
sse prefetch: 68693 us
sse: 132225 us
naive: 265718 us
for (int x = 0; x < w; x++)
for (int y = 0; y < h; y++)
*(dst + x * h + y) = *(src + y * w + x);
最直覺的寫法
從第 1 列第 1 行向下執行,直到第 1 行完成再換下 1 列
因為是做矩陣轉置,所以不管先做行運算或列運算,都不能利用 space locality
0 | 1 | 2 | 3 |
---|---|---|---|
0 | |||
---|---|---|---|
1 | |||
2 | |||
3 |
但是如果真的把迴圈順序交換
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
*(dst + x * h + y) = *(src + y * w + x);
時間其實是有差的,目前還沒想到是為什麼
$ perf stat -r 5 -e L1-dcache-load-misses,cache-misses,cache-references,instructions,cycles ./naive
time: 343626 us
time: 338534 us
time: 354738 us
time: 344098 us
time: 358309 us
Performance counter stats for './naive' (5 runs):
28,315,632 L1-dcache-load-misses
16,192,372 cache-misses
17,692,297 cache-references
1,523,833,604 instructions
1,345,559,038 cycles
0.546889550 seconds time elapsed
time: 365820 us
time: 389949 us
time: 368741 us
time: 365922 us
time: 368898 us
Performance counter stats for './naivee' (5 runs):
28,988,883 L1-dcache-load-misses
16,513,842 cache-misses
18,892,205 cache-references
1,524,141,580 instructions
1,506,438,915 cycles
0.569884695 seconds time elapsed
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
原理參考 Programming trivia: 4x4 integer matrix transpose in SSE2
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
#define PFDIST 8
_mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1);
__m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
sysprog