contributed by <henry0929016816
>
void naive_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x++)
for (int y = 0; y < h; y++)
*(dst + x * h + y) = *(src + y * w + x);
}
cache 通常都是抓取連續記憶體位址到 cahce 裡,而上面的程式可以看出 src 的地址為不連續的存取,所以會造成大量的 cache misses
使用 perf stat 觀察 cache misses
Performance counter stats for './naive_transpose' (100 runs):
17,306,551 cache-misses # 95.227 % of all cache refs
18,174,026 cache-references
1,448,802,557 instructions # 1.25 insn per cycle
1,155,692,803 cycles
0.528174199 seconds time elapsed
發現到有高達 95.227 % 的 cache misses
naive: 276748 us
void sse_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
}
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
所以如果傳入 I0 和 I1