contributed by<BodomMoon
>
SSE 是一套 SIMD 的擴充指令集,請先了解 SIMD 是啥再回來閱讀本文。
首先必須要了解以下 6 個函數的行為
__m128i : 一種 128 bits 的資料型態,可對應 SSE 特有的 128 bits register 操作。
__m128i _mm_loadu_si128 (__m128i *p):透過指標讀取一個 128 bit 的值。
__m128i I0 = _mm_loadu_si128((__m128i *)(array));
void _mm_storeu_si128 (__m128i *p, __m128i a):透過指標將一個 128 bit 的值存入目標記憶體地址
_mm_storeu_si128((__m128i *)(array), I0);
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b):將兩個 m128i 變數的前兩筆 32 bits 資料交錯擺放。
假設我們將一個 m128i 的資料型態視為 int[4] 的話示意如下
r[0] = a[0] ; r[1] = b[0];
r[2] = a[1] ; r[3] = b[1];
r[0] = a[2] ; r[1] = b[2];
r[2] = a[3] ; r[3] = b[3];
r[0] = a[0]; r[1] = a[1];
r[2] = b[0]; r[3] = b[1];
r[0] = a[2]; r[1] = a[3];
r[2] = b[2]; r[3] = b[3];
而今天我們轉置一個矩陣時使用以下程式碼
void sse_transpose(int *src, int *dst, int w, int h)
{
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
//printf("%d\n", *(src + (y + 0) * w + x));
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
}
雖然乍看之下覺得似乎很不太容易理解,但其實只要把步驟如下拆解就很容易解讀。以一個4*4的矩陣為例
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
//讀取結果
I0= 1 2 3 4
I1= 5 6 7 8
I2= 9 10 11 12
I3=13 14 15 16
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
//執行完結果
T0 = 1 5 2 6
T1 = 9 13 10 14
T2 = 3 7 4 8
T3 =11 15 12 16
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
//執行完結果
I0 = 1 5 9 13
I1 = 2 6 10 14
I2 = 3 7 11 15
I3 = 4 8 12 16
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
//儲存轉置結果
並且 4 * 4 是這個算法處理的基礎單位,若是 8 * 8 的矩陣則延伸成以下形式
此為 column order 的實作方式。
後面實驗部份請至 2018 software-pipelining - SSE 矩陣實驗及 prefetch 論文實證 部份接續閱讀