Try   HackMD

SSE 矩陣運算原理

contributed by<BodomMoon>

指令詳解

SSE 是一套 SIMD 的擴充指令集,請先了解 SIMD 是啥再回來閱讀本文。

首先必須要了解以下 6 個函數的行為

r[0] = a[0] ; r[1] = b[0];
r[2] = a[1] ; r[3] = b[1];
r[0] = a[2] ; r[1] = b[2];
r[2] = a[3] ; r[3] = b[3];
r[0] = a[0]; r[1] = a[1]; 
r[2] = b[0]; r[3] = b[1];
r[0] = a[2]; r[1] = a[3]; 
r[2] = b[2]; r[3] = b[3];

而今天我們轉置一個矩陣時使用以下程式碼

void sse_transpose(int *src, int *dst, int w, int h)
{
    for (int x = 0; x < w; x += 4) {
        for (int y = 0; y < h; y += 4) {
            //printf("%d\n", *(src + (y + 0) * w + x));
            __m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
            __m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
            __m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
            __m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
            __m128i T0 = _mm_unpacklo_epi32(I0, I1);
            __m128i T1 = _mm_unpacklo_epi32(I2, I3);
            __m128i T2 = _mm_unpackhi_epi32(I0, I1);
            __m128i T3 = _mm_unpackhi_epi32(I2, I3);
            I0 = _mm_unpacklo_epi64(T0, T1);
            I1 = _mm_unpackhi_epi64(T0, T1);
            I2 = _mm_unpacklo_epi64(T2, T3);
            I3 = _mm_unpackhi_epi64(T2, T3);
            _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
            _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
            _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
            _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
        }
    }
}

雖然乍看之下覺得似乎很不太容易理解,但其實只要把步驟如下拆解就很容易解讀。以一個4*4的矩陣為例

 1  2  3  4
 5  6  7  8
 9 10 11 12
13 14 15 16
  • 第一步 讀取陣列到 __m128i 的資料型態中
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
//讀取結果
I0= 1  2  3  4
I1= 5  6  7  8
I2= 9 10 11 12
I3=13 14 15 16
  • 第二步 以 32bit 為單位交錯放置 __m128i 變數內的數值
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
//執行完結果
T0 = 1  5  2  6
T1 = 9 13 10 14
T2 = 3  7  4  8
T3 =11 15 12 16
  • 第三步 以 64bit 為單位再交錯放置一次 __m128i 變數內的數值完成轉置
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
//執行完結果
I0 = 1  5  9 13
I1 = 2  6 10 14
I2 = 3  7 11 15
I3 = 4  8 12 16
  • 第四步 將 __m128i 變數內轉置完成的結果存回目的陣列
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
//儲存轉置結果

並且 4 * 4 是這個算法處理的基礎單位,若是 8 * 8 的矩陣則延伸成以下形式

此為 column order 的實作方式。

後面實驗部份請至 2018 software-pipelining - SSE 矩陣實驗及 prefetch 論文實證 部份接續閱讀