Try   HackMD

2017q1 Homework3 (software-pipelining)

contributed by <Sean1127>

作業要求

論文

實驗

原始版

$ ./main
sse prefetch: 	 68693 us
sse: 		 132225 us
naive: 		 265718 us

naive

for (int x = 0; x < w; x++) for (int y = 0; y < h; y++) *(dst + x * h + y) = *(src + y * w + x);

最直覺的寫法
從第 1 列第 1 行向下執行,直到第 1 行完成再換下 1 列
因為是做矩陣轉置,所以不管先做行運算或列運算,都不能利用 space locality

  • src
    0 1 2 3
  • dst
    0
    1
    2
    3

但是如果真的把迴圈順序交換

for (int y = 0; y < h; y++) for (int x = 0; x < w; x++) *(dst + x * h + y) = *(src + y * w + x);

時間其實是有差的,目前還沒想到是為什麼

  • 原始版
    ​​$ perf stat -r 5 -e L1-dcache-load-misses,cache-misses,cache-references,instructions,cycles ./naive
    ​​time: 	 343626 us
    ​​time: 	 338534 us
    ​​time: 	 354738 us
    ​​time: 	 344098 us
    ​​time: 	 358309 us
    ​​
    ​​Performance counter stats for './naive' (5 runs):
    ​​28,315,632      L1-dcache-load-misses
    ​​16,192,372      cache-misses
    ​​17,692,297      cache-references
    ​​1,523,833,604      instructions
    ​​1,345,559,038      cycles
    ​​0.546889550 seconds time elapsed
    
  • 交換版
    ​​time: 	 365820 us
    ​​time: 	 389949 us
    ​​time: 	 368741 us
    ​​time: 	 365922 us
    ​​time: 	 368898 us
    ​​
    ​​Performance counter stats for './naivee' (5 runs):
    ​​
    ​​28,988,883      L1-dcache-load-misses
    ​​16,513,842      cache-misses
    ​​18,892,205      cache-references
    ​​1,524,141,580      instructions
    ​​1,506,438,915      cycles
    ​​
    ​​0.569884695 seconds time elapsed
    

sse

for (int x = 0; x < w; x += 4) { for (int y = 0; y < h; y += 4) { __m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x)); __m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x)); __m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x)); __m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x)); __m128i T0 = _mm_unpacklo_epi32(I0, I1); __m128i T1 = _mm_unpacklo_epi32(I2, I3); __m128i T2 = _mm_unpackhi_epi32(I0, I1); __m128i T3 = _mm_unpackhi_epi32(I2, I3); I0 = _mm_unpacklo_epi64(T0, T1); I1 = _mm_unpackhi_epi64(T0, T1); I2 = _mm_unpacklo_epi64(T2, T3); I3 = _mm_unpackhi_epi64(T2, T3); _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0); _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1); _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2); _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3); } }

原理參考 Programming trivia: 4x4 integer matrix transpose in SSE2

sse_prefetch

for (int x = 0; x < w; x += 4) { for (int y = 0; y < h; y += 4) { #define PFDIST 8 _mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1); __m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x)); __m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x)); __m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x)); __m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x)); __m128i T0 = _mm_unpacklo_epi32(I0, I1); __m128i T1 = _mm_unpacklo_epi32(I2, I3); __m128i T2 = _mm_unpackhi_epi32(I0, I1); __m128i T3 = _mm_unpackhi_epi32(I2, I3); I0 = _mm_unpacklo_epi64(T0, T1); I1 = _mm_unpackhi_epi64(T0, T1); I2 = _mm_unpacklo_epi64(T2, T3); I3 = _mm_unpackhi_epi64(T2, T3); _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0); _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1); _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2); _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3); } }
tags: sysprog