# 2017q1 Homework3 (software-pipelining) contributed by <`Sean1127`> ## [作業要求](https://hackmd.io/s/rks62p1sl) ## 論文 ## 實驗 ### 原始版 ``` $ ./main sse prefetch: 68693 us sse: 132225 us naive: 265718 us ``` #### naive ```clike= for (int x = 0; x < w; x++) for (int y = 0; y < h; y++) *(dst + x * h + y) = *(src + y * w + x); ``` 最直覺的寫法 從第 1 列第 1 行向下執行,直到第 1 行完成再換下 1 列 因為是做矩陣轉置,所以不管先做行運算或列運算,都不能利用 space locality * src |0|1|2|3| |-|-|-|-| ||||| ||||| ||||| * dst |0|||| |-|-|-|-| |1|||| |2|||| |3|||| 但是如果真的把迴圈順序交換 ```clike= for (int y = 0; y < h; y++) for (int x = 0; x < w; x++) *(dst + x * h + y) = *(src + y * w + x); ``` 時間其實是有差的,目前還沒想到是為什麼 * 原始版 ``` $ perf stat -r 5 -e L1-dcache-load-misses,cache-misses,cache-references,instructions,cycles ./naive time: 343626 us time: 338534 us time: 354738 us time: 344098 us time: 358309 us Performance counter stats for './naive' (5 runs): 28,315,632 L1-dcache-load-misses 16,192,372 cache-misses 17,692,297 cache-references 1,523,833,604 instructions 1,345,559,038 cycles 0.546889550 seconds time elapsed ``` * 交換版 ``` time: 365820 us time: 389949 us time: 368741 us time: 365922 us time: 368898 us Performance counter stats for './naivee' (5 runs): 28,988,883 L1-dcache-load-misses 16,513,842 cache-misses 18,892,205 cache-references 1,524,141,580 instructions 1,506,438,915 cycles 0.569884695 seconds time elapsed ``` #### sse ```clike= for (int x = 0; x < w; x += 4) { for (int y = 0; y < h; y += 4) { __m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x)); __m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x)); __m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x)); __m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x)); __m128i T0 = _mm_unpacklo_epi32(I0, I1); __m128i T1 = _mm_unpacklo_epi32(I2, I3); __m128i T2 = _mm_unpackhi_epi32(I0, I1); __m128i T3 = _mm_unpackhi_epi32(I2, I3); I0 = _mm_unpacklo_epi64(T0, T1); I1 = _mm_unpackhi_epi64(T0, T1); I2 = _mm_unpacklo_epi64(T2, T3); I3 = _mm_unpackhi_epi64(T2, T3); _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0); _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1); _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2); _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3); } } ``` 原理參考 [Programming trivia: 4x4 integer matrix transpose in SSE2](https://www.randombit.net/bitbashing/2009/10/08/integer_matrix_transpose_in_sse2.html) #### sse_prefetch ```clike= for (int x = 0; x < w; x += 4) { for (int y = 0; y < h; y += 4) { #define PFDIST 8 _mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1); _mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1); __m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x)); __m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x)); __m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x)); __m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x)); __m128i T0 = _mm_unpacklo_epi32(I0, I1); __m128i T1 = _mm_unpacklo_epi32(I2, I3); __m128i T2 = _mm_unpackhi_epi32(I0, I1); __m128i T3 = _mm_unpackhi_epi32(I2, I3); I0 = _mm_unpacklo_epi64(T0, T1); I1 = _mm_unpackhi_epi64(T0, T1); I2 = _mm_unpacklo_epi64(T2, T3); I3 = _mm_unpackhi_epi64(T2, T3); _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0); _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1); _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2); _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3); } } ``` ###### tags: `sysprog`