# 2017q1 Homework3 (software-pipelining)
contributed by <`Sean1127`>
## [作業要求](https://hackmd.io/s/rks62p1sl)
## 論文
## 實驗
### 原始版
```
$ ./main
sse prefetch: 68693 us
sse: 132225 us
naive: 265718 us
```
#### naive
```clike=
for (int x = 0; x < w; x++)
for (int y = 0; y < h; y++)
*(dst + x * h + y) = *(src + y * w + x);
```
最直覺的寫法
從第 1 列第 1 行向下執行,直到第 1 行完成再換下 1 列
因為是做矩陣轉置,所以不管先做行運算或列運算,都不能利用 space locality
* src
|0|1|2|3|
|-|-|-|-|
|||||
|||||
|||||
* dst
|0||||
|-|-|-|-|
|1||||
|2||||
|3||||
但是如果真的把迴圈順序交換
```clike=
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
*(dst + x * h + y) = *(src + y * w + x);
```
時間其實是有差的,目前還沒想到是為什麼
* 原始版
```
$ perf stat -r 5 -e L1-dcache-load-misses,cache-misses,cache-references,instructions,cycles ./naive
time: 343626 us
time: 338534 us
time: 354738 us
time: 344098 us
time: 358309 us
Performance counter stats for './naive' (5 runs):
28,315,632 L1-dcache-load-misses
16,192,372 cache-misses
17,692,297 cache-references
1,523,833,604 instructions
1,345,559,038 cycles
0.546889550 seconds time elapsed
```
* 交換版
```
time: 365820 us
time: 389949 us
time: 368741 us
time: 365922 us
time: 368898 us
Performance counter stats for './naivee' (5 runs):
28,988,883 L1-dcache-load-misses
16,513,842 cache-misses
18,892,205 cache-references
1,524,141,580 instructions
1,506,438,915 cycles
0.569884695 seconds time elapsed
```
#### sse
```clike=
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
__m128i I0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
```
原理參考 [Programming trivia: 4x4 integer matrix transpose in SSE2](https://www.randombit.net/bitbashing/2009/10/08/integer_matrix_transpose_in_sse2.html)
#### sse_prefetch
```clike=
for (int x = 0; x < w; x += 4) {
for (int y = 0; y < h; y += 4) {
#define PFDIST 8
_mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1);
_mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1);
__m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
__m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
__m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
__m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);
_mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
_mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
_mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
_mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
}
}
```
###### tags: `sysprog`