sysprogram — contributed by st9007a
Linux 4.4.0-92-generic
Ubuntu 16.04.1 LTS
CPU family: 6
CPU model: 94
CPU model name: Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 6144K
void _mm_prefetch(char *ptr, int hint)
ptr:要 prefetch 的目標 address
hint:prefetch 的 hint,參照下圖

使用範例:

int vector[100] = get_origin(100);
for (int i = 0; i < 100; i++) {
    _mm_prefetch(&vector[i + 4], _MM_HINT_T0); // prefetch distance = 4
    printf("%d\n", vector[i]);
}
fprofile-generate
and fprofile-use
flags參照原文 Section 6
將專案編譯執行後結果如下:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
sse prefetch: 54226 us
sse: 135799 us
naive: 238329 us
上面兩個矩陣是用來驗證 sse_transpose
接著下面三個是使用三種不同方法來進行矩陣 transpose 時的執行時間,而這裡使用的矩陣大小為
再來看看這三個方法是怎麼實作的,首先是 naive_transpose()
/*
 * Scalar reference transpose: dst receives the transpose of src.
 * src is read row-major as h rows of w ints; element (row, col) of
 * src is written to position (col, row) of dst, one int at a time.
 */
void naive_transpose(int *src, int *dst, int w, int h)
{
    for (int col = 0; col < w; col++) {
        for (int row = 0; row < h; row++)
            dst[col * h + row] = src[row * w + col];
    }
}
naive_transpose()
將 src
的 (x, y) 位置的值直接 assign 到 dst
的 (y, x) 位置
然後是 sse_transpose()
/*
 * SSE2 transpose working on 4x4 tiles.  src is a row-major h x w int
 * matrix; the transposed result is written to dst (w x h).  Assumes
 * w and h are multiples of 4.  Each tile is transposed in registers
 * with the classic unpack network: a 32-bit interleave stage followed
 * by a 64-bit interleave stage.
 */
void sse_transpose(int *src, int *dst, int w, int h)
{
    for (int x = 0; x < w; x += 4) {
        for (int y = 0; y < h; y += 4) {
            /* Load four rows of the current 4x4 tile (unaligned loads). */
            __m128i row0 = _mm_loadu_si128((__m128i *)(src + (y + 0) * w + x));
            __m128i row1 = _mm_loadu_si128((__m128i *)(src + (y + 1) * w + x));
            __m128i row2 = _mm_loadu_si128((__m128i *)(src + (y + 2) * w + x));
            __m128i row3 = _mm_loadu_si128((__m128i *)(src + (y + 3) * w + x));

            /* Stage 1: interleave 32-bit lanes of row pairs. */
            __m128i lo01 = _mm_unpacklo_epi32(row0, row1);
            __m128i lo23 = _mm_unpacklo_epi32(row2, row3);
            __m128i hi01 = _mm_unpackhi_epi32(row0, row1);
            __m128i hi23 = _mm_unpackhi_epi32(row2, row3);

            /* Stage 2: interleave 64-bit halves — columns emerge as rows. */
            __m128i col0 = _mm_unpacklo_epi64(lo01, lo23);
            __m128i col1 = _mm_unpackhi_epi64(lo01, lo23);
            __m128i col2 = _mm_unpacklo_epi64(hi01, hi23);
            __m128i col3 = _mm_unpackhi_epi64(hi01, hi23);

            /* Store the transposed tile into the mirrored position of dst. */
            _mm_storeu_si128((__m128i *)(dst + (x + 0) * h + y), col0);
            _mm_storeu_si128((__m128i *)(dst + (x + 1) * h + y), col1);
            _mm_storeu_si128((__m128i *)(dst + (x + 2) * h + y), col2);
            _mm_storeu_si128((__m128i *)(dst + (x + 3) * h + y), col3);
        }
    }
}
根據 intel intrinsics guide,可以知道程式碼中每個 intrinsics function 的作用
__m128i _mm_loadu_si128(__m128i const *mem_addr)
:將 128-bit 的整數從 mem_addr 載入到 dst,dst 會作為回傳值回傳出來

__m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
:將 a 與 b 低位的 64-bit 資料切成 a[31:0] , a[63:32] , b[31:0] , b[63:32],然後填入 dst 回傳,dst 由高位到低位依序是 b[63:32] , a[63:32] , b[31:0] , a[31:0]

__m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
:行為與 _mm_unpacklo_epi32 類似,只是改成取 a 與 b 的高位

__m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
:行為與 _mm_unpacklo_epi32 類似,只是資料不會被分割,所以 dst 由高位到低位是 b[63:0], a[63:0]

__m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
:行為與 _mm_unpackhi_epi32 類似,dst 由高位到低位是 b[127:64], a[127:64]

void _mm_storeu_si128(__m128i *mem_addr, __m128i a)
:將 a 的值寫入 mem_addr 中

知道了每個 function 的功用後,sse_transpose 做了什麼還是很難想像,所以先帶入一個 4x4 矩陣實際走一遍
src_matrix = {
{ 1, 2, 3, 4},
{ 5, 6, 7, 8},
{ 9,10,11,12},
{13,14,15,16},
}
I0 = { 1, 2, 3, 4}
I1 = { 5, 6, 7, 8}
I2 = { 9,10,11,12}
I3 = {13,14,15,16}
T0 = { 1, 5, 2, 6}
T1 = { 9,13,10,14}
T2 = { 3, 7, 4, 8}
T3 = {11,15,12,16}
I0 = { 1, 5, 9,13}
I1 = { 2, 6,10,14}
I2 = { 3, 7,11,15}
I3 = { 4, 8,12,16}
dst_matrix = {
{ 1, 5, 9,13},
{ 2, 6,10,14},
{ 3, 7,11,15},
{ 4, 8,12,16}
}
將每一行執行結果列出來後就可以看到,透過 unpack 的方式,矩陣被轉置了
最後是 sse_prefetch_transpose()
/*
 * Same 4x4-tile SSE2 transpose as sse_transpose(), with software
 * prefetch added: before loading the current tile, the four rows that
 * are PFDIST tile-rows ahead are prefetched with hint T1 (into L2 and
 * the levels above it), hiding load latency of future iterations.
 *
 * NOTE(review): for the last PFDIST values of y the prefetch addresses
 * lie past the end of src.  A prefetch of an inaccessible address does
 * not fault, but the out-of-range arithmetic is worth confirming.
 * NOTE(review): _mm_prefetch takes a const char * — passing int *
 * here relies on the compiler accepting the implicit conversion.
 */
void sse_prefetch_transpose(int *src, int *dst, int w, int h)
{
    for (int x = 0; x < w; x += 4) {
        for (int y = 0; y < h; y += 4) {
#define PFDIST 8  /* prefetch distance, in rows ahead of the current tile */
            _mm_prefetch(src+(y + PFDIST + 0) *w + x, _MM_HINT_T1);
            _mm_prefetch(src+(y + PFDIST + 1) *w + x, _MM_HINT_T1);
            _mm_prefetch(src+(y + PFDIST + 2) *w + x, _MM_HINT_T1);
            _mm_prefetch(src+(y + PFDIST + 3) *w + x, _MM_HINT_T1);
            /* Load four rows of the current 4x4 tile. */
            __m128i I0 = _mm_loadu_si128 ((__m128i *)(src + (y + 0) * w + x));
            __m128i I1 = _mm_loadu_si128 ((__m128i *)(src + (y + 1) * w + x));
            __m128i I2 = _mm_loadu_si128 ((__m128i *)(src + (y + 2) * w + x));
            __m128i I3 = _mm_loadu_si128 ((__m128i *)(src + (y + 3) * w + x));
            /* Stage 1: 32-bit interleave of row pairs. */
            __m128i T0 = _mm_unpacklo_epi32(I0, I1);
            __m128i T1 = _mm_unpacklo_epi32(I2, I3);
            __m128i T2 = _mm_unpackhi_epi32(I0, I1);
            __m128i T3 = _mm_unpackhi_epi32(I2, I3);
            /* Stage 2: 64-bit interleave completes the 4x4 transpose. */
            I0 = _mm_unpacklo_epi64(T0, T1);
            I1 = _mm_unpackhi_epi64(T0, T1);
            I2 = _mm_unpacklo_epi64(T2, T3);
            I3 = _mm_unpackhi_epi64(T2, T3);
            /* Store the transposed tile into dst. */
            _mm_storeu_si128((__m128i *)(dst + ((x + 0) * h) + y), I0);
            _mm_storeu_si128((__m128i *)(dst + ((x + 1) * h) + y), I1);
            _mm_storeu_si128((__m128i *)(dst + ((x + 2) * h) + y), I2);
            _mm_storeu_si128((__m128i *)(dst + ((x + 3) * h) + y), I3);
        }
    }
}
sse_prefetch_transpose()
除了 prefetch 的部分,其他部分跟 sse_transpose()
是一樣的,而 prefetch 的部分則是 prefetch 了 往後數 PFDIST + 0 ~ 3 個 row 的資料並且放置到 L2 以及更高階的 cache 上
為了能個別測試效能,所以先設計一個通用的介面 transpose.h
void transpose(int *src, int *dst, int w, int h);
接著用不同檔案來個別實作不同方法:naive_transpose.c
, sse_transpose.c
, sse_prefetch_transpose.c
完成後,修改一下 main.c
跟 Makefile
讓其能自動產生對應的執行檔,接著用 perf stat
來看看各自的 cache miss
$ perf stat --repeat 100 -e cache-misses,cache-references,instructions,cycles ./naive_transpose
Performance counter stats for './navie_transpose' (100 runs):
4108,3426 cache-misses # 86.021 % of all cache refs ( +- 0.31% )
4775,9788 cache-references ( +- 0.28% )
14,4866,1674 instructions # 1.48 insns per cycle ( +- 0.00% )
9,7703,3562 cycles ( +- 0.07% )
0.343869644 seconds time elapsed ( +- 0.07% )
$ perf stat --repeat 100 -e cache-misses,cache-references,instructions,cycles ./sse_transpose
Performance counter stats for './sse_transpose' (100 runs):
1455,1286 cache-misses # 75.665 % of all cache refs ( +- 0.06% )
1923,1186 cache-references ( +- 0.07% )
12,3676,0551 instructions # 1.72 insns per cycle ( +- 0.00% )
7,1766,9516 cycles ( +- 0.07% )
0.264611085 seconds time elapsed ( +- 0.17% )
$ perf stat --repeat 100 -e cache-misses,cache-references,instructions,cycles ./sse_prefetch_transpose
Performance counter stats for './sse_prefetch_transpose' (100 runs):
823,1685 cache-misses # 64.939 % of all cache refs ( +- 0.18% )
1267,5942 cache-references ( +- 0.12% )
12,8279,3332 instructions # 2.30 insns per cycle ( +- 0.00% )
5,5755,6001 cycles ( +- 0.01% )
0.156492681 seconds time elapsed ( +- 0.05% )
可以看到 prefetch 讓 cache miss 下降了,接著搭配 raw counter 使用 perf stat
raw counter 參照 Intel® 64 and IA-32 Architectures Developer’s Manual: Vol. 3B 的第 19-2 章:Performance Monitoring Events For 6th Generation Intel®Core™ Processor And 7th Generation Intel®Core™ Processor
找了一下,決定觀察這三個 event,原因在於上面 prefetch hint 為 T1,T1 會將 prefetched data 放置於 L2 以及更高階的 cache,所以理論上可以在 L2 cache 上觀察到明顯的數據變化,第三個 event 是拿來算百分比用的
$ perf stat --repeat 100 -e r2124,r4124,rE124 ./navie_transpose
Performance counter stats for './navie_transpose' (100 runs):
1678,1605 r2124 ( +- 0.00% )
4,6208 r4124 ( +- 1.15% )
1683,4637 rE124 ( +- 0.00% )
0.343165430 seconds time elapsed ( +- 0.08% )
$ perf stat --repeat 100 -e r2124,r4124,rE124 ./sse_transpose
Performance counter stats for './sse_transpose' (100 runs):
421,6532 r2124 ( +- 0.00% )
2,3383 r4124 ( +- 0.39% )
424,9238 rE124 ( +- 0.01% )
0.264649831 seconds time elapsed ( +- 0.15% )
$ perf stat --repeat 100 -e r2124,r4124,rE124 ./sse_prefetch_transpose
Performance counter stats for './sse_prefetch_transpose' (100 runs):
3,3902 r2124 ( +- 0.30% )
332,5174 r4124 ( +- 0.29% )
336,6697 rE124 ( +- 0.29% )
0.156584539 seconds time elapsed ( +- 0.05% )
naive | sse | sse prefetch | |
---|---|---|---|
L2_RQSTS.DEMAND_DATA_RD_MISS | 1678,1605 (99.684%) |
421,6532 (99.699%) |
3,3902 (1.007%) |
L2_RQSTS.DEMAND_DATA_RD_HIT | 4,6208 (0.274%) |
2,3383 (0.553%) |
332,5174 (98.767%) |
L2_RQSTS.ALL_DEMAND_DATA_RD | 1683,4637 | 424,9238 | 336,6697 |
從上面的表格可以看到,naive 到 sse 的 L2 cache miss/hit rate 沒有顯著變化,但是 sse prefetch 很明顯的使 L2 hit rate 大幅提升,驗證了 prefetch hint = T1 的狀況 (註:這裡的 miss/hit rate 僅計算 demand data read request)
AVX transpose 參考了 stackoverflow – Transpose an 8x8 float using AVX/AVX2 的解答,由於 AVX 的暫存器是 256-bit,一次可以讀取 8 個 32-bit 整數,所以 for loop 的 step 改成 8
/*
 * AVX 8x8-tile transpose (inner loop of avx_transpose).  Each tile is
 * transposed in registers in three stages: 32-bit unpack, 4-lane
 * shuffle in the float domain, then 128-bit lane permute.
 *
 * Fixes vs. the original:
 *  - the original stored only T0..T4, leaving output rows x+5..x+7 of
 *    every tile unwritten; all eight stores are now present;
 *  - _mm256_shuffle_ps takes __m256 operands, so the __m256i values
 *    are moved through _mm256_castsi256_ps/_mm256_castps_si256 (no-op
 *    casts) instead of relying on implicit vector conversion, which
 *    clang rejects.
 */
for (int x = 0; x < w; x += 8) {
    for (int y = 0; y < h; y += 8) {
        /* Load eight rows of the current 8x8 tile. */
        __m256i I0 = _mm256_loadu_si256((__m256i *)(src + (y + 0) * w + x));
        __m256i I1 = _mm256_loadu_si256((__m256i *)(src + (y + 1) * w + x));
        __m256i I2 = _mm256_loadu_si256((__m256i *)(src + (y + 2) * w + x));
        __m256i I3 = _mm256_loadu_si256((__m256i *)(src + (y + 3) * w + x));
        __m256i I4 = _mm256_loadu_si256((__m256i *)(src + (y + 4) * w + x));
        __m256i I5 = _mm256_loadu_si256((__m256i *)(src + (y + 5) * w + x));
        __m256i I6 = _mm256_loadu_si256((__m256i *)(src + (y + 6) * w + x));
        __m256i I7 = _mm256_loadu_si256((__m256i *)(src + (y + 7) * w + x));
        /* Stage 1: 32-bit interleave within each 128-bit lane. */
        __m256i T0 = _mm256_unpacklo_epi32(I0, I1);
        __m256i T1 = _mm256_unpackhi_epi32(I0, I1);
        __m256i T2 = _mm256_unpacklo_epi32(I2, I3);
        __m256i T3 = _mm256_unpackhi_epi32(I2, I3);
        __m256i T4 = _mm256_unpacklo_epi32(I4, I5);
        __m256i T5 = _mm256_unpackhi_epi32(I4, I5);
        __m256i T6 = _mm256_unpacklo_epi32(I6, I7);
        __m256i T7 = _mm256_unpackhi_epi32(I6, I7);
        /* Stage 2: gather 64-bit pairs across register pairs (float-
         * domain shuffle; the casts are bitwise no-ops). */
        I0 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T0), _mm256_castsi256_ps(T2),
                 _MM_SHUFFLE(1, 0, 1, 0)));
        I1 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T0), _mm256_castsi256_ps(T2),
                 _MM_SHUFFLE(3, 2, 3, 2)));
        I2 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T1), _mm256_castsi256_ps(T3),
                 _MM_SHUFFLE(1, 0, 1, 0)));
        I3 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T1), _mm256_castsi256_ps(T3),
                 _MM_SHUFFLE(3, 2, 3, 2)));
        I4 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T4), _mm256_castsi256_ps(T6),
                 _MM_SHUFFLE(1, 0, 1, 0)));
        I5 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T4), _mm256_castsi256_ps(T6),
                 _MM_SHUFFLE(3, 2, 3, 2)));
        I6 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T5), _mm256_castsi256_ps(T7),
                 _MM_SHUFFLE(1, 0, 1, 0)));
        I7 = _mm256_castps_si256(_mm256_shuffle_ps(
                 _mm256_castsi256_ps(T5), _mm256_castsi256_ps(T7),
                 _MM_SHUFFLE(3, 2, 3, 2)));
        /* Stage 3: swap 128-bit lanes between register halves. */
        T0 = _mm256_permute2f128_si256(I0, I4, 0x20);
        T1 = _mm256_permute2f128_si256(I1, I5, 0x20);
        T2 = _mm256_permute2f128_si256(I2, I6, 0x20);
        T3 = _mm256_permute2f128_si256(I3, I7, 0x20);
        T4 = _mm256_permute2f128_si256(I0, I4, 0x31);
        T5 = _mm256_permute2f128_si256(I1, I5, 0x31);
        T6 = _mm256_permute2f128_si256(I2, I6, 0x31);
        T7 = _mm256_permute2f128_si256(I3, I7, 0x31);
        /* Store all eight output rows (the original code dropped
         * T5..T7, corrupting 3 of every 8 rows of dst). */
        _mm256_storeu_si256((__m256i *)(dst + ((x + 0) * h) + y), T0);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 1) * h) + y), T1);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 2) * h) + y), T2);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 3) * h) + y), T3);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 4) * h) + y), T4);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 5) * h) + y), T5);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 6) * h) + y), T6);
        _mm256_storeu_si256((__m256i *)(dst + ((x + 7) * h) + y), T7);
    }
}
附上執行結果比較,avx_prefetch_transpose
的 prefetch distance 這裡事先設定為 8
exec naive_transpose
execution time: 225405 us
exec sse_transpose
execution time: 151882 us
exec sse_prefetch_transpose
execution time: 43714 us
exec avx_transpose
execution time: 56837 us
exec avx_prefetch_transpose
execution time: 43255 us
可以看到 avx_transpose
的執行速度比 sse_transpose
還快,執行速度的部分可以參照 Performance of SSE and AVX Instruction Sets 的 Data Reuse 的部分,avx_transpose
,不斷重複使用資料(I0 ~ I7 , T0 ~ T7),而 avx 在 Data Reuse 上的效能是勝過 sse 的
接著嘗試使用不同的 prefetch distance 來比較 avx_prefetch_transpose
的效能
prefetch distance | 8 | 16 | 24 | 32 |
---|---|---|---|---|
Average | 42850.82 | 43546.13 | 45357.07 | 47583.64 |
Standard Deviation | 298.008 | 320.335 | 606.971 | 815.092 |
Confidence Interval (95%) | 42254.804 - 43446.836 | 42905.46 - 44186.8 | 44143.128 - 46571.012 | 45953.456 - 49213.824 |
表格為不同 prefetch distance 各跑 100 次後其執行時間的統計結果,從資料分布來看 prefetch distance = 8 的效能最好,這裡就不考慮更大的 prefetch distance,根據論文的敘述,prefetch distance 太大會導致 cache miss 機率提升
再來重新比較 L2 cache miss/hit
avx | avx prefetch | sse | sse prefetch | naive | |
---|---|---|---|---|---|
DATA_RD_MISS | 307,4598 (97.822%) |
89,6375 (32.186%) |
421,5791 (99.239%) |
3,3613 (1.009%) |
1678,0959 (99.686%) |
DATA_RD_HIT | 3,2268 (1.027%) |
185,2637 (66.522%) |
2,3144 (0.545%) |
329,1381 (98.761%) |
4,6184 (0.274%) |
DEMAND_DATA_RD | 314,3045 | 278,4986 | 424,8124 | 333,2668 | 1683,3807 |
可以看出 prefetch 有確實提升 L2 cache hit rate,但是提升幅度沒有 sse prefetch 那麼大
最後是比較 sse_prefetch_transpose
與 avx_prefetch_transpose
執行速度,其他方法的執行速度差距明顯,所以就不特別比較
avx_prefetch_transpose | sse_prefetch_transpose | |
---|---|---|
Average | 42750.75 | 43661.29 |
Standard Deviation | 328.847 | 128.357 |
Confidence Interval (95%) | 42093.056 - 43408.444 | 43404.576 - 43918.004 |
比較執行 100 次的結果,avx_prefetch_transpose
的執行速度是比較快的,但是卻沒有贏太多,這裡可以同樣參照 Performance of SSE and AVX Instruction Sets 中 Asynchronous Data Transfer 的執行結果比較,avx 跟 sse 在有 prefetch 的狀況下,效能是相近的
為了進一步驗證 sse 與 avx 之間的效能比較,所以針對長度從 1024 到 8192 的矩陣做不同方法的效能比較,矩陣長度以 1024 為單位做遞增,其結果如下
第一張圖可以很直覺看出各個方法的效能差異,sse 與 avx 的效能在不同矩陣大小下也保持著前述的結論:
而第二張圖的 y 軸使用 logscale,主要原因是第一張圖會給人成長倍率為:naive > sse > avx > sse prefetch = avx prefetch 的錯覺,但是 y 軸用成 logscale 後就可以看出其成長倍率是相近的
intel intrinsics guide
Intel® 64 and IA-32 Architectures Developer’s Manual: Vol. 3B
software-pipling 共筆
stackoverflow – Transpose an 8x8 float using AVX/AVX2
_MM_SHUFFLE question
Performance of SSE and AVX Instruction Sets