## CPU Performance Tuning and Algorithm Design
> In LAB 4, htop showed a load imbalance problem. How do we fix it?
```
Enter the Fibonacci term n (suggested 30-40): 40
Enter the task cutoff threshold (suggested 20; 0 = fully parallel): 0
[1] Running pure serial computation...
Serial result: 102334155 | time: 0.146667 s
[2] Running OpenMP task computation (Cutoff=0)...
Parallel result: 102334155 | time: 215.439894 s
```
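The run above makes the failure mode concrete: with a cutoff of 0, every recursive call becomes an OpenMP task, and task-creation overhead dwarfs the actual arithmetic (215 s vs. 0.15 s serial). Below is a minimal sketch of the cutoff pattern; the function names and structure are my assumptions for illustration, not the exact LAB source. Below the cutoff depth the recursion falls back to plain serial calls, keeping task granularity coarse.
```c
// fib_task_cutoff.c -- hedged sketch of the task-cutoff pattern (not the LAB source)
#include <omp.h>
#include <stdio.h>

long fib_serial(int n) {
    return n < 2 ? n : fib_serial(n - 1) + fib_serial(n - 2);
}

long fib_task(int n, int cutoff) {
    if (n < cutoff) return fib_serial(n);  // below the cutoff: no task overhead
    long x, y;
    #pragma omp task shared(x)
    x = fib_task(n - 1, cutoff);
    #pragma omp task shared(y)
    y = fib_task(n - 2, cutoff);
    #pragma omp taskwait                    // wait for both child tasks
    return x + y;
}

int main(void) {
    int n = 40, cutoff = 20;
    long result;
    double start = omp_get_wtime();
    #pragma omp parallel
    #pragma omp single                      // one thread spawns the task tree
    result = fib_task(n, cutoff);
    printf("fib(%d) = %ld | %.4f s\n", n, result, omp_get_wtime() - start);
    return 0;
}
```
With a cutoff around 20 the task tree stays shallow enough that scheduling cost is amortized over real work, which is exactly what the cutoff prompt in the output is probing.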
## Benchmark: Matrix Multiplication (Naive MatMul)
```bash
# perf is blocked on WSL2, so cachegrind it is; avoid very large N
sudo apt update && sudo apt install valgrind -y
valgrind --tool=cachegrind ./matmul_naive
# Check the thread count
export OMP_NUM_THREADS=16 # (default)
echo $OMP_NUM_THREADS
# Compile
gcc -fopenmp filename.c -o filename
```
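To confirm the thread count the runtime actually sees (not just the shell variable), a quick check can be compiled the same way. A minimal sketch using only standard OpenMP calls:
```c
// omp_check.c -- confirm how many threads the OpenMP runtime will use
#include <omp.h>
#include <stdio.h>

int main(void) {
    printf("max threads: %d\n", omp_get_max_threads());
    #pragma omp parallel
    {
        #pragma omp single   // print once, from inside a real parallel region
        printf("threads in region: %d\n", omp_get_num_threads());
    }
    return 0;
}
```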
### Code: 2D matrix
```c
// matmul_naive_2D.c
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1000

// Allocate a non-contiguous array of row pointers
double** alloc_2d(int n) {
    double **mat = (double **)malloc(n * sizeof(double *));
    for (int i = 0; i < n; i++) {
        mat[i] = (double *)malloc(n * sizeof(double));
    }
    return mat;
}

void free_2d(double **mat, int n) {
    for (int i = 0; i < n; i++) free(mat[i]);
    free(mat);
}

void init_2d(double **mat, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            mat[i][j] = (double)rand() / RAND_MAX;
        }
    }
}

int main() {
    // 1. Allocate scattered memory (array of pointers)
    double **A = alloc_2d(N);
    double **B = alloc_2d(N);
    double **C = alloc_2d(N);
    // C must be zeroed manually
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) C[i][j] = 0.0;
    }
    srand(time(NULL));
    init_2d(A, N);
    init_2d(B, N);
    printf("--- 2D pointer-array version (N=%d) ---\n", N);
    double start = omp_get_wtime();
    // 2. Core computation
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            double sum = 0;
            for (int k = 0; k < N; k++) {
                // Implicit double address lookup: first fetch the row pointer
                // A[i] from memory, then load element [k] from that row
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
    double end = omp_get_wtime();
    printf("2D version time: %.4f s\n\n", end - start);
    free_2d(A, N); free_2d(B, N); free_2d(C, N);
    return 0;
}
```
#### Return
```
2D version time: 78.9607 s
==71421==
==71421== I refs: 30,200,266,781
==71421== I1 misses: 1,855
==71421== LLi misses: 1,827
==71421== I1 miss rate: 0.00%
==71421== LLi miss rate: 0.00%
==71421==
==71421== D refs: 14,086,594,242 (13,066,393,883 rd + 1,020,200,359 wr)
==71421== D1 misses: 972,517,644 ( 971,137,411 rd + 1,380,233 wr)
==71421== LLd misses: 585,570 ( 82,572 rd + 502,998 wr)
==71421== D1 miss rate: 6.9% ( 7.4% + 0.1% )
==71421== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==71421==
==71421== LL refs: 972,519,499 ( 971,139,266 rd + 1,380,233 wr)
==71421== LL misses: 587,397 ( 84,399 rd + 502,998 wr)
==71421== LL miss rate: 0.0% ( 0.0% + 0.0% )
```
### Code: 2D -> 1D matrix
```c
// matmul_naive_1D.c
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1024 // matrix dimension

// Initialize a matrix with random values
void init_matrix(double *mat, int size) {
    for (int i = 0; i < size * size; i++) {
        mat[i] = (double)rand() / RAND_MAX;
    }
}

int main() {
    // Allocate memory (a 1D array emulating N*N)
    double *A = (double *)malloc(N * N * sizeof(double));
    double *B = (double *)malloc(N * N * sizeof(double));
    double *C = (double *)calloc(N * N, sizeof(double)); // zero-initialized
    if (!A || !B || !C) {
        printf("Memory allocation failed!\n");
        return 1;
    }
    srand(time(NULL));
    init_matrix(A, N);
    init_matrix(B, N);
    printf("Starting MatMul (N=%d, Threads=%d)...\n", N, omp_get_max_threads());
    double start = omp_get_wtime();
    // --- Naive MatMul core ---
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            double sum = 0;
            for (int k = 0; k < N; k++) {
                // A[i][k] -> A[i*N + k] (row direction: sequential walk)
                // B[k][j] -> B[k*N + j] (column direction: jumps N elements per step)
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
    double end = omp_get_wtime();
    printf("Done. Time: %.4f s\n", end - start);
    // Quick check (keeps the compiler from optimizing the loop away)
    printf("Sample result C[0]: %f\n", C[0]);
    free(A); free(B); free(C);
    return 0;
}
```
#### Return
```
1D version time: 65.4077 s
==71886==
==71886== I refs: 26,173,962,133
==71886== I1 misses: 1,837
==71886== LLi misses: 1,798
==71886== I1 miss rate: 0.00%
==71886== LLi miss rate: 0.00%
==71886==
==71886== D refs: 12,074,077,168 (11,055,055,664 rd + 1,019,021,504 wr)
==71886== D1 misses: 1,127,252,106 ( 1,126,001,300 rd + 1,250,806 wr)
==71886== LLd misses: 453,660 ( 77,940 rd + 375,720 wr)
==71886== D1 miss rate: 9.3% ( 10.2% + 0.1% )
==71886== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==71886==
==71886== LL refs: 1,127,253,943 ( 1,126,003,137 rd + 1,250,806 wr)
==71886== LL misses: 455,458 ( 79,738 rd + 375,720 wr)
==71886== LL miss rate: 0.0% ( 0.0% + 0.0% )
```
### Speculation
1. The cache miss rate is not proportional to the final runtime.
> Since the 2D version's rows come from separate `malloc` calls scattered across the heap, the allocator effectively inserts padding in front of each row?
> The 1D version has no such padding, which may cause false sharing (or conflict misses)?
> If this reasoning is right, the miss-rate reduction bought by that implicit padding is considerable, perhaps even more important than contiguity itself, at least at this scale.
2. The 1D version still ends up slightly faster, which may mean the penalty from its higher miss rate is small compared to the 2D version's other costs? For instance, the TLB cost of pointer chasing may be higher than expected?
3. The effect of the `#pragma omp parallel for collapse(2)` optimization.

In any case, with the current layout every `double` read from B pulls in a full cache line and uses only one element of it before moving on. The fix is a transpose.
### Transpose
C lays matrices out in row-major order, so in `sum += A[i][k] * B[k][j]` each increment of $k$ makes the access to B jump across an entire row (N doubles). Predictably, transposing B should lower the miss rate considerably.
> Also, after this optimization and without compiler optimization flags, 16 threads vs. 1 makes little difference.
> One more note: in all the tests above the LL miss rates come out at 0, very likely because L3 can hold the complete matrices, so the last-level cache (L3) measures a 0% miss rate.
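A rough back-of-the-envelope check of those D1 numbers. Assumptions (mine, not measured): 64-byte cache lines, i.e. 8 doubles per line, and roughly 10 data reads per inner iteration in the unoptimized build once spilled loop variables are counted. For the 1D naive version (N=1024):

$$
\text{miss}_{A[i\cdot N+k]} \approx \tfrac{1}{8}, \qquad
\text{miss}_{B[k\cdot N+j]} \approx 1 \quad (\text{stride } 1024 \cdot 8\,\text{B} = 8192\,\text{B: a fresh line every step})
$$

$$
\text{D1 read miss rate} \approx \frac{1/8 + 1}{10} \approx 11\%
$$

which lands near the measured 10.2%. After the transpose both streams are stride-1, and A's row is reused across every $j$, so to first order only `B_T` still misses: $\approx \frac{1/8}{10} \approx 1.3\%$, close to the 1.1% measured below.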
```c
// matmul_naive_1TD.c
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1000

void init_1d(double *mat, int size) {
    for (int i = 0; i < size * size; i++) {
        mat[i] = (double)rand() / RAND_MAX;
    }
}

int main() {
    double *A = (double *)malloc(N * N * sizeof(double));
    double *B = (double *)malloc(N * N * sizeof(double));
    double *B_T = (double *)malloc(N * N * sizeof(double)); // holds the transposed B
    double *C = (double *)calloc(N * N, sizeof(double));
    srand(time(NULL));
    init_1d(A, N);
    init_1d(B, N);
    printf("--- 1D transpose-optimized version (N=%d) ---\n", N);
    double start = omp_get_wtime();
    // 1. Transpose phase (tiny cost here, large payoff below)
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            B_T[i * N + j] = B[j * N + i];
        }
    }
    // 2. Core computation. num_threads(THREADS) can be appended to the pragma,
    //    but in testing 16 vs. 1 threads made little difference
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            double sum = 0;
            for (int k = 0; k < N; k++) {
                // Now A[i*N + k] and B_T[j*N + k] are both sequential accesses!
                sum += A[i * N + k] * B_T[j * N + k];
            }
            C[i * N + j] = sum;
        }
    }
    double end = omp_get_wtime();
    printf("Transpose version time: %.4f s\n", end - start);
    free(A); free(B); free(B_T); free(C);
    return 0;
}
```
#### Return
```
Transpose version time: 51.7340 s
==75983==
==75983== I refs: 26,197,966,489
==75983== I1 misses: 1,854
==75983== LLi misses: 1,815
==75983== I1 miss rate: 0.00%
==75983== LLi miss rate: 0.00%
==75983==
==75983== D refs: 12,084,079,749 (11,064,056,980 rd + 1,020,022,769 wr)
==75983== D1 misses: 126,629,175 ( 126,128,340 rd + 500,835 wr)
==75983== LLd misses: 627,563 ( 126,821 rd + 500,742 wr)
==75983== D1 miss rate: 1.0% ( 1.1% + 0.0% )
==75983== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==75983==
==75983== LL refs: 126,631,029 ( 126,130,194 rd + 500,835 wr)
==75983== LL misses: 629,378 ( 128,636 rd + 500,742 wr)
==75983== LL miss rate: 0.0% ( 0.0% + 0.0% )
```
## Tiling
> When the matrix gets too big, just cut it up, ideally into pieces that exactly fit L1 or L2!?
### Tiled MatMul Code
Even with the transpose making accesses sequential, once $N$ grows large (say $N=2048$ or beyond) the matrices far exceed the **L1/L2 cache** capacity.
* **The current problem**: after one row of the result is computed, the data brought into cache for it has likely been evicted by the time the next row needs it, so the same data shuttles between RAM and cache over and over.
* **The tiling idea**: cut the big matrix into small tiles sized to **fit entirely in L1 or L2**. While a tile is being processed, all of its accesses stay in cache, removing the dependence on memory bandwidth.
---
This expands the original three loops ($i, j, k$) into **six loops**: the outer three decide which tile is being processed, the inner three do the arithmetic inside that tile.
> Here the tile size is `B` (Block Size), which we vary and measure
```c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>

// Matrix size and block size
#define N 1000
#define B 8 // Block size: tune to L1 cache (a 32x32 double tile is ~8 KB)

// Helper: minimum of two values
#define MIN(a, b) ((a) < (b) ? (a) : (b))

void init_matrix(double *mat, int size) {
    for (int i = 0; i < size * size; i++) {
        mat[i] = (double)rand() / RAND_MAX;
    }
}

int main() {
    // 1. Memory allocation (contiguous 1D buffers)
    double *A = (double *)malloc(N * N * sizeof(double));
    double *B_m = (double *)malloc(N * N * sizeof(double)); // original B
    double *B_T = (double *)malloc(N * N * sizeof(double)); // transposed B
    double *C = (double *)calloc(N * N, sizeof(double));    // zero-initialized
    if (!A || !B_m || !B_T || !C) {
        printf("Memory allocation failed!\n");
        return -1;
    }
    srand(time(NULL));
    init_matrix(A, N);
    init_matrix(B_m, N);
    printf("--- Final optimized version: 1D + Transpose + Tiling (N=%d, B=%d) ---\n", N, B);
    double start_time = omp_get_wtime();
    // 2. Transpose B into B_T so the core loop reads it row-sequentially
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            B_T[i * N + j] = B_m[j * N + i];
        }
    }
    // 3. Core computation: six loops (tiled matrix multiplication)
    // Outer three loops: step through the matrices tile by tile
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i += B) {
        for (int j = 0; j < N; j += B) {
            for (int k = 0; k < N; k += B) {
                // Inner three loops: compute within the tile
                for (int ii = i; ii < MIN(i + B, N); ii++) {
                    for (int jj = j; jj < MIN(j + B, N); jj++) {
                        double sum = 0;
                        // A and B_T are both row-major sequential here,
                        // which suits the L1 cache and SIMD well
                        for (int kk = k; kk < MIN(k + B, N); kk++) {
                            sum += A[ii * N + kk] * B_T[jj * N + kk];
                        }
                        C[ii * N + jj] += sum;
                    }
                }
            }
        }
    }
    double end_time = omp_get_wtime();
    printf("Optimized version time: %.4f s\n", end_time - start_time);
    // Optional validation: spot-check one value
    // printf("C[500]: %f\n", C[500]);
    free(A);
    free(B_m);
    free(B_T);
    free(C);
    return 0;
}
```
---
### How to decide the Block Size?
It is not guessed; it is computed from the hardware specs.
1. **Look up the cache sizes**:
   On Linux, run `lscpu` or `cat /proc/cpuinfo`.
   * Example: L1 Data Cache = 32 KB.
2. **Work out the capacity constraint** (see the probe sketch after this list):
   While computing one $B \times B$ tile we generally need a tile of A and a tile of B (plus a tile of C) resident at the same time.
   * With `double` (8 bytes): $3 \times B^2 \times 8 \text{ Bytes} \leq \text{L1 Cache Size}$.
   * For a 32 KB L1, $B \approx 32$ is a good starting point ($3 \times 32^2 \times 8 = 24$ KB).
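The same arithmetic can also be done at runtime. This is a minimal sketch assuming Linux/glibc: `_SC_LEVEL1_DCACHE_SIZE` is a glibc extension and can report 0 or -1 on some platforms (notably some WSL2 kernels), so a fallback is included; the file name `blocksize_probe.c` is made up for illustration.
```c
// blocksize_probe.c -- hypothetical helper: derive a starting tile size from L1d
#include <stdio.h>
#include <unistd.h>

int main(void) {
    // glibc extension; may report 0/-1 on some platforms (e.g. some WSL2 kernels)
    long l1d = sysconf(_SC_LEVEL1_DCACHE_SIZE);
    if (l1d <= 0) l1d = 32 * 1024; // fall back to a common 32 KB L1d

    // Largest B such that three B x B double tiles fit: 3 * B^2 * 8 <= L1d
    int bsize = 1;
    while (3L * (bsize + 1) * (bsize + 1) * 8 <= l1d) bsize++;

    printf("L1d = %ld bytes -> suggested starting block size B ~= %d\n", l1d, bsize);
    return 0;
}
```
For a 32 KB L1d this prints a bound around 36, consistent with the $B \approx 32$ starting point above.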
### Return
```
// B = 32 with -O3
Time: 8.7287 s
==82241==
==82241== I refs: 5,192,939,004
==82241== I1 misses: 2,010
==82241== LLi misses: 1,944
==82241== I1 miss rate: 0.00%
==82241== LLi miss rate: 0.00%
==82241==
==82241== D refs: 1,132,077,286 (1,082,979,224 rd + 49,098,062 wr)
==82241== D1 misses: 9,969,797 ( 9,593,296 rd + 376,501 wr)
==82241== LLd misses: 628,051 ( 251,857 rd + 376,194 wr)
==82241== D1 miss rate: 0.9% ( 0.9% + 0.8% )
==82241== LLd miss rate: 0.1% ( 0.0% + 0.8% )
==82241==
==82241== LL refs: 9,971,807 ( 9,595,306 rd + 376,501 wr)
==82241== LL misses: 629,995 ( 253,801 rd + 376,194 wr)
==82241== LL miss rate: 0.0% ( 0.0% + 0.8% )
```
```
// B = 64 without -O3
Time: 58.0920 s (with -O3: 8.0809 s, but the miss rate increases)
==82745==
==82745== I refs: 31,826,330,518
==82745== I1 misses: 2,030
==82745== LLi misses: 1,966
==82745== I1 miss rate: 0.00%
==82745== LLi miss rate: 0.00%
==82745==
==82745== D refs: 13,348,682,436 (12,283,394,572 rd + 1,065,287,864 wr)
==82745== D1 misses: 7,935,741 ( 7,559,300 rd + 376,441 wr)
==82745== LLd misses: 630,322 ( 254,130 rd + 376,192 wr)
==82745== D1 miss rate: 0.1% ( 0.1% + 0.0% )
==82745== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==82745==
==82745== LL refs: 7,937,771 ( 7,561,330 rd + 376,441 wr)
==82745== LL misses: 632,288 ( 256,096 rd + 376,192 wr)
==82745== LL miss rate: 0.0% ( 0.0% + 0.0% )
```
END
LAB 6: https://hackmd.io/@1p0_VOUHTXGcHlBU9A0OEA/rkqFUXtoZx