CUDA Learning - Conjugate Gradient Method
===
# Definitions
## Positive-Definite Matrix
* A matrix $A \in \mathbb{R}^{n \times n}$ is positive-definite if it satisfies the following conditions:
    * $A=A^{T}$, i.e. $A$ is symmetric
    * For every nonzero vector $x\in\mathbb{R}^{n}$, $x^{T}Ax>0$
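* As a quick worked example (not in the original post), the symmetric matrix below is positive-definite, since its quadratic form is a sum of squares:
$$A = \begin{pmatrix} 2 & 1 \\ 1 & 2 \end{pmatrix}, \qquad x^{T}Ax = 2x_{1}^{2} + 2x_{1}x_{2} + 2x_{2}^{2} = x_{1}^{2} + x_{2}^{2} + (x_{1}+x_{2})^{2} > 0 \quad \text{for } x \neq 0$$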
# Problem Description
* Consider the linear system below, where $A\in\mathbb{R}^{n \times n}$ is a positive-definite matrix. Our goal is to solve this system for $x$: $$Ax=b, \quad x\in \mathbb{R}^{n}$$
* As $A$ grows, computing its inverse becomes very expensive, so an iterative method is preferred. The Conjugate Gradient method (CG method) is one such approach, commonly used to solve linear systems with sparse matrices, provided that $A$ is positive-definite (a sketch of how to build a test system of this kind is given after this list).
* For the underlying theory, see Professor Te-Sheng Lin's article [Conjugate gradient method - iterative method](https://teshenglin.github.io/post/2023_conjugate_gradient_method_2/), which gives the detailed derivation; linear algebra is the recommended prerequisite.
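A minimal sketch (not from the original post) of how one might build a symmetric positive-definite test system with Eigen, using the standard trick $A = M^{T}M + nI$; the size `n` and the way `b` is filled are arbitrary choices here:
```cpp=
#include <Eigen/Dense>
#include <iostream>

int main()
{
    const int n = 1 << 7;                      // matrix size, arbitrary for this sketch
    Eigen::MatrixXd M = Eigen::MatrixXd::Random(n, n);
    // M^T * M is symmetric positive semi-definite; adding n*I makes it positive-definite.
    Eigen::MatrixXd A = M.transpose() * M + double(n) * Eigen::MatrixXd::Identity(n, n);
    Eigen::VectorXd b = Eigen::VectorXd::Random(n);

    // Reference solution from Eigen's dense solver, for checking the CG result later.
    Eigen::VectorXd x_ref = A.ldlt().solve(b);
    std::cout << "||A*x_ref - b|| = " << (A * x_ref - b).norm() << std::endl;
    return 0;
}
```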
## Algorithm
* Below is the CG algorithm. It is very compact: the only matrix-vector product per iteration is $Ap_{k}$; everything else is vector additions/subtractions and scalar coefficients, which makes it well suited to CUDA acceleration.
$$\begin{array}{l}
r_{0}=b-Ax_{0} \\
p_{0}=r_{0} \\
k = 0 \\
\text{while (True)} \\
\qquad \alpha_{k} = \frac{\langle r_{k},r_{k}\rangle}{\langle p_{k},Ap_{k}\rangle} \\
\qquad x_{k+1} = x_{k} + \alpha_{k}p_{k} \\
\qquad r_{k+1} = r_{k} - \alpha_{k}Ap_{k} \\
\qquad \text{if } (\ ||r_{k+1}||_{2} \leq \epsilon\ )\ \text{break} \\
\qquad \beta_{k} = \frac{\langle r_{k+1},r_{k+1}\rangle}{\langle r_{k},r_{k}\rangle} \\
\qquad p_{k+1} = r_{k+1} + \beta_{k}p_{k} \\
\qquad k = k+1
\end{array}$$
# CUDA Kernels to Implement
## Matrix-Vector Multiplication
```cpp=
// Each thread computes one entry of d_vector_out (one row of the matrix times the vector).
__global__ void matrixMultiVector(double* d_matrix, double* d_vector_in, double* d_vector_out, int rows, int cols) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < rows) {
        double sum = 0.0;
        for (int j = 0; j < cols; j++)
        {
            sum += d_matrix[idx*cols + j] * d_vector_in[j];
        }
        d_vector_out[idx] = sum;
    }
}
```
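A minimal launch sketch for this kernel (the buffer names `d_A`, `d_x`, `d_y` and the block size of 64 are just assumptions for illustration; the host code further below uses the same pattern):
```cpp=
// Assumes d_A (rows x cols, row-major), d_x (length cols) and d_y (length rows)
// have already been allocated and filled on the device.
int blockSize = 64;
dim3 block(blockSize);
dim3 grid((rows + blockSize - 1) / blockSize);   // one thread per output row
matrixMultiVector<<<grid, block>>>(d_A, d_x, d_y, rows, cols);
cudaDeviceSynchronize();   // wait for completion when timing or checking results
```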
## Vector Addition
```cpp=
// Computes d_out = d_vector1 + (*scale) * d_vector2; scale points to a single value in device memory.
__global__ void vectorAdd(double* d_vector1, double* d_vector2, double* scale, double* d_out, int rows) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < rows) {
        d_out[idx] = d_vector1[idx] + (*scale)*d_vector2[idx];
    }
}
```
## Vector Subtraction
```cpp=
// Computes d_out = d_vector1 - (*scale) * d_vector2; scale points to a single value in device memory.
__global__ void vectorMinus(double* d_vector1, double* d_vector2, double* scale, double* d_out, int rows) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < rows) {
        d_out[idx] = d_vector1[idx] - (*scale)*d_vector2[idx];
    }
}
```
## Vector Scaling
```cpp=
// Scales d_vector in place by the host-passed value s.
__global__ void vectorMultiScale(double* d_vector, double s, int rows) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < rows) {
        d_vector[idx] *= s;
    }
}
```
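Note that `vectorAdd` and `vectorMinus` read their coefficient through a device pointer rather than by value, so the host has to copy the scalar into device memory first. A minimal sketch of that pattern, using placeholder names `h_alpha`/`d_alpha`:
```cpp=
double h_alpha = 0.5;                 // coefficient computed on the host
double *d_alpha = nullptr;
cudaMalloc(&d_alpha, sizeof(double));
cudaMemcpy(d_alpha, &h_alpha, sizeof(double), cudaMemcpyHostToDevice);

// d_x and d_p are device vectors of length rows; grid/block as in the launch sketch above.
vectorAdd<<<grid, block>>>(d_x, d_p, d_alpha, d_x, rows);   // d_x = d_x + h_alpha * d_p
cudaFree(d_alpha);
```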
## Dot Product
* Note that a dot product ends by summing all elements of a vector. We therefore first use `vectorDotElementWise` to compute the element-wise product, and then `vectorDotValuePara` to sum the resulting vector with a parallel reduction; a sketch of the host-side glue that combines these two kernels is given after the code below.
* Shared memory is used here to speed up the reduction.
```cpp=
// Element-wise product: d_vector3[i] = d_vector1[i] * d_vector2[i].
__global__ void vectorDotElementWise(double* d_vector1, double* d_vector2, double* d_vector3, int rows) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < rows) {
        d_vector3[idx] = d_vector2[idx] * d_vector1[idx];
    }
}

// Parallel reduction: each block sums 2*blockDim.x entries of d_vector_in and
// writes one partial sum per block into d_vector_out.
__global__ void vectorDotValuePara(double* d_vector_in, double* d_vector_out, int rows)
{
    extern __shared__ double sdata[];
    // Each thread loads two elements from global memory into shared memory,
    // guarding against reading past the end of the vector.
    unsigned int tid = threadIdx.x;
    unsigned int i = (blockIdx.x*blockDim.x*2) + threadIdx.x;
    double a = (i < (unsigned int)rows) ? d_vector_in[i] : 0.0;
    double b = (i + blockDim.x < (unsigned int)rows) ? d_vector_in[i + blockDim.x] : 0.0;
    sdata[tid] = a + b;
    __syncthreads();
    // Tree reduction in shared memory.
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's partial sum to global memory.
    if (tid == 0)
    {
        d_vector_out[blockIdx.x] = sdata[0];
    }
}
```
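The host code below also relies on a helper `sumCPU` that adds up the per-block partial sums on the CPU; it is not listed in this post, so here is an assumed minimal implementation, together with a sketch of how the two kernels and `sumCPU` combine into one full dot product (note the dynamic shared-memory size passed as the third launch parameter):
```cpp=
// Simple serial sum of the per-block partial results on the host (assumed implementation).
void sumCPU(const double* h_partial, int n, double* out)
{
    double s = 0.0;
    for (int i = 0; i < n; i++) s += h_partial[i];
    *out = s;
}

// Sketch of a full dot product <d_a, d_b> of length rows.
// d_temp (rows) and d_reduce (one entry per reduction block) are preallocated scratch buffers.
double dotGPU(double* d_a, double* d_b, double* d_temp, double* d_reduce,
              double* h_reduce, int rows, int blockSize)
{
    int gridSize   = (rows + blockSize - 1) / blockSize;
    int reduceGrid = (rows + 2*blockSize - 1) / (2*blockSize);   // each block handles 2*blockSize entries
    vectorDotElementWise<<<gridSize, blockSize>>>(d_a, d_b, d_temp, rows);
    vectorDotValuePara<<<reduceGrid, blockSize, blockSize*sizeof(double)>>>(d_temp, d_reduce, rows);
    cudaMemcpy(h_reduce, d_reduce, reduceGrid*sizeof(double), cudaMemcpyDeviceToHost);
    double result = 0.0;
    sumCPU(h_reduce, reduceGrid, &result);
    return result;
}
```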
# CG Method (CPU Version)
* Here the Eigen library is used for the matrix operations on the CPU.
```cpp=
// MatrixType/VectorType are Eigen dense types (e.g. Eigen::MatrixXd / Eigen::VectorXd);
// requires <Eigen/Dense>, <chrono>, <iostream> and <cmath>.
void CGMethodCPU(const MatrixType &h_A, const VectorType &h_b, VectorType &h_xout)
{
    // Initial guess x0 (here simply b)
    Eigen::VectorXd h_x = h_b;
    // Iteration vectors xk, rk, pk
    Eigen::VectorXd h_xk, h_rk, h_pk;
    h_xk = h_x;
    h_rk = h_b - h_A*h_xk;
    h_pk = h_rk;
    double up = 0.0;
    int i = 0;
    Eigen::VectorXd Ap;
    auto start = std::chrono::high_resolution_clock::now();
    while(true)
    {
        double low = h_rk.dot(h_rk);          // <r_k, r_k>
        Ap = h_A*h_pk;
        double ak = low/(h_pk.dot(Ap));       // alpha_k
        h_xk = h_xk + ak*h_pk;
        h_rk = h_rk - ak*Ap;
        up = h_rk.dot(h_rk);                  // <r_{k+1}, r_{k+1}>
        if(sqrt(up) < 1e-6) break;
        double bk = up/low;                   // beta_k
        h_pk = h_rk + bk*h_pk;
        i++;
    }
    h_xout = h_xk;                            // return the solution
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout << "Cpu time: " << elapsed.count() << " ms\n";
    std::cout << "Res is " << sqrt(up) << std::endl;
    std::cout << "Num is " << i << std::endl;
}
```
# CG Method (GPU Version)
* The most tedious part of the CUDA version is allocating and freeing device memory; cudaMalloc and cudaFree alone take up quite a few lines.
* The code is considerably longer than the CPU version, but it is precisely this explicit management of GPU memory that makes the good speed-up possible. A sketch of a driver that calls both versions follows the code below.
```cpp=
// Requires <Eigen/Dense>, <chrono>, <iostream>, <cmath> and the CUDA runtime.
template<typename MatrixType, typename VectorType>
void CGMethodGPU(const MatrixType &h_A, const VectorType &h_b, VectorType &h_xout)
{
    int rows = num;   // num is the global problem size (matrix dimension)
    int cols = num;
    // Device buffers
    double *d_A, *d_b, *d_x, *d_p;
    double *d_r, *d_Ap, *d_temp, *d_reduce;
    double *d_low, *d_up, *d_ak, *d_bk, *d_value;
    d_A = nullptr;
    d_b = nullptr;   // initialize device pointers
    d_x = nullptr;
    // Set up grid and block dimensions
    int block_num = 64;
    int grid_num = (num+block_num-1)/block_num;
    int grid_para_num = (num+block_num*2-1)/(block_num*2);   // each reduction block handles 2*block_num entries
    std::cout << "Grid_para_num is " << grid_para_num << std::endl;
    dim3 block(block_num);
    dim3 grid(grid_num);
    dim3 grid_para(grid_para_num);
    transferMatrixToCUDA(h_A, d_A, h_A.rows(), h_A.cols());
    transferVectorToCUDA(h_b, d_b, h_b.rows(), true);   // transfer b; it is a column vector, so cols = 1
    cudaMalloc(&d_x, rows * sizeof(double));
    cudaMalloc(&d_Ap, rows * sizeof(double));
    cudaMalloc(&d_temp, rows * sizeof(double));
    cudaMalloc(&d_reduce, grid_para_num * sizeof(double));
    cudaMalloc(&d_p, rows * sizeof(double));
    cudaMalloc(&d_r, rows * sizeof(double));
    cudaMalloc(&d_low, 1 * sizeof(double));
    cudaMalloc(&d_up, 1 * sizeof(double));
    cudaMalloc(&d_ak, 1 * sizeof(double));
    cudaMalloc(&d_bk, 1 * sizeof(double));
    cudaMalloc(&d_value, 1 * sizeof(double));
    double *h_reduce = new double[grid_para.x];
    double h_value = 1.0;   // scale of 1.0 for the initial residual computation
    cudaMemcpy(d_value, &h_value, sizeof(double), cudaMemcpyHostToDevice);
    int step = 0;
    // Initialize d_x with the initial guess
    transferVectorToCUDA(h_xout, d_x, h_xout.rows(), true);
    // Initialize d_r = b - A*x0
    matrixMultiVector<<<grid, block>>>(d_A, d_x, d_temp, rows, cols);
    vectorMinus<<<grid, block>>>(d_b, d_temp, d_value, d_r, rows);
    // Initialize d_p = d_r
    cudaMemcpy(d_p, d_r, rows*sizeof(double), cudaMemcpyDeviceToDevice);
    // Start Conjugate Gradient
    auto start = std::chrono::high_resolution_clock::now();
    while(true)
    {
        double h_low = 0.0;
        double h_pAp = 0.0;
        double h_up = 0.0;
        // low = <r_k, r_k>
        vectorDotElementWise<<<grid, block>>>(d_r, d_r, d_temp, rows);
        vectorDotValuePara<<<grid_para, block, block_num*sizeof(double)>>>(d_temp, d_reduce, rows);
        cudaMemcpy(h_reduce, d_reduce, grid_para.x*sizeof(double), cudaMemcpyDeviceToHost);
        sumCPU(h_reduce, grid_para.x, &h_low);
        // Ap = A*p_k
        matrixMultiVector<<<grid, block>>>(d_A, d_p, d_Ap, rows, cols);
        // ak = low / <p_k, Ap>
        vectorDotElementWise<<<grid, block>>>(d_p, d_Ap, d_temp, rows);
        vectorDotValuePara<<<grid_para, block, block_num*sizeof(double)>>>(d_temp, d_reduce, rows);
        cudaMemcpy(h_reduce, d_reduce, grid_para.x*sizeof(double), cudaMemcpyDeviceToHost);
        sumCPU(h_reduce, grid_para.x, &h_pAp);
        double h_ak = h_low/h_pAp;
        cudaMemcpy(d_ak, &h_ak, sizeof(double), cudaMemcpyHostToDevice);
        // x_{k+1} = x_k + ak*p_k
        vectorAdd<<<grid, block>>>(d_x, d_p, d_ak, d_x, rows);
        // r_{k+1} = r_k - ak*Ap
        vectorMinus<<<grid, block>>>(d_r, d_Ap, d_ak, d_r, rows);
        // up = <r_{k+1}, r_{k+1}>
        vectorDotElementWise<<<grid, block>>>(d_r, d_r, d_temp, rows);
        vectorDotValuePara<<<grid_para, block, block_num*sizeof(double)>>>(d_temp, d_reduce, rows);
        cudaMemcpy(h_reduce, d_reduce, grid_para.x*sizeof(double), cudaMemcpyDeviceToHost);
        sumCPU(h_reduce, grid_para.x, &h_up);
        if(sqrt(h_up) < 1e-6)
        {
            std::cout << "Res is " << sqrt(h_up) << std::endl;
            break;
        }
        // bk = up/low
        double h_bk = h_up/h_low;
        cudaMemcpy(d_bk, &h_bk, sizeof(double), cudaMemcpyHostToDevice);
        // p_{k+1} = r_{k+1} + bk*p_k
        vectorAdd<<<grid, block>>>(d_r, d_p, d_bk, d_p, rows);
        step++;
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout << "Gpu time: " << elapsed.count() << " ms\n";
    std::cout << "Num is " << step << std::endl;
    // Copy the solution back to the host (h_xout is an Eigen vector of length rows)
    cudaMemcpy(h_xout.data(), d_x, rows*sizeof(double), cudaMemcpyDeviceToHost);
    // Cleanup
    delete[] h_reduce;
    cudaFree(d_A);
    cudaFree(d_b);
    cudaFree(d_x);
    cudaFree(d_p);
    cudaFree(d_r);
    cudaFree(d_Ap);
    cudaFree(d_temp);
    cudaFree(d_reduce);
    cudaFree(d_up);
    cudaFree(d_low);
    cudaFree(d_ak);
    cudaFree(d_bk);
    cudaFree(d_value);
}
```
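To try the two versions out end to end, here is a minimal, hedged sketch of a driver; it assumes the global size `num`, the transfer helpers, the kernels above and both `CGMethod` functions are defined as in the full source linked at the bottom, and it builds the SPD test matrix the same way as the earlier sketch.
```cpp=
#include <Eigen/Dense>
#include <iostream>

// Assumed to exist in the full source: num, transferMatrixToCUDA, transferVectorToCUDA,
// the kernels above, CGMethodCPU and CGMethodGPU.
int main()
{
    // Build a symmetric positive-definite system A x = b.
    Eigen::MatrixXd M = Eigen::MatrixXd::Random(num, num);
    Eigen::MatrixXd A = M.transpose() * M + double(num) * Eigen::MatrixXd::Identity(num, num);
    Eigen::VectorXd b = Eigen::VectorXd::Random(num);

    Eigen::VectorXd x_cpu = b;   // CPU version overwrites this with its solution
    Eigen::VectorXd x_gpu = b;   // also serves as the initial guess for the GPU version
    CGMethodCPU(A, b, x_cpu);
    CGMethodGPU(A, b, x_gpu);

    std::cout << "CPU residual: " << (A * x_cpu - b).norm() << std::endl;
    std::cout << "GPU residual: " << (A * x_gpu - b).norm() << std::endl;
    return 0;
}
```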
# Speed Test: CPU vs GPU
* Hardware
    * Laptop: Dell G15
    * GPU: RTX 3050 Laptop, 4 GB
    * CPU: 12th Gen Intel(R) Core(TM) i5-12500H (12 cores, 16 threads)
    * Memory: 16 GB
| Size of matrix | 2^7     | 2^8     | 2^9     | 2^10    | 2^11    |
| -------------- | ------- | ------- | ------- | ------- | ------- |
| CPU (ms)       | 37.7589 | 153.477 | 2588.11 | 22830.5 | 201780  |
| GPU (ms)       | 14.8235 | 19.0418 | 126.494 | 435.445 | 2650.17 |
| Speedup        | 2.54X   | 8.06X   | 20.46X  | 52.43X  | 76.138X |
source : https://teshenglin.github.io/post/2023_conjugate_gradient_method_2/
github : https://github.com/GU-Lin/Cuda_Practice/blob/main/8_Matrix_Operation/main.cu