# SpMV
## Current version (v5)
### main.cu
```cpp=
#include <iostream>
#include <vector>
#include <iomanip>
#include <set>
#include <algorithm>
#include <chrono>
#include <cstdio>   // fprintf in CHECK_CUDA
#include <cstdlib>  // rand, exit
#include <cmath>    // fabs
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
fprintf(stderr, "CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
exit(EXIT_FAILURE); \
} \
}
template <int MAX_VALUE = 10, int MAX_ELEMENTS_PER_ROW = 32>
void generate_irregular_problem(int A_num_rows, int A_num_cols, int &A_nnz,
float* &A_values, int* &A_columns, int* &A_offsets, float* &x) {
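// Builds a random CSR matrix: each row gets 0..min(MAX_ELEMENTS_PER_ROW, A_num_cols)
// nonzeros at distinct random columns with integer values in [1, MAX_VALUE],
// and fills the dense vector x with integer values in [1, MAX_VALUE].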
A_nnz = 0;
A_offsets = new int[A_num_rows+1];
A_offsets[0] = 0;
for (int i = 0; i < A_num_rows; i++) {
int num_elements = rand() % std::min(MAX_ELEMENTS_PER_ROW+1, A_num_cols+1);
A_offsets[i+1] = A_offsets[i] + num_elements;
A_nnz += num_elements;
}
A_values = new float[A_nnz];
for (int i = 0; i < A_nnz; i++) {
A_values[i] = rand() % MAX_VALUE + 1;
}
A_columns = new int[A_nnz];
for (int i = 0; i < A_num_rows; i++) {
int fill_size = A_offsets[i+1] - A_offsets[i];
std::set<int> random_cols;
while (random_cols.size() < fill_size) {
random_cols.insert(rand() % A_num_cols);
}
std::copy(random_cols.begin(), random_cols.end(), A_columns+A_offsets[i]);
}
x = new float[A_num_cols];
for (int i = 0; i < A_num_cols; i++) {
x[i] = rand() % MAX_VALUE + 1;
}
}
void print_sparse_matrix_raw(int num_rows, int num_cols, int nnz,
float *values, int *columns, int *offsets) {
std::cout << "num_rows: " << num_rows << std::endl;
std::cout << "num_cols: " << num_cols << std::endl;
std::cout << "nnz: " << nnz << std::endl;
std::cout << "values: [ ";
for (int i = 0; i < nnz; i++) {
std::cout << values[i] << " ";
}
std::cout << "]" << std::endl;
std::cout << "columns: [ ";
for (int i = 0; i < nnz; i++) {
std::cout << columns[i] << " ";
}
std::cout << "]" << std::endl;
std::cout << "offsets: [ ";
for (int i = 0; i <= num_rows; i++) {
std::cout << offsets[i] << " ";
}
std::cout << "]" << std::endl;
std::cout << std::endl;
}
template <int WIDTH = 2>
void print_sparse_matrix(int num_rows, int num_cols,
float *values, int *columns, int *offsets) {
for (int i = 0; i < num_rows; i++) {
std::vector<float> cols(num_cols, 0.0f);
for (int j = offsets[i]; j < offsets[i+1]; j++)
cols[columns[j]] = values[j];
for (int j = 0; j < num_cols; j++)
std::cout << std::setw(WIDTH) << cols[j] << " ";
std::cout << std::endl;
}
std::cout << std::endl;
}
template <int WIDTH = 2>
void print_vector(int num_rows, float *values) {
for (int i = 0; i < num_rows; i++)
std::cout << std::setw(WIDTH) << values[i] << std::endl;
std::cout << std::endl;
}
bool check_result(float *result, float *result_ref, int size) {
constexpr float EPSILON = 1e-5;
for (int i = 0; i < size; i++)
if (fabs(result[i]-result_ref[i]) > EPSILON)
return false;
return true;
}
void spmv_cpu(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
for (int i = 0; i < num_rows; i++) {
float sum = 0.0f;
for (int j = A_offsets[i]; j < A_offsets[i+1]; j++) {
sum += A_values[j] * x[A_columns[j]];
}
result[i] = sum;
}
}
__global__ void spmv_1d(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
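// 1D decomposition: one thread per row; each thread walks its own row serially,
// so lanes whose rows are short idle while the longest row in the warp finishes.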
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= num_rows) return;
for (int i = A_offsets[row]; i < A_offsets[row+1]; i++) {
result[row] += A_values[i] * x[A_columns[i]];
}
}
__device__ __forceinline__ int binary_search(volatile int* arr, int value, int left, int right) {
// find the smallest index in [left, right) where arr[index] > value
// (assume we can always find a solution)
while (left < right) {
int mid = (left + right) >> 1;
if (arr[mid] <= value) left = mid + 1;
else right = mid;
}
return left;
}
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
// statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0, "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_WARP;
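// scans : end offset (A_offsets[row+1]) of each of the 32 rows owned by this warp
// mapped: per-lane products A_values[k] * x[A_columns[k]] for the current batch of 32 nonzeros
// reds  : per-row partial sums, accumulated across batches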
volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
volatile __shared__ float mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
volatile __shared__ float reds[WARPS_PER_BLOCK][THREADS_PER_WARP];
int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int warp_id = threadIdx.x / THREADS_PER_WARP;
int lane_id = threadIdx.x % THREADS_PER_WARP;
// TODO: optimize load efficiency
scans[warp_id][lane_id] = (global_thread_id+1 <= num_rows) ? A_offsets[global_thread_id+1] : A_offsets[num_rows];
int global_fine_task_start_id = (global_thread_id-lane_id <= num_rows) ? A_offsets[global_thread_id-lane_id] : A_offsets[num_rows];
reds[warp_id][lane_id] = 0;
int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
for (int fine_task_id = lane_id;
fine_task_id < num_fine_tasks;
fine_task_id += THREADS_PER_WARP) {
int global_fine_task_id = global_fine_task_start_id + fine_task_id;
mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
int seg_start_id = (coarse_task_id==0) ? 0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
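// Warp-synchronous segmented reduction over the current batch: lanes that belong to
// the same row (segment) are summed, and each segment head adds its total into reds.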
for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1) {
if (in_seg_idx + stride < seg_size)
mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
}
if (in_seg_idx == 0) reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
}
if (global_thread_id < num_rows) result[global_thread_id] = reds[warp_id][lane_id];
}
int main() {
// generate random sparse matrix A and dense vector x in CSR format
std::cout << "Gernerating problem..." << std::endl << std::endl;
constexpr int A_NUM_ROWS = 1000000;
constexpr int A_NUM_COLS = 1000000;
int A_nnz;
float *h_A_values;
int *h_A_columns;
int *h_A_offsets;
float *h_x;
generate_irregular_problem(A_NUM_ROWS, A_NUM_COLS, A_nnz, h_A_values, h_A_columns, h_A_offsets, h_x);
// -------------------------------------------------------------------------
// print problem definition
// std::cout << "A (CSR Format):" << std::endl;
// print_sparse_matrix_raw(A_NUM_ROWS, A_NUM_COLS, A_nnz, h_A_values, h_A_columns, h_A_offsets);
// std::cout << "A:" << std::endl;
// print_sparse_matrix(A_NUM_ROWS, A_NUM_COLS, h_A_values, h_A_columns, h_A_offsets);
// std::cout << "x:" << std::endl;
// print_vector(A_NUM_COLS, h_x);
// -------------------------------------------------------------------------
// run the reference implementation (on cpu)
std::cout << "Calculating reference result..." << std::endl;
float *h_result_ref = new float[A_NUM_ROWS];
auto t0 = std::chrono::high_resolution_clock::now();
spmv_cpu(A_NUM_ROWS, h_A_values, h_A_columns, h_A_offsets, h_x, h_result_ref);
auto t1 = std::chrono::high_resolution_clock::now();
std::chrono::milliseconds duration = std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0);
std::cout << duration.count() << " ms" << std::endl << std::endl;
// -------------------------------------------------------------------------
// print reference result
// std::cout << "Reference Result:" << std::endl;
// for (int i = 0; i < A_NUM_ROWS; i++)
// std::cout << h_result_ref[i] << " ";
// std::cout << std::endl << std::endl;
// -------------------------------------------------------------------------
// allocate device memory
float *d_A_values;
int *d_A_columns;
int *d_A_offsets;
float *d_x;
CHECK_CUDA( cudaMalloc(&d_A_values, A_nnz * sizeof(float)) )
CHECK_CUDA( cudaMalloc(&d_A_columns, A_nnz * sizeof(int)) )
CHECK_CUDA( cudaMalloc(&d_A_offsets, (A_NUM_ROWS + 1) * sizeof(int)) )
CHECK_CUDA( cudaMalloc(&d_x, A_NUM_COLS * sizeof(float)) )
CHECK_CUDA( cudaMemcpy(d_A_values, h_A_values, A_nnz * sizeof(float),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(d_A_columns, h_A_columns, A_nnz * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(d_A_offsets, h_A_offsets,
(A_NUM_ROWS + 1) * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(d_x, h_x, A_NUM_COLS * sizeof(float),
cudaMemcpyHostToDevice) )
// -------------------------------------------------------------------------
// allocate device memory for result
float *d_result_1d, *d_result_cte;
CHECK_CUDA( cudaMalloc(&d_result_1d, A_NUM_ROWS * sizeof(float)) )
CHECK_CUDA( cudaMemset(d_result_1d, 0, A_NUM_ROWS * sizeof(float)) )
CHECK_CUDA( cudaMalloc(&d_result_cte, A_NUM_ROWS * sizeof(float)) )
CHECK_CUDA( cudaMemset(d_result_cte, 0, A_NUM_ROWS * sizeof(float)) )
// -------------------------------------------------------------------------
// run the kernel
std::cout << "Launching kernel..." << std::endl << std::endl;
constexpr int THREADS_PER_BLOCK = 64;
constexpr int BLOCKS_PER_GRID = (A_NUM_ROWS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
spmv_1d<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(A_NUM_ROWS, d_A_values, d_A_columns,
d_A_offsets, d_x, d_result_1d);
spmv_cte<THREADS_PER_BLOCK><<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(A_NUM_ROWS, d_A_values, d_A_columns,
d_A_offsets, d_x, d_result_cte);
CHECK_CUDA( cudaGetLastError() )
CHECK_CUDA( cudaDeviceSynchronize() )
// -------------------------------------------------------------------------
// copy the result back to the host
float *h_result_1d = new float[A_NUM_ROWS];
float *h_result_cte = new float[A_NUM_ROWS];
CHECK_CUDA( cudaMemcpy(h_result_1d, d_result_1d, A_NUM_ROWS * sizeof(float),
cudaMemcpyDeviceToHost) )
CHECK_CUDA( cudaMemcpy(h_result_cte, d_result_cte, A_NUM_ROWS * sizeof(float),
cudaMemcpyDeviceToHost) )
// -------------------------------------------------------------------------
// print 1d decomposition result
std::cout << "1D Decomposition:" << std::endl;
// for (int i = 0; i < A_NUM_ROWS; i++)
// std::cout << h_result_1d[i] << " ";
// std::cout << std::endl;
// -------------------------------------------------------------------------
// check 1d decomposition result
if (check_result(h_result_1d, h_result_ref, A_NUM_ROWS))
std::cout << "(PASS)" << std::endl << std::endl;
else
std::cout << "(FAIL)" << std::endl << std::endl;
// -------------------------------------------------------------------------
// print CTE result
std::cout << "CTE:" << std::endl;
// for (int i = 0; i < A_NUM_ROWS; i++)
// std::cout << h_result_cte[i] << " ";
// std::cout << std::endl;
// -------------------------------------------------------------------------
// check CTE result
if (check_result(h_result_cte, h_result_ref, A_NUM_ROWS))
std::cout << "(PASS)" << std::endl << std::endl;
else
std::cout << "(FAIL)" << std::endl << std::endl;
}
```
### Result
```shell
$ nvcc -O3 main.cu
$ ./a.out
Generating problem...
Calculating reference result...
Launching kernel...
1D Decomposition:
(PASS)
CTE:
(PASS)
```
### Analysis
#### nvprof
```shell
$ nvprof ./a.out
Generating problem...
Calculating reference result...
==23888== NVPROF is profiling process 23888, command: ./a.out
Launching kernel...
1D Decomposition:
(PASS)
CTE:
(PASS)
==23888== Profiling application: ./a.out
==23888== Profiling result:
            Type  Time(%)       Time  Calls        Avg        Min        Max  Name
 GPU activities:   70.06%   15.395ms      4   3.8487ms   417.76us   7.4323ms  [CUDA memcpy HtoD]
                   12.89%   2.8318ms      1   2.8318ms   2.8318ms   2.8318ms  spmv_1d(int, float const *, int const *, int const *, float const *, float*)
                   10.50%   2.3085ms      2   1.1542ms   1.1500ms   1.1585ms  [CUDA memcpy DtoH]
                    6.54%   1.4376ms      1   1.4376ms   1.4376ms   1.4376ms  void spmv_cte<int=64, int=32>(int, float const *, int const *, int const *, float const *, float*)
                    0.01%   2.4000us      2   1.2000us   1.0880us   1.3120us  [CUDA memset]
      API calls:   89.60%   234.07ms      6   39.012ms   500.14us   231.37ms  cudaMalloc
                    7.63%   19.926ms      6   3.3210ms   549.43us   7.4957ms  cudaMemcpy
                    1.63%   4.2637ms      1   4.2637ms   4.2637ms   4.2637ms  cudaDeviceSynchronize
                    0.84%   2.2045ms     96   22.963us      199ns   937.25us  cuDeviceGetAttribute
                    0.18%   474.48us      1   474.48us   474.48us   474.48us  cuDeviceTotalMem
                    0.08%   200.31us      1   200.31us   200.31us   200.31us  cuDeviceGetName
                    0.02%   53.459us      2   26.729us   16.372us   37.087us  cudaMemset
                    0.01%   28.464us      2   14.232us   8.6610us   19.803us  cudaLaunchKernel
                    0.00%   2.6930us      1   2.6930us   2.6930us   2.6930us  cuDeviceGetPCIBusId
                    0.00%   2.4210us      3      807ns      201ns   1.5420us  cuDeviceGetCount
                    0.00%   1.4000us      2      700ns      259ns   1.1410us  cuDeviceGet
                    0.00%      469ns      1      469ns      469ns      469ns  cuDeviceGetUuid
                    0.00%      396ns      1      396ns      396ns      396ns  cudaGetLastError
```
* For a 1000000×1000000 matrix, CTE gives roughly a 2× speedup over 1D Decomposition on a Tesla P100 (1.4376 ms vs. 2.8318 ms of kernel time in the nvprof run above). A small `cudaEvent`-based harness for timing the kernels without nvprof is sketched below.
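The snippet below is a hypothetical addition, not part of main.cu: it reuses the existing `CHECK_CUDA` macro, launch parameters, and device buffers from `main()` to time one `spmv_cte` launch with CUDA events, which is handy when a profiler is not available.
```cpp
// Hypothetical timing harness (not in main.cu): time a single kernel launch
// with CUDA events instead of nvprof.
cudaEvent_t start, stop;
CHECK_CUDA( cudaEventCreate(&start) )
CHECK_CUDA( cudaEventCreate(&stop) )
CHECK_CUDA( cudaEventRecord(start) )
spmv_cte<THREADS_PER_BLOCK><<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(
    A_NUM_ROWS, d_A_values, d_A_columns, d_A_offsets, d_x, d_result_cte);
CHECK_CUDA( cudaEventRecord(stop) )
CHECK_CUDA( cudaEventSynchronize(stop) )
float elapsed_ms = 0.0f;
CHECK_CUDA( cudaEventElapsedTime(&elapsed_ms, start, stop) )  // milliseconds between the two events
std::cout << "spmv_cte: " << elapsed_ms << " ms" << std::endl;
CHECK_CUDA( cudaEventDestroy(start) )
CHECK_CUDA( cudaEventDestroy(stop) )
```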
#### NVIDIA Visual Profiler
##### 1D Decomposition

##### CTE

* With CTE, the number of idle threads within a warp drops significantly, which matches the claim made in the paper; a back-of-the-envelope estimate of how busy the 1D kernel keeps its lanes is sketched below.
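The helper below is a hypothetical addition (the name `estimate_1d_lane_utilization` does not appear in main.cu). It uses only the host-side CSR offsets to estimate, for the one-thread-per-row mapping of `spmv_1d`, what fraction of lane-iterations per warp actually touch a nonzero: a warp runs as long as its longest row, while shorter rows leave their lanes idle. With row lengths drawn uniformly from 0–32 as in `generate_irregular_problem`, this comes out to roughly 0.5, i.e. about half of the lanes idle on average, whereas CTE assigns consecutive nonzeros to consecutive lanes.
```cpp
#include <algorithm>

// Hypothetical helper (not part of main.cu): estimate the average fraction of
// useful lane-iterations per warp for the 1D (one-thread-per-row) kernel.
// A warp of 32 rows executes max(row length) iterations; only sum(row length)
// of the 32 * max(row length) lane-iterations multiply a nonzero.
double estimate_1d_lane_utilization(int num_rows, const int *offsets,
                                    int warp_size = 32) {
    double sum_util = 0.0;
    int num_warps = 0;
    for (int first = 0; first < num_rows; first += warp_size, num_warps++) {
        int last = std::min(first + warp_size, num_rows);
        long long work = 0, longest = 0;
        for (int r = first; r < last; r++) {
            long long len = offsets[r + 1] - offsets[r];
            work += len;
            longest = std::max(longest, len);
        }
        sum_util += (longest > 0) ? double(work) / double(longest * warp_size) : 1.0;
    }
    return (num_warps > 0) ? sum_util / num_warps : 1.0;
}

// Example use, right after generate_irregular_problem(...):
//   std::cout << estimate_1d_lane_utilization(A_NUM_ROWS, h_A_offsets) << std::endl;
```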
#### GPGPU-Sim
[GPGPU-Sim Full Output](/EMUhznrvTCSjACCQGpuOgg)
##### Comparison
| Metric | 1D Decomposition | CTE |
| -------- | -------- | -------- |
| gpu_sim_cycle | 448716 | 163759 |
| gpu_sim_insn | 16033343 | 171687628 |
| gpu_ipc | 35.7316 | 1048.4165 |
| gpu_occupancy | 96.6536% | 84.5215% |
| gpu_stall_dramfull | 3885801 | 3885801 |
| L2_BW | 429.4661 GB/Sec | 543.4926 GB/Sec |
| L1D_total_cache_accesses | 4785235 | 6908590 |
| L1D_total_cache_misses | 4463600 | 6501025 |
| L1D_total_cache_miss_rate | 0.9328 | 0.9410 |
| L1D_total_cache_reservation_fails |1288862 | 1288870 |
| gpgpu_n_stall_shd_mem | 5602915 | 7559832 |
| averagemflatency | 1737 | 1276 |
| avg_icnt2mem_latency | 1163 | 818 |
| avg_mrq_latency | 120 | 103 |
| avg_icnt2sh_latency | 17 | 16 |
| L2_total_cache_accesses | 4411820 | 6449407 |
| L2_total_cache_misses | 2314247 | 2742590 |
| L2_total_cache_miss_rate | 0.5246 | 0.4252 |
| L2_total_cache_pending_hits | 7584 | 8833 |
| L2_total_cache_reservation_fails | 15692 | 15692 |
| Warp Stall | 84887 | 4823993 |
| W0_Idle | 362995 | 714073 |
| W0_Scoreboard | 49422861 | 56411601 |

* With CTE, all threads in a warp stay active most of the time, which is why it outperforms 1D Decomposition.
### Note
#### Fails on Volta and newer GPU architectures
> Independent Thread Scheduling can lead to a rather different set of threads participating in the executed code than intended if the developer made assumptions about warp-synchronicity of previous hardware architectures. In particular, any warp-synchronous code (such as synchronization-free, intra-warp reductions) should be revisited to ensure compatibility with Volta and beyond. See Compute Capability 7.x for further details.
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capability-7-x
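One well-known replacement for shared-memory, warp-synchronous reductions on Volta and later is the explicit-mask shuffle pattern below. This is a generic (non-segmented) warp sum, shown only as the recommended building block; the name `warp_reduce_sum` is hypothetical and not part of main.cu. Adapting the segmented reduction in `spmv_cte` would additionally require every lane of a warp to execute the same number of iterations of the fine-task loop, because `__shfl_down_sync`/`__syncwarp` must be reached by all lanes named in the mask.
```cpp
// Warp-wide sum that stays correct under Independent Thread Scheduling (Volta+).
// Assumes all 32 lanes of the warp call this function together.
__device__ __forceinline__ float warp_reduce_sum(float val) {
    const unsigned FULL_MASK = 0xffffffffu;
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(FULL_MASK, val, offset);  // lane i adds lane i+offset's value
    return val;  // lane 0 ends up with the sum over all 32 lanes
}
```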
## Previous versions
### v1
:::spoiler
```cpp=
__device__ __forceinline__ int binary_search(const int* arr, int value, int left, int right) {
// find the largest index in [left, right) where arr[index] <= value
// (assume we can always find a solution)
while (left < right) {
int mid = (left + right) >> 1;
if (arr[mid] <= value) left = mid + 1;
else right = mid;
}
return left - 1;
}
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
// statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0, "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / THREADS_PER_WARP;
int warp_id = threadIdx.x / THREADS_PER_WARP;
int lane_id = threadIdx.x % THREADS_PER_WARP;
int first_row_in_warp = global_warp_id * THREADS_PER_WARP;
int last_row_in_warp = min(num_rows, global_warp_id * THREADS_PER_WARP + THREADS_PER_WARP);
int first_fine_task_id_in_warp = A_offsets[first_row_in_warp];
int last_fine_task_id_in_warp = A_offsets[last_row_in_warp];
for (int fine_task_id = first_fine_task_id_in_warp + lane_id;
fine_task_id < last_fine_task_id_in_warp;
fine_task_id += THREADS_PER_WARP) {
int row = binary_search(A_offsets, fine_task_id, first_row_in_warp, last_row_in_warp);
mapped[warp_id][lane_id] = A_values[fine_task_id] * x[A_columns[fine_task_id]];
int in_seg_idx = min(lane_id, fine_task_id-A_offsets[row]);
int seg_size = min(A_offsets[row+1]-fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
if (in_seg_idx + stride < seg_size)
mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
if (in_seg_idx == 0) result[row] += mapped[warp_id][lane_id];
}
}
```
:::
### v2
:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
// statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0, "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int global_warp_id = global_thread_id / THREADS_PER_WARP;
int warp_id = threadIdx.x / THREADS_PER_WARP;
int lane_id = threadIdx.x % THREADS_PER_WARP;
constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
if (global_thread_id < num_rows) scans[warp_id][lane_id] = A_offsets[global_thread_id+1];
else scans[warp_id][lane_id] = A_offsets[num_rows];
int fine_task_start_id = A_offsets[global_warp_id*THREADS_PER_WARP];
volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - fine_task_start_id;
for (int fine_task_id = lane_id;
fine_task_id < num_fine_tasks;
fine_task_id += THREADS_PER_WARP) {
int global_fine_task_id = fine_task_start_id + fine_task_id;
mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
int seg_start_id = (coarse_task_id==0) ? 0 : (scans[warp_id][coarse_task_id-1]-fine_task_start_id);
int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
if (in_seg_idx + stride < seg_size)
mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
int global_coarse_task_id = global_warp_id * THREADS_PER_WARP + coarse_task_id;
if (in_seg_idx == 0) result[global_coarse_task_id] += mapped[warp_id][lane_id];
}
}
```
:::
### v3
:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
// statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0, "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int warp_id = threadIdx.x / THREADS_PER_WARP;
int lane_id = threadIdx.x % THREADS_PER_WARP;
constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
scans[warp_id][lane_id] = (global_thread_id < num_rows) ? A_offsets[global_thread_id+1] : A_offsets[num_rows];
int global_fine_task_start_id = A_offsets[global_thread_id - lane_id];
volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
volatile __shared__ int reds[WARPS_PER_BLOCK][THREADS_PER_WARP];
reds[warp_id][lane_id] = 0;
int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
for (int fine_task_id = lane_id;
fine_task_id < num_fine_tasks;
fine_task_id += THREADS_PER_WARP) {
int global_fine_task_id = global_fine_task_start_id + fine_task_id;
mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
int seg_start_id = (coarse_task_id==0) ? 0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
if (in_seg_idx + stride < seg_size)
mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
if (in_seg_idx == 0) reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
}
if (global_thread_id < num_rows) result[global_thread_id] += reds[warp_id][lane_id];
}
```
:::
### v4
:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
const float* __restrict__ A_values,
const int* __restrict__ A_columns,
const int* __restrict__ A_offsets,
const float* __restrict__ x,
float* __restrict__ result) {
// statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0, "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_WARP;
volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
volatile __shared__ int reds[WARPS_PER_BLOCK][THREADS_PER_WARP];
int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int warp_id = threadIdx.x / THREADS_PER_WARP;
int lane_id = threadIdx.x % THREADS_PER_WARP;
// TODO: optimize load efficiency
scans[warp_id][lane_id] = (global_thread_id < num_rows) ? A_offsets[global_thread_id+1] : A_offsets[num_rows];
int global_fine_task_start_id = A_offsets[global_thread_id-lane_id];
reds[warp_id][lane_id] = 0;
int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
for (int fine_task_id = lane_id;
fine_task_id < num_fine_tasks;
fine_task_id += THREADS_PER_WARP) {
int global_fine_task_id = global_fine_task_start_id + fine_task_id;
mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
int seg_start_id = (coarse_task_id==0) ? 0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1) {
if (in_seg_idx + stride < seg_size)
mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
}
if (in_seg_idx == 0) reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
}
if (global_thread_id < num_rows) result[global_thread_id] = reds[warp_id][lane_id];
}
```
:::