# SpMV

## Current version (v5)

### main<span>.</span>cu

```cpp=
#include <iostream>
#include <vector>
#include <iomanip>
#include <set>
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cmath>

#define CHECK_CUDA(func)                                                      \
{                                                                             \
    cudaError_t status = (func);                                              \
    if (status != cudaSuccess) {                                              \
        fprintf(stderr, "CUDA API failed at line %d with error: %s (%d)\n",   \
                __LINE__, cudaGetErrorString(status), status);                \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
}

// generate a random CSR matrix with 0..MAX_ELEMENTS_PER_ROW nonzeros per row
// and a random dense vector x
template <int MAX_VALUE = 10, int MAX_ELEMENTS_PER_ROW = 32>
void generate_irregular_problem(int A_num_rows, int A_num_cols, int &A_nnz,
                                float* &A_values, int* &A_columns, int* &A_offsets,
                                float* &x)
{
    A_nnz = 0;
    A_offsets = new int[A_num_rows+1];
    A_offsets[0] = 0;
    for (int i = 0; i < A_num_rows; i++) {
        int num_elements = rand() % std::min(MAX_ELEMENTS_PER_ROW+1, A_num_cols+1);
        A_offsets[i+1] = A_offsets[i] + num_elements;
        A_nnz += num_elements;
    }

    A_values = new float[A_nnz];
    for (int i = 0; i < A_nnz; i++) {
        A_values[i] = rand() % MAX_VALUE + 1;
    }

    A_columns = new int[A_nnz];
    for (int i = 0; i < A_num_rows; i++) {
        int fill_size = A_offsets[i+1] - A_offsets[i];
        std::set<int> random_cols;
        while ((int)random_cols.size() < fill_size) {
            random_cols.insert(rand() % A_num_cols);
        }
        std::copy(random_cols.begin(), random_cols.end(), A_columns+A_offsets[i]);
    }

    x = new float[A_num_cols];
    for (int i = 0; i < A_num_cols; i++) {
        x[i] = rand() % MAX_VALUE + 1;
    }
}

void print_sparse_matrix_raw(int num_rows, int num_cols, int nnz,
                             float *values, int *columns, int *offsets)
{
    std::cout << "num_rows: " << num_rows << std::endl;
    std::cout << "num_cols: " << num_cols << std::endl;
    std::cout << "nnz: " << nnz << std::endl;

    std::cout << "values: [ ";
    for (int i = 0; i < nnz; i++) {
        std::cout << values[i] << " ";
    }
    std::cout << "]" << std::endl;

    std::cout << "columns: [ ";
    for (int i = 0; i < nnz; i++) {
        std::cout << columns[i] << " ";
    }
    std::cout << "]" << std::endl;

    std::cout << "offsets: [ ";
    for (int i = 0; i <= num_rows; i++) {
        std::cout << offsets[i] << " ";
    }
    std::cout << "]" << std::endl;
    std::cout << std::endl;
}

template <int WIDTH = 2>
void print_sparse_matrix(int num_rows, int num_cols,
                         float *values, int *columns, int *offsets)
{
    for (int i = 0; i < num_rows; i++) {
        std::vector<float> cols(num_cols, 0);
        for (int j = offsets[i]; j < offsets[i+1]; j++)
            cols[columns[j]] = values[j];
        for (int j = 0; j < num_cols; j++)
            std::cout << std::setw(WIDTH) << cols[j] << " ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

template <int WIDTH = 2>
void print_vector(int num_rows, float *values)
{
    for (int i = 0; i < num_rows; i++)
        std::cout << std::setw(WIDTH) << values[i] << std::endl;
    std::cout << std::endl;
}

bool check_result(float *result, float *result_ref, int size)
{
    constexpr float EPSILON = 1e-5;
    for (int i = 0; i < size; i++)
        if (fabs(result[i]-result_ref[i]) > EPSILON)
            return false;
    return true;
}

void spmv_cpu(int num_rows,
              const float* __restrict__ A_values,
              const int* __restrict__ A_columns,
              const int* __restrict__ A_offsets,
              const float* __restrict__ x,
              float* __restrict__ result)
{
    for (int i = 0; i < num_rows; i++) {
        float sum = 0.0f;
        for (int j = A_offsets[i]; j < A_offsets[i+1]; j++) {
            sum += A_values[j] * x[A_columns[j]];
        }
        result[i] = sum;
    }
}

// one thread per row; assumes result[] has been zeroed beforehand
__global__ void spmv_1d(int num_rows,
                        const float* __restrict__ A_values,
                        const int* __restrict__ A_columns,
                        const int* __restrict__ A_offsets,
                        const float* __restrict__ x,
                        float* __restrict__ result)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_rows) return;
    for (int i = A_offsets[row]; i < A_offsets[row+1]; i++) {
        result[row] +=
            A_values[i] * x[A_columns[i]];
    }
}

__device__ __forceinline__ int binary_search(volatile int* arr, int value,
                                             int left, int right)
{
    // find the smallest index in [left, right) where arr[index] > value
    // (assume we can always find a solution)
    while (left < right) {
        int mid = (left + right) >> 1;
        if (arr[mid] <= value)
            left = mid + 1;
        else
            right = mid;
    }
    return left;
}

template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
                         const float* __restrict__ A_values,
                         const int* __restrict__ A_columns,
                         const int* __restrict__ A_offsets,
                         const float* __restrict__ x,
                         float* __restrict__ result)
{
    // statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
    static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0,
                  "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
    constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_WARP;

    // NOTE: mapped/reds store the float products as int; this is lossless here
    // only because the generated matrix and vector entries are small integers
    volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
    volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
    volatile __shared__ int reds[WARPS_PER_BLOCK][THREADS_PER_WARP];

    int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = threadIdx.x / THREADS_PER_WARP;
    int lane_id = threadIdx.x % THREADS_PER_WARP;

    // TODO: optimize load efficiency
    scans[warp_id][lane_id] = (global_thread_id+1 <= num_rows) ?
        A_offsets[global_thread_id+1] : A_offsets[num_rows];
    int global_fine_task_start_id = (global_thread_id-lane_id <= num_rows) ?
        A_offsets[global_thread_id-lane_id] : A_offsets[num_rows];
    reds[warp_id][lane_id] = 0;

    int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
    for (int fine_task_id = lane_id; fine_task_id < num_fine_tasks;
         fine_task_id += THREADS_PER_WARP) {
        int global_fine_task_id = global_fine_task_start_id + fine_task_id;
        mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
        int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
        int seg_start_id = (coarse_task_id==0) ?
            0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
        int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
        int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id,
                           THREADS_PER_WARP-lane_id) + in_seg_idx;
        for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1) {
            if (in_seg_idx + stride < seg_size)
                mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
        }
        if (in_seg_idx == 0)
            reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
    }

    if (global_thread_id < num_rows)
        result[global_thread_id] = reds[warp_id][lane_id];
}

int main()
{
    // generate random sparse matrix A and dense vector x in CSR format
    std::cout << "Generating problem..."
              << std::endl << std::endl;
    constexpr int A_NUM_ROWS = 1000000;
    constexpr int A_NUM_COLS = 1000000;
    int A_nnz;
    float *h_A_values;
    int *h_A_columns;
    int *h_A_offsets;
    float *h_x;
    generate_irregular_problem(A_NUM_ROWS, A_NUM_COLS, A_nnz,
                               h_A_values, h_A_columns, h_A_offsets, h_x);

    // -------------------------------------------------------------------------
    // print problem definition
    // std::cout << "A (CSR Format):" << std::endl;
    // print_sparse_matrix_raw(A_NUM_ROWS, A_NUM_COLS, A_nnz, h_A_values, h_A_columns, h_A_offsets);
    // std::cout << "A:" << std::endl;
    // print_sparse_matrix(A_NUM_ROWS, A_NUM_COLS, h_A_values, h_A_columns, h_A_offsets);
    // std::cout << "x:" << std::endl;
    // print_vector(A_NUM_COLS, h_x);

    // -------------------------------------------------------------------------
    // run the reference implementation (on cpu)
    std::cout << "Calculating reference result..." << std::endl;
    float *h_result_ref = new float[A_NUM_ROWS];
    auto t0 = std::chrono::high_resolution_clock::now();
    spmv_cpu(A_NUM_ROWS, h_A_values, h_A_columns, h_A_offsets, h_x, h_result_ref);
    auto t1 = std::chrono::high_resolution_clock::now();
    std::chrono::milliseconds duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0);
    std::cout << duration.count() << " ms" << std::endl << std::endl;

    // -------------------------------------------------------------------------
    // print reference result
    // std::cout << "Reference Result:" << std::endl;
    // for (int i = 0; i < A_NUM_ROWS; i++)
    //     std::cout << h_result_ref[i] << " ";
    // std::cout << std::endl << std::endl;

    // -------------------------------------------------------------------------
    // allocate device memory
    float *d_A_values;
    int *d_A_columns;
    int *d_A_offsets;
    float *d_x;
    CHECK_CUDA( cudaMalloc(&d_A_values, A_nnz * sizeof(float)) )
    CHECK_CUDA( cudaMalloc(&d_A_columns, A_nnz * sizeof(int)) )
    CHECK_CUDA( cudaMalloc(&d_A_offsets, (A_NUM_ROWS + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc(&d_x, A_NUM_COLS * sizeof(float)) )
    CHECK_CUDA( cudaMemcpy(d_A_values, h_A_values, A_nnz * sizeof(float), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(d_A_columns, h_A_columns, A_nnz * sizeof(int), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(d_A_offsets, h_A_offsets, (A_NUM_ROWS + 1) * sizeof(int), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(d_x, h_x, A_NUM_COLS * sizeof(float), cudaMemcpyHostToDevice) )

    // -------------------------------------------------------------------------
    // allocate device memory for result
    float *d_result_1d, *d_result_cte;
    CHECK_CUDA( cudaMalloc(&d_result_1d, A_NUM_ROWS * sizeof(float)) )
    CHECK_CUDA( cudaMemset(d_result_1d, 0, A_NUM_ROWS * sizeof(float)) )
    CHECK_CUDA( cudaMalloc(&d_result_cte, A_NUM_ROWS * sizeof(float)) )
    CHECK_CUDA( cudaMemset(d_result_cte, 0, A_NUM_ROWS * sizeof(float)) )

    // -------------------------------------------------------------------------
    // run the kernel
    std::cout << "Launching kernel..."
              << std::endl << std::endl;
    constexpr int THREADS_PER_BLOCK = 64;
    constexpr int BLOCKS_PER_GRID = (A_NUM_ROWS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    spmv_1d<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(A_NUM_ROWS, d_A_values, d_A_columns,
                                                    d_A_offsets, d_x, d_result_1d);
    spmv_cte<THREADS_PER_BLOCK><<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(A_NUM_ROWS, d_A_values, d_A_columns,
                                                                        d_A_offsets, d_x, d_result_cte);
    CHECK_CUDA( cudaGetLastError() )
    CHECK_CUDA( cudaDeviceSynchronize() )

    // -------------------------------------------------------------------------
    // copy the result back to the host
    float *h_result_1d = new float[A_NUM_ROWS];
    float *h_result_cte = new float[A_NUM_ROWS];
    CHECK_CUDA( cudaMemcpy(h_result_1d, d_result_1d, A_NUM_ROWS * sizeof(float), cudaMemcpyDeviceToHost) )
    CHECK_CUDA( cudaMemcpy(h_result_cte, d_result_cte, A_NUM_ROWS * sizeof(float), cudaMemcpyDeviceToHost) )

    // -------------------------------------------------------------------------
    // print 1d decomposition result
    std::cout << "1D Decomposition:" << std::endl;
    // for (int i = 0; i < A_NUM_ROWS; i++)
    //     std::cout << h_result_1d[i] << " ";
    // std::cout << std::endl;

    // -------------------------------------------------------------------------
    // check 1d decomposition result
    if (check_result(h_result_1d, h_result_ref, A_NUM_ROWS))
        std::cout << "(PASS)" << std::endl << std::endl;
    else
        std::cout << "(FAIL)" << std::endl << std::endl;

    // -------------------------------------------------------------------------
    // print CTE result
    std::cout << "CTE:" << std::endl;
    // for (int i = 0; i < A_NUM_ROWS; i++)
    //     std::cout << h_result_cte[i] << " ";
    // std::cout << std::endl;

    // -------------------------------------------------------------------------
    // check CTE result
    if (check_result(h_result_cte, h_result_ref, A_NUM_ROWS))
        std::cout << "(PASS)" << std::endl << std::endl;
    else
        std::cout << "(FAIL)" << std::endl << std::endl;
}
```

### Result

```shell
$ nvcc -O3 main.cu
$ ./a.out
Generating problem...

Calculating reference result...

Launching kernel...

1D Decomposition:
(PASS)

CTE:
(PASS)
```
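The per-kernel times in the analysis below come from nvprof; they could also be measured directly in the program with CUDA events. A minimal sketch (not part of main<span>.</span>cu above and not used for any of the numbers in this note), reusing the launch configuration and device buffers from `main()`:

```cpp
// Sketch only: time one kernel launch with CUDA events.
// Assumes the buffers and constants defined in main() above.
cudaEvent_t start, stop;
CHECK_CUDA( cudaEventCreate(&start) )
CHECK_CUDA( cudaEventCreate(&stop) )

CHECK_CUDA( cudaEventRecord(start) )
spmv_1d<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(A_NUM_ROWS, d_A_values, d_A_columns,
                                                d_A_offsets, d_x, d_result_1d);
CHECK_CUDA( cudaEventRecord(stop) )
CHECK_CUDA( cudaEventSynchronize(stop) )

float elapsed_ms = 0.0f;
CHECK_CUDA( cudaEventElapsedTime(&elapsed_ms, start, stop) )
std::cout << "spmv_1d: " << elapsed_ms << " ms" << std::endl;

CHECK_CUDA( cudaEventDestroy(start) )
CHECK_CUDA( cudaEventDestroy(stop) )
```

The same pattern would wrap the `spmv_cte` launch.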
### Analysis

#### nvprof

```shell
$ nvprof ./a.out
Generating problem...

Calculating reference result...
==23888== NVPROF is profiling process 23888, command: ./a.out

Launching kernel...

1D Decomposition:
(PASS)

CTE:
(PASS)

==23888== Profiling application: ./a.out
==23888== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   70.06%  15.395ms         4  3.8487ms  417.76us  7.4323ms  [CUDA memcpy HtoD]
                   12.89%  2.8318ms         1  2.8318ms  2.8318ms  2.8318ms  spmv_1d(int, float const *, int const *, int const *, float const *, float*)
                   10.50%  2.3085ms         2  1.1542ms  1.1500ms  1.1585ms  [CUDA memcpy DtoH]
                    6.54%  1.4376ms         1  1.4376ms  1.4376ms  1.4376ms  void spmv_cte<int=64, int=32>(int, float const *, int const *, int const *, float const *, float*)
                    0.01%  2.4000us         2  1.2000us  1.0880us  1.3120us  [CUDA memset]
      API calls:   89.60%  234.07ms         6  39.012ms  500.14us  231.37ms  cudaMalloc
                    7.63%  19.926ms         6  3.3210ms  549.43us  7.4957ms  cudaMemcpy
                    1.63%  4.2637ms         1  4.2637ms  4.2637ms  4.2637ms  cudaDeviceSynchronize
                    0.84%  2.2045ms        96  22.963us     199ns  937.25us  cuDeviceGetAttribute
                    0.18%  474.48us         1  474.48us  474.48us  474.48us  cuDeviceTotalMem
                    0.08%  200.31us         1  200.31us  200.31us  200.31us  cuDeviceGetName
                    0.02%  53.459us         2  26.729us  16.372us  37.087us  cudaMemset
                    0.01%  28.464us         2  14.232us  8.6610us  19.803us  cudaLaunchKernel
                    0.00%  2.6930us         1  2.6930us  2.6930us  2.6930us  cuDeviceGetPCIBusId
                    0.00%  2.4210us         3     807ns     201ns  1.5420us  cuDeviceGetCount
                    0.00%  1.4000us         2     700ns     259ns  1.1410us  cuDeviceGet
                    0.00%     469ns         1     469ns     469ns     469ns  cuDeviceGetUuid
                    0.00%     396ns         1     396ns     396ns     396ns  cudaGetLastError
```

* With a 1000000 × 1000000 matrix, CTE achieves roughly a 2x speedup over 1D Decomposition on a Tesla P100.

#### NVIDIA Visual Profiler

##### 1D Decomposition

![](https://i.imgur.com/3GZ5Mrp.png)

##### CTE

![](https://i.imgur.com/3NTMoXk.png)

* With CTE, the number of idle threads within a warp drops significantly, which matches the paper's argument.

#### GPGPU-Sim

[GPGPU-Sim Full Output](/EMUhznrvTCSjACCQGpuOgg)

##### Comparison

| | 1D Decomposition | CTE |
| -------- | -------- | -------- |
| gpu_sim_cycle | 448716 | 163759 |
| gpu_sim_insn | 16033343 | 171687628 |
| gpu_ipc | 35.7316 | 1048.4165 |
| gpu_occupancy | 96.6536% | 84.5215% |
| gpu_stall_dramfull | 3885801 | 3885801 |
| L2_BW | 429.4661 GB/Sec | 543.4926 GB/Sec |
| L1D_total_cache_accesses | 4785235 | 6908590 |
| L1D_total_cache_misses | 4463600 | 6501025 |
| L1D_total_cache_miss_rate | 0.9328 | 0.9410 |
| L1D_total_cache_reservation_fails | 1288862 | 1288870 |
| gpgpu_n_stall_shd_mem | 5602915 | 7559832 |
| averagemflatency | 1737 | 1276 |
| avg_icnt2mem_latency | 1163 | 818 |
| avg_mrq_latency | 120 | 103 |
| avg_icnt2sh_latency | 17 | 16 |
| L2_total_cache_accesses | 4411820 | 6449407 |
| L2_total_cache_misses | 2314247 | 2742590 |
| L2_total_cache_miss_rate | 0.5246 | 0.4252 |
| L2_total_cache_pending_hits | 7584 | 8833 |
| L2_total_cache_reservation_fails | 15692 | 15692 |
| Warp Stall | 84887 | 4823993 |
| W0_Idle | 362995 | 714073 |
| W0_Scoreboard | 49422861 | 56411601 |

![](https://i.imgur.com/Ns5CDLs.png)

* With CTE, all threads in a warp are active most of the time, which is why it outperforms 1D Decomposition.

### Note

#### Fails on GPUs beyond the Volta architecture

> Independent Thread Scheduling can lead to a rather different set of threads participating in the executed code than intended if the developer made assumptions about warp-synchronicity of previous hardware architectures. In particular, any warp-synchronous code (such as synchronization-free, intra-warp reductions) should be revisited to ensure compatibility with Volta and beyond. See Compute Capability 7.x for further details.

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capability-7-x
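The v5 kernel is exactly this kind of warp-synchronous code: it relies on `volatile` shared memory and lockstep execution rather than explicit synchronization, both for the `scans` exchange and for the intra-warp segmented reduction. One possible (untested) adaptation for Volta and later is to make every warp-level hand-off explicit with `__syncwarp()`. The sketch below reuses the variable names from the v5 kernel and restructures the fine-task loop so that every lane reaches every barrier; it is only a sketch under those assumptions, not the code that produced the numbers above.

```cpp
// Sketch for Volta+ (Independent Thread Scheduling): explicit __syncwarp()
// barriers instead of implicit warp-synchronous execution.
// Variable names are the ones used in the v5 spmv_cte kernel above.

// Each lane's entry in scans[][] is read by other lanes (the warp-wide
// num_fine_tasks and the binary_search), so publish it before anyone reads it.
scans[warp_id][lane_id] = (global_thread_id+1 <= num_rows) ?
    A_offsets[global_thread_id+1] : A_offsets[num_rows];
__syncwarp();

// ... global_fine_task_start_id, reds init and num_fine_tasks as in v5 ...

// Every lane must reach every __syncwarp(), so all lanes execute the same
// number of iterations; lanes past num_fine_tasks just contribute zero.
for (int base = 0; base < num_fine_tasks; base += THREADS_PER_WARP) {
    int fine_task_id = base + lane_id;
    bool active = (fine_task_id < num_fine_tasks);
    int global_fine_task_id = global_fine_task_start_id + fine_task_id;

    mapped[warp_id][lane_id] = active ?
        (int)(A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]]) : 0;
    __syncwarp();

    int coarse_task_id = 0, in_seg_idx = 0, seg_size = 0;
    if (active) {
        coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
        int seg_start_id = (coarse_task_id==0) ?
            0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
        in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
        seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id,
                       THREADS_PER_WARP-lane_id) + in_seg_idx;
    }

    // Segmented reduction: separate the neighbour read from the accumulation
    // with a barrier so no lane reads a value another lane is about to update.
    for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1) {
        int neighbour = (active && in_seg_idx + stride < seg_size) ?
            mapped[warp_id][lane_id+stride] : 0;
        __syncwarp();
        if (active && in_seg_idx + stride < seg_size)
            mapped[warp_id][lane_id] += neighbour;
        __syncwarp();
    }

    if (active && in_seg_idx == 0)
        reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
    __syncwarp();   // make the reds update visible to later iterations / the final store
}

if (global_thread_id < num_rows)
    result[global_thread_id] = reds[warp_id][lane_id];
```

With the explicit barriers providing the memory ordering, the `volatile` qualifiers on the shared arrays are no longer what the code depends on; alternatively, the reduction could be rewritten with `__shfl_down_sync`, which would avoid the shared-memory `mapped` array altogether.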
## Previous versions

### v1

:::spoiler
```cpp=
__device__ __forceinline__ int binary_search(const int* arr, int value, int left, int right)
{
    // find the largest index in [left, right) where arr[index] <= value
    // (assume we can always find a solution)
    while (left < right) {
        int mid = (left + right) >> 1;
        if (arr[mid] <= value)
            left = mid + 1;
        else
            right = mid;
    }
    return left - 1;
}

template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
                         const float* __restrict__ A_values,
                         const int* __restrict__ A_columns,
                         const int* __restrict__ A_offsets,
                         const float* __restrict__ x,
                         float* __restrict__ result)
{
    // statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
    static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0,
                  "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
    constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
    volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];

    int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / THREADS_PER_WARP;
    int warp_id = threadIdx.x / THREADS_PER_WARP;
    int lane_id = threadIdx.x % THREADS_PER_WARP;

    int first_row_in_warp = global_warp_id * THREADS_PER_WARP;
    int last_row_in_warp = min(num_rows, global_warp_id * THREADS_PER_WARP + THREADS_PER_WARP);
    int first_fine_task_id_in_warp = A_offsets[first_row_in_warp];
    int last_fine_task_id_in_warp = A_offsets[last_row_in_warp];

    for (int fine_task_id = first_fine_task_id_in_warp + lane_id;
         fine_task_id < last_fine_task_id_in_warp;
         fine_task_id += THREADS_PER_WARP) {
        int row = binary_search(A_offsets, fine_task_id, first_row_in_warp, last_row_in_warp);
        mapped[warp_id][lane_id] = A_values[fine_task_id] * x[A_columns[fine_task_id]];
        int in_seg_idx = min(lane_id, fine_task_id-A_offsets[row]);
        int seg_size = min(A_offsets[row+1]-fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
        for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
            if (in_seg_idx + stride < seg_size)
                mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
        if (in_seg_idx == 0)
            result[row] += mapped[warp_id][lane_id];
    }
}
```
:::

### v2

:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
                         const float* __restrict__ A_values,
                         const int* __restrict__ A_columns,
                         const int* __restrict__ A_offsets,
                         const float* __restrict__ x,
                         float* __restrict__ result)
{
    // statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
    static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0,
                  "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");

    int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int global_warp_id = global_thread_id / THREADS_PER_WARP;
    int warp_id = threadIdx.x / THREADS_PER_WARP;
    int lane_id = threadIdx.x % THREADS_PER_WARP;

    constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
    volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
    if (global_thread_id < num_rows)
        scans[warp_id][lane_id] = A_offsets[global_thread_id+1];
    else
        scans[warp_id][lane_id] = A_offsets[num_rows];
    int fine_task_start_id = A_offsets[global_warp_id*THREADS_PER_WARP];

    volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
    int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - fine_task_start_id;
    for (int fine_task_id = lane_id; fine_task_id < num_fine_tasks;
         fine_task_id += THREADS_PER_WARP) {
        int global_fine_task_id = fine_task_start_id + fine_task_id;
        mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
        int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
        int seg_start_id = (coarse_task_id==0) ? 0 : (scans[warp_id][coarse_task_id-1]-fine_task_start_id);
        int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
        int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id, THREADS_PER_WARP-lane_id) + in_seg_idx;
        for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
            if (in_seg_idx + stride < seg_size)
                mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
        int global_coarse_task_id = global_warp_id * THREADS_PER_WARP + coarse_task_id;
        if (in_seg_idx == 0)
            result[global_coarse_task_id] += mapped[warp_id][lane_id];
    }
}
```
:::

### v3

:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
                         const float* __restrict__ A_values,
                         const int* __restrict__ A_columns,
                         const int* __restrict__ A_offsets,
                         const float* __restrict__ x,
                         float* __restrict__ result)
{
    // statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
    static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0,
                  "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");

    int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = threadIdx.x / THREADS_PER_WARP;
    int lane_id = threadIdx.x % THREADS_PER_WARP;

    constexpr int WARPS_PER_BLOCK = (THREADS_PER_BLOCK+THREADS_PER_WARP-1) / THREADS_PER_WARP;
    volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
    scans[warp_id][lane_id] = (global_thread_id < num_rows) ?
        A_offsets[global_thread_id+1] : A_offsets[num_rows];
    int global_fine_task_start_id = A_offsets[global_thread_id - lane_id];

    volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
    volatile __shared__ int reds[WARPS_PER_BLOCK][THREADS_PER_WARP];
    reds[warp_id][lane_id] = 0;

    int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
    for (int fine_task_id = lane_id; fine_task_id < num_fine_tasks;
         fine_task_id += THREADS_PER_WARP) {
        int global_fine_task_id = global_fine_task_start_id + fine_task_id;
        mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
        int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
        int seg_start_id = (coarse_task_id==0) ?
            0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
        int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
        int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id,
                           THREADS_PER_WARP-lane_id) + in_seg_idx;
        for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1)
            if (in_seg_idx + stride < seg_size)
                mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
        if (in_seg_idx == 0)
            reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
    }

    if (global_thread_id < num_rows)
        result[global_thread_id] += reds[warp_id][lane_id];
}
```
:::

### v4

:::spoiler
```cpp=
template <int THREADS_PER_BLOCK, int THREADS_PER_WARP = 32>
__global__ void spmv_cte(int num_rows,
                         const float* __restrict__ A_values,
                         const int* __restrict__ A_columns,
                         const int* __restrict__ A_offsets,
                         const float* __restrict__ x,
                         float* __restrict__ result)
{
    // statically assert that THREADS_PER_BLOCK is a multiple of THREADS_PER_WARP
    static_assert(THREADS_PER_BLOCK % THREADS_PER_WARP == 0,
                  "THREADS_PER_BLOCK must be a multiple of THREADS_PER_WARP");
    constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_WARP;
    volatile __shared__ int scans[WARPS_PER_BLOCK][THREADS_PER_WARP];
    volatile __shared__ int mapped[WARPS_PER_BLOCK][THREADS_PER_WARP];
    volatile __shared__ int reds[WARPS_PER_BLOCK][THREADS_PER_WARP];

    int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = threadIdx.x / THREADS_PER_WARP;
    int lane_id = threadIdx.x % THREADS_PER_WARP;

    // TODO: optimize load efficiency
    scans[warp_id][lane_id] = (global_thread_id < num_rows) ?
        A_offsets[global_thread_id+1] : A_offsets[num_rows];
    int global_fine_task_start_id = A_offsets[global_thread_id-lane_id];
    reds[warp_id][lane_id] = 0;

    int num_fine_tasks = scans[warp_id][THREADS_PER_WARP-1] - global_fine_task_start_id;
    for (int fine_task_id = lane_id; fine_task_id < num_fine_tasks;
         fine_task_id += THREADS_PER_WARP) {
        int global_fine_task_id = global_fine_task_start_id + fine_task_id;
        mapped[warp_id][lane_id] = A_values[global_fine_task_id] * x[A_columns[global_fine_task_id]];
        int coarse_task_id = binary_search(scans[warp_id], global_fine_task_id, 0, THREADS_PER_WARP);
        int seg_start_id = (coarse_task_id==0) ?
            0 : (scans[warp_id][coarse_task_id-1] - global_fine_task_start_id);
        int in_seg_idx = min(lane_id, fine_task_id-seg_start_id);
        int seg_size = min(scans[warp_id][coarse_task_id]-global_fine_task_id,
                           THREADS_PER_WARP-lane_id) + in_seg_idx;
        for (int stride = THREADS_PER_WARP >> 1; stride > 0; stride >>= 1) {
            if (in_seg_idx + stride < seg_size)
                mapped[warp_id][lane_id] += mapped[warp_id][lane_id+stride];
        }
        if (in_seg_idx == 0)
            reds[warp_id][coarse_task_id] += mapped[warp_id][lane_id];
    }

    if (global_thread_id < num_rows)
        result[global_thread_id] = reds[warp_id][lane_id];
}
```
:::