---
tags: 111上
---

# Parallel Programming HW6

> 310552027 郭家良

### Q1 (5 points): Explain your implementation. How do you optimize the performance of convolution?

- In the following experiments, I use a 7x7 filter and an input image of size 400x600.
- Run `$ ./conv`.

#### 1. First, I implement a basic `hostFE.c` and `kernel.cl`

- `hostFE.c`

```c=1
#include <stdio.h>
#include <stdlib.h>
#include "hostFE.h"
#include "helper.h"
#include <CL/cl.h>

#define LOCAL_SIZE 10

void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
            float *inputImage, float *outputImage, cl_device_id *device,
            cl_context *context, cl_program *program)
{
    cl_int status;
    int filterSize = filterWidth * filterWidth;
    int imgSize = imageHeight * imageWidth;

    // Create a command queue
    cl_command_queue command_queue;
    command_queue = clCreateCommandQueue(*context, *device, 0, &status);

    // Create memory buffers on the device
    cl_mem filter_device = clCreateBuffer(*context, CL_MEM_READ_ONLY,
                                          filterSize * sizeof(float), NULL, &status);
    cl_mem input_img_device = clCreateBuffer(*context, CL_MEM_READ_ONLY,
                                             imgSize * sizeof(float), NULL, &status);
    cl_mem output_img_device = clCreateBuffer(*context, CL_MEM_WRITE_ONLY,
                                              imgSize * sizeof(float), NULL, &status);

    // Copy the image and filter data to the device
    status = clEnqueueWriteBuffer(command_queue, input_img_device, CL_TRUE, 0,
                                  sizeof(float) * imgSize, inputImage, 0, NULL, NULL);
    CHECK(status, "clEnqueueWriteBuffer");
    status = clEnqueueWriteBuffer(command_queue, filter_device, CL_TRUE, 0,
                                  sizeof(float) * filterSize, filter, 0, NULL, NULL);
    CHECK(status, "clEnqueueWriteBuffer");

    // Create the kernel function
    cl_kernel kernel = clCreateKernel(*program, "convolution", &status);
    CHECK(status, "clCreateKernel");

    // Set the kernel arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output_img_device);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&input_img_device);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&filter_device);
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&imageWidth);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&imageHeight);
    clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&filterWidth);

    // Execute the kernel
    size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
    size_t global_size[2] = {imageWidth, imageHeight};
    status = clEnqueueNDRangeKernel(command_queue, kernel, 2, 0, global_size,
                                    local_size, 0, NULL, NULL);
    CHECK(status, "clEnqueueNDRangeKernel");

    // Copy the output image data back to the host
    status = clEnqueueReadBuffer(command_queue, output_img_device, CL_TRUE, 0,
                                 sizeof(float) * imgSize, (void *)outputImage, 0, NULL, NULL);
    CHECK(status, "clEnqueueReadBuffer");

    // Release resources
    status = clReleaseCommandQueue(command_queue);
    status = clReleaseMemObject(input_img_device);
    status = clReleaseMemObject(output_img_device);
    status = clReleaseMemObject(filter_device);
    status = clReleaseKernel(kernel);
}
```

- `kernel.cl`

```clike=1
__kernel void convolution(
    __global float *outputImage, __global const float *inputImage,
    __constant float *filter,
    int imgWidth, int imgHeight, int filterWidth)
{
    int gc = get_global_id(0);
    int gr = get_global_id(1);

    int halffilterSize = filterWidth / 2;
    float sum = 0.0f;

    // Each work item computes one output pixel directly from global memory.
    for (int kr = -halffilterSize; kr <= halffilterSize; kr++)
    {
        for (int kc = -halffilterSize; kc <= halffilterSize; kc++)
        {
            if (gr + kr >= 0 && gr + kr < imgHeight &&
                gc + kc >= 0 && gc + kc < imgWidth)
            {
                sum += (inputImage[(gr + kr) * imgWidth + gc + kc] *
                        filter[(kr + halffilterSize) * filterWidth + kc + halffilterSize]);
            }
        }
    }
    outputImage[gr * imgWidth + gc] = sum;
}
```

- result

![](https://i.imgur.com/JKItFtd.png)

#### 2. Modify the local size

- `hostFE.c`

```diff
- #define LOCAL_SIZE 10
+ #define LOCAL_SIZE 25
```

- result

![](https://i.imgur.com/2oeG0p8.png)
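Raising `LOCAL_SIZE` from 10 to 25 enlarges each work group from 100 to 625 work items. In OpenCL 1.x each dimension of the global size must be a multiple of the local size, so the candidates are common divisors of 600 and 400, and 25 is the largest one whose square still fits in a single work group (the limit is typically 1024 on this class of GPU). A minimal host-side sanity check for this choice could look like the sketch below; it is illustrative only and not part of my submitted `hostFE.c`.

```c
/* Illustrative sanity check (not in the submitted hostFE.c):
 * make sure the chosen work-group size is legal on this device. */
size_t max_wg_size;
clGetDeviceInfo(*device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                sizeof(size_t), &max_wg_size, NULL);

if ((size_t)(LOCAL_SIZE * LOCAL_SIZE) > max_wg_size ||
    imageWidth % LOCAL_SIZE != 0 || imageHeight % LOCAL_SIZE != 0)
{
    /* Fall back: pass NULL as local_work_size to clEnqueueNDRangeKernel
     * and let the runtime pick a legal work-group size instead. */
}
```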
#### 3. Tiling version

- concept

The main reason to implement the tiling version is that each work item accesses GPU global memory many times: one image read and one filter read per filter tap, plus one output write, i.e. $2 \times \text{filter\_width} \times \text{filter\_width} + 1$ accesses. Thus, if we make good use of shared (local) memory and parallelism, we can drastically reduce the number of global memory accesses.

The intuitive way is to let the (x, y)-th work item load the (x, y)-th pixel of the image from global memory. However, image convolution has a utilization-rate issue because we need to take "padding" into account: some work items would return immediately after loading their pixel.

![](https://i.imgur.com/xtMjZlq.png)

From the above figure, we can see that only the work items in the **orange** part can do the convolution operation. Here are some examples of the utilization rate:

![](https://i.imgur.com/JHUDzvc.png)

Hence, I implement a modified tiling scheme: each work item loads **another eight pixels** from global memory, as in the examples below:

- work item (0, 0)
![](https://i.imgur.com/C9QG4jt.png)
- work item (0, 1)
![](https://i.imgur.com/gCqLOAY.png)
- work item (1, 0)
![](https://i.imgur.com/mibuDm0.png)

Consequently, we achieve a 100% utilization rate for all work items on the GPU.

:::info
- Note: I also observe that loading only **another four pixels** (top-left, top-right, bottom-left, bottom-right) is sufficient for a correct implementation; this is the variant used in the kernel below.
:::

- modify `kernel.cl`

```clike=1
__kernel void convolution(
    __global float *outputImage, __global const float *inputImage,
    __global const float *filter,
    int imgWidth, int imgHeight, int filterWidth)
{
    int halffilterSize = filterWidth / 2;
    float sum = 0.0f;

    const int gr = get_global_id(1);
    const int gr_npad = gr - halffilterSize;
    const int gr_pad = gr + halffilterSize;
    const int gc = get_global_id(0);
    const int gc_npad = gc - halffilterSize;
    const int gc_pad = gc + halffilterSize;

    const int x = get_local_id(0);
    const int x_pad = x + halffilterSize;
    const int x_2pad = x + halffilterSize + halffilterSize;
    const int y = get_local_id(1);
    const int y_pad = y + halffilterSize;
    const int y_2pad = y + halffilterSize + halffilterSize;

    if (gc >= imgWidth || gr >= imgHeight)
        return;

    __local float shared_patch[32][32];
    __local float share_filter[16][16];

    // tiling : load patch and kernel to shared memory in a SM
    shared_patch[y_pad][x_pad] = inputImage[gr * imgWidth + gc];
    // top-left part
    shared_patch[y][x] = (gr_npad >= 0 && gc_npad >= 0) ?
                         inputImage[gr_npad * imgWidth + gc_npad] : 0.0f;
    // top-right part
    shared_patch[y][x_2pad] = (gr_npad >= 0 && gc_pad < imgWidth) ?
                              inputImage[gr_npad * imgWidth + gc_pad] : 0.0f;
    // bottom-left part
    shared_patch[y_2pad][x] = (gr_pad < imgHeight && gc_npad >= 0) ?
                              inputImage[gr_pad * imgWidth + gc_npad] : 0.0f;
    // bottom-right part
    shared_patch[y_2pad][x_2pad] = (gr_pad < imgHeight && gc_pad < imgWidth) ?
                                   inputImage[gr_pad * imgWidth + gc_pad] : 0.0f;

    share_filter[y % filterWidth][x % filterWidth] =
        filter[(y % filterWidth) * filterWidth + x % filterWidth];

    barrier(CLK_LOCAL_MEM_FENCE);

    for (int kr = -halffilterSize; kr <= halffilterSize; kr++)
    {
        for (int kc = -halffilterSize; kc <= halffilterSize; kc++)
        {
            if (gr + kr >= 0 && gr + kr < imgHeight &&
                gc + kc >= 0 && gc + kc < imgWidth)
            {
                sum += shared_patch[y + kr + halffilterSize][x + kc + halffilterSize] *
                       share_filter[kr + halffilterSize][kc + halffilterSize];
            }
        }
    }
    outputImage[gr * imgWidth + gc] = sum;
}
```

- result

![](https://i.imgur.com/sko7Sqh.png)
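One constraint of this kernel is that the `__local` arrays are statically sized: `shared_patch` must hold `LOCAL_SIZE + 2 * halffilterSize` rows and columns, so with `LOCAL_SIZE = 25` and a 7x7 filter the 32x32 array is just large enough (31x31 is actually used), and the two arrays occupy about 5 KB of local memory per work group. A rough host-side check of these assumptions (illustrative only, not part of my submission) could be:

```c
/* Illustrative check (not in the submitted code): the __local arrays in
 * kernel.cl are hard-coded to 32x32 and 16x16, so verify the configuration
 * actually fits before enqueueing the kernel. */
int half = filterWidth / 2;                                  /* 3 for a 7x7 filter         */
int tile = LOCAL_SIZE + 2 * half;                            /* 25 + 6 = 31, must be <= 32 */
size_t local_bytes = (32 * 32 + 16 * 16) * sizeof(float);    /* about 5 KB                 */

cl_ulong local_mem_size;
clGetDeviceInfo(*device, CL_DEVICE_LOCAL_MEM_SIZE,
                sizeof(cl_ulong), &local_mem_size, NULL);

if (tile > 32 || filterWidth > 16 || local_bytes > local_mem_size)
{
    /* enlarge shared_patch / share_filter in kernel.cl, or reduce LOCAL_SIZE */
}
```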
#### 4. Use `__constant` instead of `__global const` for the filter

- modify `kernel.cl`

```diff
 __kernel void convolution(
     __global float *outputImage, __global const float *inputImage,
-    __global const float *filter,
+    __constant float *filter,
     int imgWidth, int imgHeight, int filterWidth)
```

- result

![](https://i.imgur.com/bDHEWGc.png)

### Q2 (10 points): Rewrite the program using CUDA. (1) Explain your CUDA implementation, (2) plot a chart to show the performance difference between using OpenCL and CUDA, and (3) explain the result.

- I modify the `Makefile` as below:

```makefile
default: conv

NVCC = nvcc
CC = gcc-10
FLAGS = -O3 -lOpenCL -m64 -ffloat-store -w -g
CUDA_LINK_FLAGS = -rdc=true -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC'
CUDA_COMPILE_FLAGS = --device-c -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC' -g -O3

OBJS = main.o bmpfuncs.o hostFE.o serialConv.o helper.o

conv: $(OBJS)
ifdef USE_CUDA
	$(NVCC) ${CUDA_LINK_FLAGS} -o $@ $(OBJS) -lOpenCL
else
	$(CC) -o $@ $(OBJS) $(FLAGS)
endif

%.o: %.c
	$(CC) -c $(FLAGS) $< -o $@

hostFE.o: hostFE.cu hostFE.h
ifdef USE_CUDA
	${NVCC} ${CUDA_COMPILE_FLAGS} -c hostFE.cu -o $@
endif

clean:
	rm -f conv *.o output.bmp ref.bmp
```

- To compile the CUDA version of the image convolution, use the command below:

```shell
$ make USE_CUDA=1
```

#### (1) Similar to Q1, I implement three versions of the CUDA convolution (basic, tiling, tiling + `__constant__`)

- result of the basic implementation

![](https://i.imgur.com/9J17mqD.png)

- result of the tiling implementation

![](https://i.imgur.com/UljcZ0r.png)

- result of the tiling + `__constant__` implementation

![](https://i.imgur.com/DP29wab.png)

- The tiling strategy is the same as in my OpenCL implementation.
- Add a new file: `hostFE.cu`

```cpp=1
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include "helper.h"
extern "C" {
#include "hostFE.h"
}

#define IMAGE_CACHE_WIDTH 64
#define KERNEL_CACHE_WIDTH 32
#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 16

float *d_img, *d_ans;
__constant__ float d_kernel[1024];

__global__ void conv(float *d_ans, float *d_img, int width, int height, int k_size, int pad)
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int r_npad = r - pad;
    const int r_pad = r + pad;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int c_npad = c - pad;
    const int c_pad = c + pad;

    const int x = threadIdx.x;
    const int x_pad = x + pad;
    const int x_2pad = x + pad + pad;
    const int y = threadIdx.y;
    const int y_pad = y + pad;
    const int y_2pad = y + pad + pad;

    if (r >= height || c >= width)
        return;

    __shared__ float shared_patch[IMAGE_CACHE_WIDTH][IMAGE_CACHE_WIDTH];
    __shared__ float shared_kernel[KERNEL_CACHE_WIDTH][KERNEL_CACHE_WIDTH];

    // tiling : load patch and kernel to shared memory in a SM
    shared_patch[y_pad][x_pad] = d_img[r * width + c];
    // top-left part
    shared_patch[y][x] = (r_npad >= 0 && c_npad >= 0) ?
                         d_img[r_npad * width + c_npad] : 0.0f;
    // top-right part
    shared_patch[y][x_2pad] = (r_npad >= 0 && c_pad < width) ?
                              d_img[r_npad * width + c_pad] : 0.0f;
    // bottom-left part
    shared_patch[y_2pad][x] = (r_pad < height && c_npad >= 0) ?
                              d_img[r_pad * width + c_npad] : 0.0f;
    // bottom-right part
    shared_patch[y_2pad][x_2pad] = (r_pad < height && c_pad < width) ?
                                   d_img[r_pad * width + c_pad] : 0.0f;

    shared_kernel[y % k_size][x % k_size] = d_kernel[(y % k_size) * k_size + x % k_size];

    __syncthreads();

    float res = 0.0;
    for (int kr = -pad; kr <= pad; kr++)
    {
        for (int kc = -pad; kc <= pad; kc++)
        {
            res += shared_patch[y + kr + pad][x + kc + pad] *
                   shared_kernel[kr + pad][kc + pad];
        }
    }
    d_ans[r * width + c] = res;
}

void mallocKernelAndAns(float *kernel_arr, int width, int height, int k_size, int pad)
{
    cudaMalloc((void **)&d_img, width * height * sizeof(float));
    cudaMalloc((void **)&d_ans, width * height * sizeof(float));
    cudaMemcpyToSymbol(d_kernel, kernel_arr, k_size * k_size * sizeof(float), 0,
                       cudaMemcpyHostToDevice);
}

void convolution(float *img_arr, float *ans_arr, int width, int height, int k_size, int pad)
{
    // copy the input image to the device
    cudaMemcpy(d_img, img_arr, width * height * sizeof(float), cudaMemcpyHostToDevice);

    dim3 blockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 numBlock(width / BLOCK_SIZE_X + 1, height / BLOCK_SIZE_Y + 1);
    conv<<<numBlock, blockSize>>>(d_ans, d_img, width, height, k_size, pad);

    cudaMemcpy(ans_arr, d_ans, width * height * sizeof(float), cudaMemcpyDeviceToHost);
}

void freeKernelAndAns()
{
    cudaFree(d_img);
    cudaFree(d_ans);
    // d_kernel lives in __constant__ memory, so it must not be passed to cudaFree()
}

extern "C" void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
                       float *inputImage, float *outputImage, cl_device_id *device,
                       cl_context *context, cl_program *program)
{
    mallocKernelAndAns(filter, imageWidth, imageHeight, filterWidth, filterWidth / 2);
    convolution(inputImage, outputImage, imageWidth, imageHeight, filterWidth, filterWidth / 2);
    freeKernelAndAns();
}
```
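Unlike the OpenCL host code, which wraps every call with `CHECK`, this CUDA version does not check for errors. A sketch of how launch errors could be surfaced during debugging is shown below; it is illustrative only (not part of the measured code) and would be placed right after the kernel launch in `convolution()`.

```cpp
// Illustrative error check (not in the submitted hostFE.cu): catch a bad launch
// configuration or an error raised while the conv kernel is running.
cudaError_t err = cudaGetLastError();        // launch-time errors (e.g. bad grid/block size)
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();           // errors that occur while the kernel executes
if (err != cudaSuccess)
    fprintf(stderr, "conv kernel failed: %s\n", cudaGetErrorString(err));
```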
#### (2) plot a chart to show the performance difference between using OpenCL and CUDA

![](https://i.imgur.com/SOnHiJu.png)

#### (3) explain the result

- The overall performance of CUDA is better than that of OpenCL. I think the main reason is that CUDA is developed by NVIDIA, which is also the vendor of the GPU in the homework environment (GeForce GTX 1060 6GB), so the CUDA runtime and compiler are tuned specifically for this hardware, while the OpenCL path goes through a more generic driver layer.
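If one wanted to isolate the kernel execution time from the host-device transfers when profiling the CUDA version, CUDA events could be used. The sketch below is illustrative only (it wraps the existing launch inside `convolution()`) and is not how the numbers above were measured.

```cpp
// Illustrative kernel timing with CUDA events (not part of the submitted code).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
conv<<<numBlock, blockSize>>>(d_ans, d_img, width, height, k_size, pad);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("conv kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```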