---
title: Parallel Programming HW6
---

Parallel Programming HW6
===
110550047 巫廷翰

# Q1

## OpenCL

+ `hostFE.c`
    1. Initialize the command queue and create the device buffers.
    2. Set up the kernel arguments.
    3. Enqueue the kernel and read the result back.

```cpp=
void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
            float *inputImage, float *outputImage, cl_device_id *device,
            cl_context *context, cl_program *program)
{
    cl_int status;
    int filterSize = filterWidth * filterWidth;
    int imageSize = imageHeight * imageWidth;

    // Create a command queue
    cl_command_queue commandQueue = clCreateCommandQueue(*context, *device, 0, &status);

    // Create memory buffers on the device; the input image and the filter
    // reuse the host allocations via CL_MEM_USE_HOST_PTR
    cl_mem inputImgMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR,
                                        imageSize * sizeof(float), inputImage, NULL);
    cl_mem filterMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR,
                                      filterSize * sizeof(float), filter, NULL);
    cl_mem outputImgMem = clCreateBuffer(*context, CL_MEM_WRITE_ONLY,
                                         imageSize * sizeof(float), NULL, NULL);

    cl_kernel kernel = clCreateKernel(*program, "convolution", NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputImgMem);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputImgMem);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&filterMem);
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&imageHeight);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&imageWidth);
    clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&filterWidth);

    size_t global_work_size = imageSize;
    size_t local_work_size = 64; // Note: global_work_size must be divisible by local_work_size
    clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_work_size,
                           &local_work_size, 0, NULL, NULL);

    // Blocking read copies the result back to the host
    clEnqueueReadBuffer(commandQueue, outputImgMem, CL_TRUE, 0,
                        imageSize * sizeof(float), outputImage, 0, NULL, NULL);

    // Release OpenCL objects
    clReleaseCommandQueue(commandQueue);
    clReleaseMemObject(inputImgMem);
    clReleaseMemObject(outputImgMem);
    clReleaseMemObject(filterMem);
    clReleaseKernel(kernel);
}
```

+ `kernel.cl`
    1. According to this [forum thread](https://stackoverflow.com/questions/17991714/opencl-difference-between-constant-memory-and-const-global-memory), NVIDIA GPUs cache `__constant` variables, so we can speed up the kernel by placing the filter in `__constant` memory.
    2. Used `#pragma unroll` on the filter loops for loop optimization.

```opencl=
__kernel void convolution(const __global float *inputImage, __global float *outputImage,
                          __constant float *filter, const int imageHeight,
                          const int imageWidth, const int filterWidth)
{
    int idx = get_global_id(0);
    int row = idx / imageWidth;
    int col = idx % imageWidth;
    int halfFilterSize = filterWidth >> 1;
    int k, l;
    float sum = 0.0f;

    #pragma unroll
    for (k = -halfFilterSize; k <= halfFilterSize; k++) {
        // Skip rows that fall outside the image
        if (row + k < 0 || row + k > imageHeight - 1)
            continue;
        #pragma unroll
        for (l = -halfFilterSize; l <= halfFilterSize; l++) {
            // Skip zero filter weights and columns outside the image
            if (filter[(k + halfFilterSize) * filterWidth + l + halfFilterSize] != 0 &&
                col + l > -1 && col + l < imageWidth) {
                sum += inputImage[(row + k) * imageWidth + col + l] *
                       filter[(k + halfFilterSize) * filterWidth + l + halfFilterSize];
            }
        }
    }
    outputImage[row * imageWidth + col] = sum;
}
```

# Q2

![image](https://hackmd.io/_uploads/rJ-zpb3N1x.png)

From the statistics, we can observe that the performance difference between OpenCL and CUDA is small. However, OpenCL seems to perform better at larger filter sizes, which might be caused by OpenCL handling larger memory-access patterns more efficiently. For smaller filters, CUDA's native scheduling and execution model might be more efficient for the smaller computational load.
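For reference, the per-kernel time on the CUDA side could also be isolated with CUDA events instead of wall-clock timing. Below is a minimal sketch (not part of the submitted code): `convolutionKernel` and the device buffers are the ones defined in the CUDA section that follows, and the timing calls are the standard event API from `cuda_runtime.h`.

```cuda=
#include <cuda_runtime.h>

// Sketch: time one launch of convolutionKernel (defined in hostFE.cu below)
// with CUDA events. Assumes d_input, d_output, and d_filter are device
// buffers that were already filled with cudaMemcpy.
float timeConvolutionMs(dim3 grid, dim3 block,
                        const float *d_input, float *d_output, const float *d_filter,
                        int imageHeight, int imageWidth, int filterWidth)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);                 // timestamp just before the launch
    convolutionKernel<<<grid, block>>>(d_input, d_output, d_filter,
                                       imageHeight, imageWidth, filterWidth);
    cudaEventRecord(stop);                  // timestamp just after the launch
    cudaEventSynchronize(stop);             // wait for the kernel to finish

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
```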
## CUDA

+ `hostFE.cu`
    + `cudaMalloc` and `cudaMemcpy` manage the buffers on the GPU.
    + The kernel is launched with 25x25 threads per block and a grid covering the entire image.
    + No shared memory is used for optimization; a possible tiled variant is sketched after the code.

```cuda=
#include <cuda_runtime.h>
#include "helper.h"

extern "C" void hostFE(int filterWidth, float *filter, int imageHeight,
                       int imageWidth, float *inputImage, float *outputImage);

__global__ void convolutionKernel(const float *inputImage, float *outputImage,
                                  const float *filter, const int imageHeight,
                                  const int imageWidth, const int filterWidth)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    if (idx >= imageWidth || idy >= imageHeight)
        return;

    float sum = 0.0f;
    int halfFilterSize = filterWidth >> 1;

    // Calculate bounds once
    int startX = idx - halfFilterSize;
    int startY = idy - halfFilterSize;
    int endX = idx + halfFilterSize;
    int endY = idy + halfFilterSize;

    #pragma unroll
    for (int y = startY, fy = 0; y <= endY; y++, fy++) {
        if (y < 0 || y >= imageHeight)
            continue;
        #pragma unroll
        for (int x = startX, fx = 0; x <= endX; x++, fx++) {
            if (x < 0 || x >= imageWidth)
                continue;
            float filterVal = filter[fy * filterWidth + fx];
            if (filterVal != 0.0f) { // Skip zero filter values
                sum += inputImage[y * imageWidth + x] * filterVal;
            }
        }
    }
    outputImage[idy * imageWidth + idx] = sum;
}

// Host function
void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
            float *inputImage, float *outputImage)
{
    // Allocate device memory
    float *d_input, *d_output, *d_filter;
    int imageSize = imageHeight * imageWidth * sizeof(float);
    int filterSize = filterWidth * filterWidth * sizeof(float);

    cudaMalloc(&d_input, imageSize);
    cudaMalloc(&d_output, imageSize);
    cudaMalloc(&d_filter, filterSize);

    // Copy data to device
    cudaMemcpy(d_input, inputImage, imageSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, filter, filterSize, cudaMemcpyHostToDevice);

    // Set up grid and block dimensions
    dim3 blockDim(25, 25);
    dim3 gridDim((imageWidth + blockDim.x - 1) / blockDim.x,
                 (imageHeight + blockDim.y - 1) / blockDim.y);

    // Launch kernel
    convolutionKernel<<<gridDim, blockDim>>>(d_input, d_output, d_filter,
                                             imageHeight, imageWidth, filterWidth);

    // Copy result back to host (implicitly synchronizes with the kernel)
    cudaMemcpy(outputImage, d_output, imageSize, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_filter);
}
```
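The last bullet notes that shared memory was not used. As an illustration of what that optimization could look like (not part of the submitted code), here is a minimal sketch of a tiled variant: each block stages its input tile plus the filter halo in shared memory once, so the threads in a block reuse those loads instead of re-reading global memory. It assumes a square thread block (such as the 25x25 above) and that the tile size is passed as dynamic shared memory at launch; out-of-range pixels are padded with zero, matching the boundary handling of the kernel above.

```cuda=
__global__ void convolutionSharedKernel(const float *inputImage, float *outputImage,
                                        const float *filter, const int imageHeight,
                                        const int imageWidth, const int filterWidth)
{
    extern __shared__ float tile[]; // (blockDim.x + filterWidth - 1)^2 floats

    int half = filterWidth >> 1;
    int tileW = blockDim.x + filterWidth - 1; // assumes blockDim.x == blockDim.y

    int outX = blockIdx.x * blockDim.x + threadIdx.x;
    int outY = blockIdx.y * blockDim.y + threadIdx.y;

    // Top-left corner of the padded region this block reads
    int originX = blockIdx.x * blockDim.x - half;
    int originY = blockIdx.y * blockDim.y - half;

    // Cooperatively load the tile plus halo; out-of-range pixels become 0
    for (int ty = threadIdx.y; ty < tileW; ty += blockDim.y) {
        for (int tx = threadIdx.x; tx < tileW; tx += blockDim.x) {
            int gx = originX + tx;
            int gy = originY + ty;
            tile[ty * tileW + tx] =
                (gx >= 0 && gx < imageWidth && gy >= 0 && gy < imageHeight)
                    ? inputImage[gy * imageWidth + gx] : 0.0f;
        }
    }
    __syncthreads(); // all loads must finish before any thread reads the tile

    if (outX >= imageWidth || outY >= imageHeight)
        return;

    float sum = 0.0f;
    for (int fy = 0; fy < filterWidth; fy++)
        for (int fx = 0; fx < filterWidth; fx++)
            sum += tile[(threadIdx.y + fy) * tileW + threadIdx.x + fx] *
                   filter[fy * filterWidth + fx];

    outputImage[outY * imageWidth + outX] = sum;
}

// Host-side launch (sketch): the tile needs tileW * tileW floats of shared
// memory, which must fit the device limit (typically 48 KB per block).
// int tileW = blockDim.x + filterWidth - 1;
// convolutionSharedKernel<<<gridDim, blockDim, tileW * tileW * sizeof(float)>>>(
//     d_input, d_output, d_filter, imageHeight, imageWidth, filterWidth);
```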