---
title: Parallel Programming HW6
---
Parallel Programming HW6
===
110550047 巫廷翰
# Q1
## OpenCL
+ `hostFE.c`
1. Initialize the command queue and device buffers
2. Set the kernel arguments
3. Enqueue the kernel and read back the result
```cpp=
void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
            float *inputImage, float *outputImage, cl_device_id *device,
            cl_context *context, cl_program *program)
{
    int filterSize = filterWidth * filterWidth;
    int imageSize = imageHeight * imageWidth;

    // Create a command queue
    cl_command_queue commandQueue = clCreateCommandQueue(*context, *device, 0, NULL);

    // Create memory buffers on the device
    cl_mem inputImgMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR,
                                        imageSize * sizeof(float), inputImage, NULL);
    cl_mem filterMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR,
                                      filterSize * sizeof(float), filter, NULL);
    cl_mem outputImgMem = clCreateBuffer(*context, CL_MEM_WRITE_ONLY,
                                         imageSize * sizeof(float), NULL, NULL);

    // Create the kernel and set its arguments
    cl_kernel kernel = clCreateKernel(*program, "convolution", NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputImgMem);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputImgMem);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&filterMem);
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&imageHeight);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&imageWidth);
    clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&filterWidth);

    // Launch the kernel: one work-item per pixel
    size_t global_work_size = imageSize;
    size_t local_work_size = 64; // Note: global_work_size must be divisible by local_work_size
    clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_work_size,
                           &local_work_size, 0, NULL, NULL);

    // Blocking read copies the result back to the host
    clEnqueueReadBuffer(commandQueue, outputImgMem, CL_TRUE, 0,
                        imageSize * sizeof(float), outputImage, 0, NULL, NULL);

    // Release OpenCL objects
    // clReleaseCommandQueue(commandQueue);
    // clReleaseMemObject(inputImgMem);
    // clReleaseMemObject(outputImgMem);
    // clReleaseMemObject(filterMem);
    // clReleaseKernel(kernel);
}
```
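The `local_work_size` note above assumes `imageSize` is a multiple of 64, which must hold for this launch to be valid. A minimal sketch of the usual fallback when that assumption does not hold (hypothetical; not part of the submitted code): round the global size up to the next multiple of the work-group size and add an early-return guard in the kernel.
```cpp=
// Sketch: pad global_work_size so local_work_size always divides it.
size_t local_work_size = 64;
size_t global_work_size =
    ((imageSize + local_work_size - 1) / local_work_size) * local_work_size;
// The kernel must then guard against the padded work-items, e.g.
//   if (get_global_id(0) >= imageHeight * imageWidth) return;
```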
+ `kernel.cl`
1. According to this [forum](https://stackoverflow.com/questions/17991714/opencl-difference-between-constant-memory-and-const-global-memory), NVIDIA GPUs cache `__constant` variables, so declaring the filter as a `__constant` parameter speeds up the kernel.
2. Used `#pragma unroll` on the filter loops to hint loop unrolling to the compiler.
```opencl=
__kernel void convolution(const __global float *inputImage, __global float *outputImage,
                          __constant float *filter, const int imageHeight,
                          const int imageWidth, const int filterWidth)
{
    int idx = get_global_id(0);
    int row = idx / imageWidth;
    int col = idx % imageWidth;
    int halfFilterSize = filterWidth >> 1;
    float sum = 0.0f;

    #pragma unroll
    for (int k = -halfFilterSize; k <= halfFilterSize; k++) {
        // Skip rows that fall outside the image
        if (row + k < 0 || row + k > imageHeight - 1)
            continue;
        #pragma unroll
        for (int l = -halfFilterSize; l <= halfFilterSize; l++) {
            // Skip zero filter weights and out-of-range columns
            if (filter[(k + halfFilterSize) * filterWidth + l + halfFilterSize] != 0 &&
                col + l > -1 && col + l < imageWidth)
            {
                sum += inputImage[(row + k) * imageWidth + col + l] *
                       filter[(k + halfFilterSize) * filterWidth +
                              l + halfFilterSize];
            }
        }
    }
    outputImage[row * imageWidth + col] = sum;
}
```
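Since the CUDA version below uses a 2D grid while this kernel recovers `row`/`col` from a 1D index with a divide and modulo, a 2D NDRange would be the direct OpenCL analogue. A brief sketch (hypothetical; the submitted code keeps the 1D launch):
```cpp=
// Sketch: 2D launch so each work-item maps directly to (col, row).
// Assumes imageWidth and imageHeight are multiples of the local sizes.
size_t global[2] = {(size_t)imageWidth, (size_t)imageHeight};
size_t local[2]  = {8, 8};
clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global, local, 0, NULL, NULL);
// Kernel side: col = get_global_id(0), row = get_global_id(1),
// removing the div/mod indexing at the top of convolution.
```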
# Q2

From the statistics, we can observe that the overall performance difference between OpenCL and CUDA is small. However, OpenCL seems to perform better with larger filter sizes, which might be caused by more efficient memory handling for larger access patterns.
For smaller filters, CUDA's native scheduling and execution model might be more efficient for the smaller computational load.
## CUDA
+ `hostFE.cu`
+ `cudaMalloc` and `cudaMemcpy` manage the memory on the GPU.
+ Launch the kernel with 25x25 threads per block and a grid covering the entire image.
+ No shared memory is used for optimization.
```cuda=
#include <cuda_runtime.h>
#include "helper.h"

extern "C" void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
                       float *inputImage, float *outputImage);

__global__ void convolutionKernel(const float *inputImage, float *outputImage,
                                  const float *filter, const int imageHeight,
                                  const int imageWidth, const int filterWidth)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x; // column
    int idy = blockDim.y * blockIdx.y + threadIdx.y; // row
    if (idx >= imageWidth || idy >= imageHeight)
        return;

    float sum = 0.0f;
    int halfFilterSize = filterWidth >> 1;

    // Calculate the filter window bounds once
    int startX = idx - halfFilterSize;
    int startY = idy - halfFilterSize;
    int endX = idx + halfFilterSize;
    int endY = idy + halfFilterSize;

    #pragma unroll
    for (int y = startY, fy = 0; y <= endY; y++, fy++) {
        if (y < 0 || y >= imageHeight) continue;
        #pragma unroll
        for (int x = startX, fx = 0; x <= endX; x++, fx++) {
            if (x < 0 || x >= imageWidth) continue;
            float filterVal = filter[fy * filterWidth + fx];
            if (filterVal != 0.0f) { // Skip zero filter values
                sum += inputImage[y * imageWidth + x] * filterVal;
            }
        }
    }
    outputImage[idy * imageWidth + idx] = sum;
}

// Host function
void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
            float *inputImage, float *outputImage)
{
    // Allocate device memory
    float *d_input, *d_output, *d_filter;
    int imageSize = imageHeight * imageWidth * sizeof(float);
    int filterSize = filterWidth * filterWidth * sizeof(float);
    cudaMalloc(&d_input, imageSize);
    cudaMalloc(&d_output, imageSize);
    cudaMalloc(&d_filter, filterSize);

    // Copy input data to the device
    cudaMemcpy(d_input, inputImage, imageSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, filter, filterSize, cudaMemcpyHostToDevice);

    // Set up grid and block dimensions: 25x25 threads per block,
    // enough blocks to cover the whole image
    dim3 blockDim(25, 25);
    dim3 gridDim((imageWidth + blockDim.x - 1) / blockDim.x,
                 (imageHeight + blockDim.y - 1) / blockDim.y);

    // Launch kernel
    convolutionKernel<<<gridDim, blockDim>>>(d_input, d_output, d_filter,
                                             imageHeight, imageWidth, filterWidth);

    // Copy result back to host (this cudaMemcpy synchronizes with the kernel)
    cudaMemcpy(outputImage, d_output, imageSize, cudaMemcpyDeviceToHost);

    // Cleanup
    // cudaFree(d_input);
    // cudaFree(d_output);
    // cudaFree(d_filter);
}
```
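One further optimization that mirrors the OpenCL `__constant` trick would be placing the filter in CUDA constant memory instead of global memory. A hedged sketch (`MAX_FILTER_WIDTH` is a hypothetical compile-time bound; the filter must fit in the 64 KB constant region):
```cuda=
// Sketch: constant-memory filter, analogous to __constant in the OpenCL kernel.
#define MAX_FILTER_WIDTH 25 // hypothetical upper bound on filterWidth
__constant__ float c_filter[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];

// Host side: replace the cudaMalloc/cudaMemcpy of d_filter with
//   cudaMemcpyToSymbol(c_filter, filter, filterSize);
// Kernel side: read c_filter[fy * filterWidth + fx] in place of the
// filter pointer argument.
```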