# 平行程式作業-hw6
###### tags: `HW` `PP`
## Q1
>Explain your implementation. How do you optimize the performance of convolution?
在HostFE.c中,我使用了CL_MEM_USE_HOST_PTR來傳遞data到device中。
並且將local size設為8
```cpp=
#define LOCAL_SIZE 8
void hostFE(int filterWidth, float *filter, int imageHeight, int imageWidth,
float *inputImage, float *outputImage, cl_device_id *device,
cl_context *context, cl_program *program)
{
cl_int status;
int filterSize = filterWidth * filterWidth * sizeof(float);
int imgsize = imageHeight * imageWidth * sizeof(float);
size_t global_work_size[2] = {imageWidth, imageHeight};
size_t local_work_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
cl_command_queue queue = clCreateCommandQueue( *context, *device, 0, NULL );
cl_mem inputImgMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR, imgsize, inputImage, NULL);
cl_mem filterMem = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR, filterSize, filter, NULL);
cl_mem outputImgMem = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, imgsize, NULL, NULL);
cl_kernel kernel = clCreateKernel(*program, "convolution", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_int), (void *) &filterWidth);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &filterMem);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &inputImgMem);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &outputImgMem);
clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work_size, local_work_size, 0 , NULL ,NULL );
clEnqueueReadBuffer( queue, outputImgMem, CL_TRUE, 0, imgsize, (void *)outputImage, 0, NULL, NULL);
clReleaseCommandQueue( queue );
clReleaseMemObject(filterMem);
clReleaseMemObject(inputImgMem);
clReleaseMemObject(outputImgMem);
clReleaseKernel(kernel);
}
```
在kernel function參考助教給的code並改為以下
```cpp=
__kernel void convolution( int filterWidth, __constant float *filter, __global float *inputImage, __global float *outputImage)
{
// Iterate over the rows of the source image
int halffilterSize = filterWidth / 2;
int imageHeight = get_global_size(1);
int imageWidth = get_global_size(0);
int i = get_global_id(1);
int j = get_global_id(0);
float sum;
int k, l;
sum = 0; // Reset sum for new source pixel
// Apply the filter to the neighborhood
for (k = -halffilterSize; k <= halffilterSize; k++)
{
for (l = -halffilterSize; l <= halffilterSize; l++)
{
if (i + k >= 0 && i + k < imageHeight &&
j + l >= 0 && j + l < imageWidth)
{
sum += inputImage[(i + k) * imageWidth + j + l] *
filter[(k + halffilterSize) * filterWidth +
l + halffilterSize];
}
}
}
outputImage[i * imageWidth + j] = sum;
}
```
[reference1](https://www.cnblogs.com/mikewolf2002/archive/2012/09/05/2671261.html)
[reference2](https://www.cnblogs.com/mikewolf2002/archive/2012/09/07/2675634.html)