# PP hw6
## Q1
### hostFE.c
運用 OpenCL 來提高 performance,以下逐步解釋 host 端的程式碼:
1. 創建 buffer:
```cpp=
cl_mem c_filter = clCreateBuffer(*context, CL_MEM_READ_ONLY, filterSize, NULL, &status);
cl_mem c_inputImage = clCreateBuffer(*context, CL_MEM_READ_ONLY, mem_size, NULL, &status);
cl_mem c_outputImage = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, mem_size, NULL, &status);
```
2. 將資料寫入 buffer:
```
clEnqueueWriteBuffer(myqueue, c_filter, CL_TRUE, 0, filterSize, (void *)filter, 0, NULL, NULL);
CHECK(status, "clEnqueueWriteBuffer");
clEnqueueWriteBuffer(myqueue, c_inputImage, CL_TRUE, 0, mem_size, (void *)inputImage, 0, NULL, NULL);
CHECK(status, "clEnqueueWriteBuffer");
```
3. 創建 kernel 物件(對應我們所撰寫的 convolution function),並依序設定其參數:
```
cl_kernel mykernel = clCreateKernel(*program, "convolution", status);
clSetKernelArg(mykernel, 0, sizeof(cl_int), (void *)&filterWidth);
clSetKernelArg(mykernel, 1, sizeof(cl_mem), (void *)&c_filter);
clSetKernelArg(mykernel, 2, sizeof(cl_int), (void *)&imageHeight);
clSetKernelArg(mykernel, 3, sizeof(cl_int), (void *)&imageWidth);
clSetKernelArg(mykernel, 4, sizeof(cl_mem), (void *)&c_inputImage);
clSetKernelArg(mykernel, 5, sizeof(cl_mem), (void *)&c_outputImage);
```
4. 執行 kernel program:
```
size_t localws[2] = {10, 50};
size_t globalws[2] = {imageWidth, imageHeight};
clEnqueueNDRangeKernel(myqueue, mykernel, 2, 0, globalws, localws, 0, NULL, NULL);
CHECK(status, "clEnqueueNDRangeKernel");
```
globalws 是 global work size,也就是每個維度上 work-item 的總數(此處為影像的寬與高);localws 是 local work size,也就是每個 work-group 在各維度上的大小,其取值會影響效能。

5. 讀回結果並回收資源:
```
clEnqueueReadBuffer(myqueue, c_outputImage, CL_TRUE, 0, mem_size, (void *)outputImage, NULL, NULL, NULL);
CHECK(status, "clEnqueueReadBuffer");
clReleaseCommandQueue(myqueue);
clReleaseMemObject(c_filter);
clReleaseMemObject(c_inputImage);
clReleaseMemObject(c_outputImage);
clReleaseKernel(mykernel);
```
### Convolution kernel code
將 serial 的 convolution code 稍作改寫:
```
/*
 * 2D convolution: each work-item computes one output pixel.
 *
 * filterWidth  - side length of the square filter; the loops visit
 *                2 * (filterWidth / 2) + 1 taps per axis
 *                (NOTE(review): presumably filterWidth is odd - confirm).
 * filter       - filter coefficients, row-major, filterWidth per row (read-only).
 * imageHeight  - number of image rows.
 * imageWidth   - number of image columns.
 * inputImage   - source pixels, row-major, imageWidth per row (read-only).
 * outputImage  - destination pixels, same layout as inputImage.
 *
 * Neighbours that fall outside the image are skipped, i.e. they contribute
 * nothing to the sum.
 */
__kernel void convolution(int filterWidth, __global const float *filter, int imageHeight, int imageWidth, __global const float *inputImage, __global float *outputImage)
{
    /* Dimension 1 indexes rows, dimension 0 indexes columns. */
    const int row = get_global_id(1);
    const int col = get_global_id(0);

    /* Work-items outside the image do nothing, so the host may round the
     * global size up without causing an out-of-bounds write. */
    if (row >= imageHeight || col >= imageWidth)
    {
        return;
    }

    const int halffilterSize = filterWidth / 2;
    float sum = 0.0f;

    /* Apply the filter to the neighbourhood centred on (row, col). */
    for (int k = -halffilterSize; k <= halffilterSize; k++)
    {
        const int srcRow = row + k;
        /* The row test does not depend on l, so do it once per filter row. */
        if (srcRow < 0 || srcRow >= imageHeight)
        {
            continue;
        }

        for (int l = -halffilterSize; l <= halffilterSize; l++)
        {
            const int srcCol = col + l;
            if (srcCol < 0 || srcCol >= imageWidth)
            {
                continue;
            }

            sum += inputImage[srcRow * imageWidth + srcCol] *
                   filter[(k + halffilterSize) * filterWidth +
                          l + halffilterSize];
        }
    }

    outputImage[row * imageWidth + col] = sum;
}
```