# PP hw6

## Q1

### hostFE.c

運用 OpenCL 來提高 performance,以下我將解釋程式碼:

1. 創建 buffer:

```cpp=
// Device buffers: the kernel only reads the filter and the input image and
// only writes the output image. Check each call, because the next one
// overwrites `status`.
cl_mem c_filter = clCreateBuffer(*context, CL_MEM_READ_ONLY, filterSize, NULL, &status);
CHECK(status, "clCreateBuffer");
cl_mem c_inputImage = clCreateBuffer(*context, CL_MEM_READ_ONLY, mem_size, NULL, &status);
CHECK(status, "clCreateBuffer");
cl_mem c_outputImage = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, mem_size, NULL, &status);
CHECK(status, "clCreateBuffer");
```

2. 將資料寫入 buffer:

```cpp
// Blocking writes (CL_TRUE). The return code must be stored in `status`,
// otherwise CHECK tests a stale value.
status = clEnqueueWriteBuffer(myqueue, c_filter, CL_TRUE, 0, filterSize, (void *)filter, 0, NULL, NULL);
CHECK(status, "clEnqueueWriteBuffer");
status = clEnqueueWriteBuffer(myqueue, c_inputImage, CL_TRUE, 0, mem_size, (void *)inputImage, 0, NULL, NULL);
CHECK(status, "clEnqueueWriteBuffer");
```

3. 創建 kernel,並設定我們所撰寫的 convolution function 的參數:

```cpp
// clCreateKernel takes a pointer to the error code, so pass &status.
cl_kernel mykernel = clCreateKernel(*program, "convolution", &status);
CHECK(status, "clCreateKernel");
// Argument indices follow the parameter order of the convolution kernel.
clSetKernelArg(mykernel, 0, sizeof(cl_int), (void *)&filterWidth);
clSetKernelArg(mykernel, 1, sizeof(cl_mem), (void *)&c_filter);
clSetKernelArg(mykernel, 2, sizeof(cl_int), (void *)&imageHeight);
clSetKernelArg(mykernel, 3, sizeof(cl_int), (void *)&imageWidth);
clSetKernelArg(mykernel, 4, sizeof(cl_mem), (void *)&c_inputImage);
clSetKernelArg(mykernel, 5, sizeof(cl_mem), (void *)&c_outputImage);
```

4. 執行 kernel:

```cpp
// Dimension 0 is the image column and dimension 1 is the image row.
// Each global size must be a multiple of the matching local size unless the
// device supports non-uniform work-groups (OpenCL 2.0+), and 10 * 50 = 500
// work-items must not exceed the device's maximum work-group size.
size_t localws[2] = {10, 50};
size_t globalws[2] = {imageWidth, imageHeight};
status = clEnqueueNDRangeKernel(myqueue, mykernel, 2, NULL, globalws, localws, 0, NULL, NULL);
CHECK(status, "clEnqueueNDRangeKernel");
```

globalws 是所有 global work size;localws 是 local work size,這個取值將決定效能。

5. 
讀回資料並回收資源:

```cpp
// Blocking read (CL_TRUE) of the result into host memory.
// num_events_in_wait_list is a cl_uint count, so pass 0 rather than NULL.
status = clEnqueueReadBuffer(myqueue, c_outputImage, CL_TRUE, 0, mem_size, (void *)outputImage, 0, NULL, NULL);
CHECK(status, "clEnqueueReadBuffer");
clReleaseCommandQueue(myqueue);
clReleaseMemObject(c_filter);
clReleaseMemObject(c_inputImage);
clReleaseMemObject(c_outputImage);
clReleaseKernel(mykernel);
```

### Convolution kernel code

將 serial 的 convolution code 稍作改寫:

```c
/*
 * Computes one output pixel per work-item.
 *
 * filterWidth  width of the square filter
 * filter       filterWidth * filterWidth coefficients, row-major
 * imageHeight  number of image rows
 * imageWidth   number of image columns
 * inputImage   source pixels, row-major
 * outputImage  destination pixels, row-major
 *
 * Neighbours that fall outside the image are skipped, so they contribute
 * nothing to the sum.
 */
__kernel void convolution(int filterWidth, __global const float *filter,
                          int imageHeight, int imageWidth,
                          __global const float *inputImage,
                          __global float *outputImage)
{
    // Dimension 0 indexes the column, dimension 1 indexes the row.
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the image must not touch memory; this can happen
    // if the host rounds the global size up.
    if (row >= imageHeight || col >= imageWidth) {
        return;
    }

    const int halffilterSize = filterWidth / 2;
    float sum = 0.0f;

    // Apply the filter to the neighborhood of (row, col).
    for (int k = -halffilterSize; k <= halffilterSize; k++) {
        const int y = row + k;
        // The row test does not depend on l, so do it once per filter row.
        if (y < 0 || y >= imageHeight) {
            continue;
        }
        for (int l = -halffilterSize; l <= halffilterSize; l++) {
            const int x = col + l;
            if (x < 0 || x >= imageWidth) {
                continue;
            }
            sum += inputImage[y * imageWidth + x] *
                   filter[(k + halffilterSize) * filterWidth + (l + halffilterSize)];
        }
    }

    outputImage[row * imageWidth + col] = sum;
}
```