# Parallel Programming HW-6

:::info
<font color=#4381FA>***Q1***</font>: (5 points) Explain your implementation. How do you optimize the performance of convolution?
:::

首先，我們把各種所需的參數定義好。（下方片段用到的 `imageSize` 為影像資料的位元組數，其定義未列於此片段。）

```cpp=
int globalSize = imageHeight * imageWidth;
int filterSize = filterWidth * filterWidth * sizeof(float);
```

設置 kernel 及我們所需要的 memory 空間。

```cpp=
cl_kernel kernel = clCreateKernel(*program, "convolution", NULL);
cl_mem filterBuffer = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR, filterSize, filter, NULL);
cl_mem inputBuffer = clCreateBuffer(*context, CL_MEM_USE_HOST_PTR, imageSize, inputImage, NULL);
cl_mem outputBuffer = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, imageSize, NULL, NULL);
cl_command_queue queue = clCreateCommandQueue(*context, *device, 0, NULL);
```

設置 kernel 參數。

```cpp=
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &filterBuffer);
clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
clSetKernelArg(kernel, 4, sizeof(int), &imageHeight);
clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
```

執行 kernel 及複製記憶體。

```cpp=
size_t globalThreads[2] = {imageWidth, imageHeight};
size_t localws[2] = {8, 8};
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localws, 0, NULL, NULL);
clFinish(queue);
clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, imageSize, outputImage, 0, NULL, NULL);
```

kernel 的部份基本上和 serial 版本一樣，差別在於我們指定一個 thread 計算某一個 pixel。

```cpp=
__kernel void convolution(const __global float *inputBuffer, __global float *outputBuffer, __constant float *filterBuffer,
                          const int imageWidth, const int imageHeight, const int filterWidth)
{
    float sum = 0.0;
    const int gid_x = get_global_id(0), gid_y = get_global_id(1);
    int xx, yy, ky, kx, pos;
    int half_filter_width = filterWidth >> 1;

    for (ky = -half_filter_width ; ky <= half_filter_width ; ++ky) {
        yy = gid_y + ky;
        if (yy >= 0 && yy < imageHeight) {
            for (kx = -half_filter_width ; kx <= half_filter_width ; ++kx ) {
                if (filterBuffer[(ky + half_filter_width) * filterWidth + kx + half_filter_width] == 0) continue;
                xx = gid_x + kx;
                if (xx >= 0 && xx < imageWidth) {
                    pos = xx + yy * imageWidth;
                    sum += inputBuffer[pos] * filterBuffer[(ky + half_filter_width) * filterWidth + kx + half_filter_width];
                }
            }
        }
    }
    outputBuffer[gid_y * imageWidth + gid_x] = sum;
}
```

:::info
<font color=#4381FA>***Q2***</font>: (10 points) Rewrite the program using CUDA.
* (1) Explain your CUDA implementation.
* (2) plot a chart to show the performance difference between using OpenCL and CUDA.
* (3) explain the result.
:::

none