# HW5

## CUDA1

```c=1
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

/*
 * Escape-time iteration for a single point c = (c_re, c_im).
 *
 * Starts from z = c and repeatedly applies z = z^2 + c. Returns the number of
 * iterations completed before |z|^2 exceeded 4, or `count` if it never did.
 */
__device__ int mandel(float c_re, float c_im, int count)
{
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re * z_re - z_im * z_im;
        float new_im = 2.f * z_re * z_im;
        z_re = c_re + new_re;
        z_im = c_im + new_im;
    }
    return i;
}

/*
 * One thread per pixel, launched on a 2D grid of 2D blocks.
 *
 * Pixel (thisX, thisY) maps to the point
 * (lowerX + thisX * stepX, lowerY + thisY * stepY); the iteration count is
 * stored at img_dev[resX * thisY + thisX]. Threads that fall outside the
 * resX x resY image return without writing.
 */
__global__ void mandelKernel(int* img_dev, float lowerX, float stepX, int resX,
                             float lowerY, float stepY, int resY,
                             int maxIterations)
{
    int thisX = blockIdx.x * blockDim.x + threadIdx.x;
    int thisY = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge blocks overhang the image.
    if (thisX >= resX || thisY >= resY)
        return;

    float x = lowerX + thisX * stepX;
    float y = lowerY + thisY * stepY;

    int index = resX * thisY + thisX;
    img_dev[index] = mandel(x, y, maxIterations);
}

/* Reports a failed CUDA runtime call on stderr; returns 1 on success, 0 on failure. */
static int cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "hostFE: %s failed: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Host front-end function that allocates the memory and launches the GPU kernel.
 *
 * Computes a resX x resY image over [lowerX, upperX) x [lowerY, upperY) and
 * writes the per-pixel iteration counts into `img` (resX * resY ints).
 * On any allocation or CUDA failure a message is printed to stderr and `img`
 * is left untouched. Nothing is done when `img` is NULL or a resolution is
 * not positive.
 */
void hostFE(float upperX, float upperY, float lowerX, float lowerY, int* img,
            int resX, int resY, int maxIterations)
{
    if (!img || resX <= 0 || resY <= 0)
        return;

    float stepX = (upperX - lowerX) / resX;
    float stepY = (upperY - lowerY) / resY;

    // Byte count of the image; computed in size_t so large images do not
    // overflow int arithmetic.
    size_t size = (size_t)resX * (size_t)resY * sizeof(int);

    // Host staging buffer. `size` is already in bytes, so it must not be
    // multiplied by sizeof(int) again.
    int* img_host = (int*)malloc(size);
    if (!img_host) {
        fprintf(stderr, "hostFE: malloc of %zu bytes failed\n", size);
        return;
    }

    int* img_dev = NULL;
    if (cudaOk(cudaMalloc((void**)&img_dev, size), "cudaMalloc")) {
        // kernel execution
        const int block_x = 32;
        const int block_y = 32;
        // Integer ceil-div: enough blocks to cover every pixel.
        const int grid_x = (resX + block_x - 1) / block_x;
        const int grid_y = (resY + block_y - 1) / block_y;
        dim3 dimBlock(block_x, block_y);
        dim3 dimGrid(grid_x, grid_y);

        mandelKernel<<<dimGrid, dimBlock>>>(img_dev, lowerX, stepX, resX,
                                            lowerY, stepY, resY, maxIterations);

        // A launch does not return an error itself; configuration errors show
        // up in cudaGetLastError(), execution errors in the blocking copy.
        if (cudaOk(cudaGetLastError(), "mandelKernel launch") &&
            cudaOk(cudaMemcpy(img_host, img_dev, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)")) {
            // copy img_dev result from the staging buffer to the caller
            memcpy(img, img_host, size);
        }

        cudaFree(img_dev);
    }

    free(img_host);
}
```

## CUDA2

```c=1
    // allocate memory to device memory
    int size = resX * resY * sizeof(int);
    int width = resX * sizeof(int);

    // Pinned, mapped host buffer. cudaHostAlloc performs the allocation
    // itself, so no prior malloc is needed (a malloc'd pointer stored in
    // img_host would be overwritten here and leaked).
    int* img_host = NULL;
    if (cudaHostAlloc(&img_host, size, cudaHostAllocMapped) != cudaSuccess) {
        return;
    }

    int* img_dev;
    size_t pitch;
    cudaMallocPitch(&img_dev, &pitch, width, resY);

    // kernel execution
    int block_x = 32;
    int block_y = 32;
    int grid_x = (int) ceil(resX/(float) block_x);
    int grid_y = (int) ceil(resY/(float) block_y);
    dim3 dimBlock(block_x, block_y);
    dim3 dimGrid(grid_x, grid_y);

    // NOTE(review): `pitch` is not passed to the kernel in this call. If the
    // kernel indexes rows with a stride of resX ints (as the CUDA1 kernel
    // does), the padded row layout from cudaMallocPitch is not used; any
    // later copy must then treat img_dev as contiguous, not pitched.
    // TODO confirm against the kernel and copy code for this variant.
    mandelKernel<<<dimGrid, dimBlock>>>(img_dev, lowerX, stepX, resX, lowerY, stepY, resY, maxIterations);

    // Synchronize so that the host does not access the pinned (non-pageable)
    // memory while the device may still be using it.
    cudaDeviceSynchronize();
```