# HW5
## CUDA1
```c=1
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Escape-time iteration count for the point c = (c_re, c_im).
 *
 * Starts from z = c and repeatedly applies z <- z*z + c. Returns how many
 * updates were applied before |z|^2 exceeded 4, or `count` if that never
 * happened within `count` iterations.
 */
__device__ int mandel(float c_re, float c_im, int count) {
    float z_re = c_re;
    float z_im = c_im;
    int iter = 0;

    // Keep iterating while the budget lasts and z is still inside the
    // radius-2 disc (squared magnitude not greater than 4).
    while (iter < count && !(z_re * z_re + z_im * z_im > 4.f)) {
        float new_re = z_re * z_re - z_im * z_im;
        float new_im = 2.f * z_re * z_im;
        z_re = c_re + new_re;
        z_im = c_im + new_im;
        ++iter;
    }

    return iter;
}
/*
 * Computes the Mandelbrot iteration count of one pixel per thread.
 *
 * Expects a 2D launch: the x/y thread coordinates select the pixel column
 * and row. Threads that land outside the resX x resY image do nothing.
 * Each result is stored in img_dev at the row-major offset row * resX + col.
 */
__global__ void mandelKernel(int* img_dev, float lowerX, float stepX, int resX, float lowerY, float stepY, int resY, int maxIterations) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge threads may be out of range.
    if (col < resX && row < resY) {
        const float re = lowerX + col * stepX;
        const float im = lowerY + row * stepY;
        img_dev[resX * row + col] = mandel(re, im, maxIterations);
    }
}
/*
 * Host front-end: renders the Mandelbrot set on the GPU into `img`.
 *
 *   upperX, upperY   upper bounds of the rendered region
 *   lowerX, lowerY   lower bounds of the rendered region
 *   img              host buffer that receives resX * resY ints, row-major
 *   resX, resY       image width and height in pixels
 *   maxIterations    iteration cap forwarded to the kernel
 *
 * Allocates a device image, launches mandelKernel with one thread per pixel
 * and copies the result directly into `img`. If `img` is NULL or a dimension
 * is not positive the function returns without doing anything. On a CUDA
 * failure a message is written to stderr, the device buffer is released and
 * the function returns; the contents of `img` are then unspecified.
 */
void hostFE (float upperX, float upperY, float lowerX, float lowerY, int* img, int resX, int resY, int maxIterations)
{
    if (!img || resX <= 0 || resY <= 0) {
        return;
    }

    const float stepX = (upperX - lowerX) / resX;
    const float stepY = (upperY - lowerY) / resY;

    // Image size in bytes; computed in size_t so large images cannot overflow int.
    const size_t size = (size_t)resX * (size_t)resY * sizeof(int);

    // Device-side image buffer.
    int* img_dev = NULL;
    cudaError_t err = cudaMalloc(&img_dev, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "hostFE: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // One thread per pixel; the grid is rounded up with integer ceil-division
    // so partially filled edge blocks are still launched.
    const int block_x = 32;
    const int block_y = 32;
    const int grid_x = (resX + block_x - 1) / block_x;
    const int grid_y = (resY + block_y - 1) / block_y;
    dim3 dimBlock(block_x, block_y);
    dim3 dimGrid(grid_x, grid_y);
    mandelKernel<<<dimGrid, dimBlock>>>(img_dev, lowerX, stepX, resX, lowerY, stepY, resY, maxIterations);

    // A kernel launch returns nothing itself; launch errors are fetched here.
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Blocking copy straight into the caller's buffer. It runs after the
        // kernel and also reports errors raised while the kernel executed.
        err = cudaMemcpy(img, img_dev, size, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "hostFE: kernel launch or copy failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree(img_dev);
}
```
## CUDA2
```c=1
// Image size in bytes and the byte width of one image row. Computed in
// size_t so large images cannot overflow int.
size_t size = (size_t)resX * (size_t)resY * sizeof(int);
size_t width = (size_t)resX * sizeof(int);

// Page-locked (pinned) host buffer. cudaHostAlloc performs the allocation
// itself, so no malloc precedes it: a malloc'd pointer stored in img_host
// would be overwritten here and leaked.
int* img_host = NULL;
if (cudaHostAlloc(&img_host, size, cudaHostAllocMapped) != cudaSuccess) {
    return;
}

// Pitched device buffer: consecutive rows start `pitch` bytes apart, and
// `pitch` may be larger than `width`.
int* img_dev = NULL;
size_t pitch;
if (cudaMallocPitch(&img_dev, &pitch, width, resY) != cudaSuccess) {
    cudaFreeHost(img_host);
    return;
}

// kernel execution: one thread per pixel, grid rounded up to whole blocks
int block_x = 32;
int block_y = 32;
int grid_x = (resX + block_x - 1) / block_x;
int grid_y = (resY + block_y - 1) / block_y;
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(grid_x, grid_y);
// NOTE(review): `pitch` is not passed to the launch. If the kernel addresses
// rows as row * resX, that only matches this allocation when
// pitch == resX * sizeof(int) -- confirm against the kernel and the copy-back.
mandelKernel<<<dimGrid, dimBlock>>>(img_dev, lowerX, stepX, resX, lowerY, stepY, resY, maxIterations);

// Wait for the kernel to finish so that the host and the device never access
// the page-locked (non-pageable) memory at the same time. The two calls also
// report launch and execution errors respectively.
if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
    cudaFree(img_dev);
    cudaFreeHost(img_host);
    return;
}
```