Last active
April 5, 2024 08:02
-
-
Save wh5a/4641641 to your computer and use it in GitHub Desktop.
CUDA 2D Convolution.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <wb.h> | |
// Check ec2-174-129-21-232.compute-1.amazonaws.com:8080/mp/6 for more information | |
#define wbCheck(stmt) do { \ | |
cudaError_t err = stmt; \ | |
if (err != cudaSuccess) { \ | |
wbLog(ERROR, "Failed to run stmt ", #stmt); \ | |
return -1; \ | |
} \ | |
} while(0) | |
#define Mask_width 5 | |
#define Mask_radius Mask_width/2 | |
#define TILE_WIDTH 16 | |
#define w (TILE_WIDTH + Mask_width - 1) | |
#define clamp(x) (min(max((x), 0.0), 1.0)) | |
//@@ INSERT CODE HERE | |
__global__ void convolution(float *I, const float* __restrict__ M, float *P, | |
int channels, int width, int height) { | |
__shared__ float N_ds[w][w]; | |
int k; | |
for (k = 0; k < channels; k++) { | |
// First batch loading | |
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x, | |
destY = dest / w, destX = dest % w, | |
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius, | |
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius, | |
src = (srcY * width + srcX) * channels + k; | |
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width) | |
N_ds[destY][destX] = I[src]; | |
else | |
N_ds[destY][destX] = 0; | |
// Second batch loading | |
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH; | |
destY = dest / w, destX = dest % w; | |
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius; | |
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius; | |
src = (srcY * width + srcX) * channels + k; | |
if (destY < w) { | |
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width) | |
N_ds[destY][destX] = I[src]; | |
else | |
N_ds[destY][destX] = 0; | |
} | |
__syncthreads(); | |
float accum = 0; | |
int y, x; | |
for (y = 0; y < Mask_width; y++) | |
for (x = 0; x < Mask_width; x++) | |
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x]; | |
y = blockIdx.y * TILE_WIDTH + threadIdx.y; | |
x = blockIdx.x * TILE_WIDTH + threadIdx.x; | |
if (y < height && x < width) | |
P[(y * width + x) * channels + k] = clamp(accum); | |
__syncthreads(); | |
} | |
} | |
int main(int argc, char* argv[]) { | |
wbArg_t arg; | |
int maskRows; | |
int maskColumns; | |
int imageChannels; | |
int imageWidth; | |
int imageHeight; | |
char * inputImageFile; | |
char * inputMaskFile; | |
wbImage_t inputImage; | |
wbImage_t outputImage; | |
float * hostInputImageData; | |
float * hostOutputImageData; | |
float * hostMaskData; | |
float * deviceInputImageData; | |
float * deviceOutputImageData; | |
float * deviceMaskData; | |
arg = wbArg_read(argc, argv); /* parse the input arguments */ | |
inputImageFile = wbArg_getInputFile(arg, 0); | |
inputMaskFile = wbArg_getInputFile(arg, 1); | |
inputImage = wbImport(inputImageFile); | |
hostMaskData = (float *) wbImport(inputMaskFile, &maskRows, &maskColumns); | |
assert(maskRows == 5); /* mask height is fixed to 5 in this mp */ | |
assert(maskColumns == 5); /* mask width is fixed to 5 in this mp */ | |
imageWidth = wbImage_getWidth(inputImage); | |
imageHeight = wbImage_getHeight(inputImage); | |
imageChannels = wbImage_getChannels(inputImage); | |
outputImage = wbImage_new(imageWidth, imageHeight, imageChannels); | |
hostInputImageData = wbImage_getData(inputImage); | |
hostOutputImageData = wbImage_getData(outputImage); | |
wbTime_start(GPU, "Doing GPU Computation (memory + compute)"); | |
wbTime_start(GPU, "Doing GPU memory allocation"); | |
cudaMalloc((void **) &deviceInputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)); | |
cudaMalloc((void **) &deviceOutputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)); | |
cudaMalloc((void **) &deviceMaskData, maskRows * maskColumns * sizeof(float)); | |
wbTime_stop(GPU, "Doing GPU memory allocation"); | |
wbTime_start(Copy, "Copying data to the GPU"); | |
cudaMemcpy(deviceInputImageData, | |
hostInputImageData, | |
imageWidth * imageHeight * imageChannels * sizeof(float), | |
cudaMemcpyHostToDevice); | |
cudaMemcpy(deviceMaskData, | |
hostMaskData, | |
maskRows * maskColumns * sizeof(float), | |
cudaMemcpyHostToDevice); | |
wbTime_stop(Copy, "Copying data to the GPU"); | |
wbTime_start(Compute, "Doing the computation on the GPU"); | |
//@@ INSERT CODE HERE | |
dim3 dimGrid(ceil((float)imageWidth/TILE_WIDTH), ceil((float)imageHeight/TILE_WIDTH)); | |
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1); | |
convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, | |
imageChannels, imageWidth, imageHeight); | |
wbTime_stop(Compute, "Doing the computation on the GPU"); | |
wbTime_start(Copy, "Copying data from the GPU"); | |
cudaMemcpy(hostOutputImageData, | |
deviceOutputImageData, | |
imageWidth * imageHeight * imageChannels * sizeof(float), | |
cudaMemcpyDeviceToHost); | |
wbTime_stop(Copy, "Copying data from the GPU"); | |
wbTime_stop(GPU, "Doing GPU Computation (memory + compute)"); | |
wbSolution(arg, outputImage); | |
cudaFree(deviceInputImageData); | |
cudaFree(deviceOutputImageData); | |
cudaFree(deviceMaskData); | |
free(hostMaskData); | |
wbImage_delete(outputImage); | |
wbImage_delete(inputImage); | |
return 0; | |
} |
I tested and it's very slowly , 10 seconds :( , with an image 512 x 512
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
I wanted to know if I can use your code to debug my own version of the program.
Regards.