CUDA Matrix Multiplication with Shared Memory
// MP 2: Due Sunday, Dec 16, 2012 at 11:59 p.m. PST
#include <wb.h>

#define wbCheck(stmt) do {                                 \
        cudaError_t err = stmt;                            \
        if (err != cudaSuccess) {                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);    \
            return -1;                                     \
        }                                                  \
    } while(0)

#define TILE_WIDTH 16
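// Each thread block computes one TILE_WIDTH x TILE_WIDTH tile of C,
// using one thread per output element (16 x 16 = 256 threads per block).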
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    // Shared-memory tiles of A (ds_M) and B (ds_N), reloaded once per
    // iteration of the tile loop below.
    __shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x, by = blockIdx.y,
        tx = threadIdx.x, ty = threadIdx.y,
        Row = by * TILE_WIDTH + ty,
        Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0;
    // March across the shared dimension one tile at a time.
    // (numAColumns-1)/TILE_WIDTH+1 is the ceiling of numAColumns/TILE_WIDTH,
    // so partial tiles at the matrix edge are included.
    for (int m = 0; m < (numAColumns-1)/TILE_WIDTH+1; ++m) {
        // Each thread loads one element of A and one of B into shared
        // memory, substituting 0 for out-of-bounds elements so they add
        // nothing to the dot product.
        if (Row < numARows && m*TILE_WIDTH+tx < numAColumns)
            ds_M[ty][tx] = A[Row*numAColumns + m*TILE_WIDTH+tx];
        else
            ds_M[ty][tx] = 0;
        if (Col < numBColumns && m*TILE_WIDTH+ty < numBRows)
            ds_N[ty][tx] = B[(m*TILE_WIDTH+ty)*numBColumns+Col];
        else
            ds_N[ty][tx] = 0;
        // Wait until the whole tile is loaded before reading it.
        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += ds_M[ty][k] * ds_N[k][tx];
        // Wait until every thread is done with this tile before the
        // next iteration overwrites it.
        __syncthreads();
    }
    if (Row < numCRows && Col < numCColumns)
        C[Row*numCColumns+Col] = Pvalue;
}
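// Worked example of the loop bound above: for numAColumns = 100 the tile
// loop runs (100-1)/16+1 = 7 times; the last iteration covers columns
// 96..99 of A, and the zero-filled remainder of the tile contributes
// nothing to Pvalue.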
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    // C inherits its row count from A and its column count from B.
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float *) malloc(sizeof(float) * numCRows * numCColumns);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc(&deviceA, sizeof(float) * numARows * numAColumns));
    wbCheck(cudaMalloc(&deviceB, sizeof(float) * numBRows * numBColumns));
    wbCheck(cudaMalloc(&deviceC, sizeof(float) * numCRows * numCColumns));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, sizeof(float) * numARows * numAColumns, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, sizeof(float) * numBRows * numBColumns, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    // One thread per element of C; round the grid dimensions up so that
    // matrices whose sizes are not multiples of TILE_WIDTH are covered.
    dim3 dimGrid((numCColumns-1)/TILE_WIDTH+1, (numCRows-1)/TILE_WIDTH+1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                          numARows, numAColumns,
                                          numBRows, numBColumns,
                                          numCRows, numCColumns);
    cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, sizeof(float) * numCRows * numCColumns, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}
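For quick testing outside the course's libwb harness, a minimal standalone driver is sketched below. It is an addition, not part of the original gist: it replaces the libwb-based main above (compile the kernel and this driver together, without wb.h), fills a 3 x 5 matrix A and a 5 x 4 matrix B with ones, and checks that an entry of C = A * B equals 5.

// Minimal standalone test driver (a sketch; replaces the libwb main
// above when compiling without wb.h). Both inputs are all ones, so
// every entry of C should equal the shared dimension, K = 5.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const int M = 3, K = 5, N = 4; // A is M x K, B is K x N, C is M x N
    float hostA[M * K], hostB[K * N], hostC[M * N];
    for (int i = 0; i < M * K; ++i) hostA[i] = 1.0f;
    for (int i = 0; i < K * N; ++i) hostB[i] = 1.0f;

    float *deviceA, *deviceB, *deviceC;
    cudaMalloc(&deviceA, sizeof(hostA));
    cudaMalloc(&deviceB, sizeof(hostB));
    cudaMalloc(&deviceC, sizeof(hostC));
    cudaMemcpy(deviceA, hostA, sizeof(hostA), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, sizeof(hostB), cudaMemcpyHostToDevice);

    // Same grid/block setup as the libwb main: one 16 x 16 block
    // suffices here since M, N < TILE_WIDTH.
    dim3 dimGrid((N - 1) / TILE_WIDTH + 1, (M - 1) / TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                          M, K, K, N, M, N);
    cudaDeviceSynchronize();

    cudaMemcpy(hostC, deviceC, sizeof(hostC), cudaMemcpyDeviceToHost);
    printf("C[0][0] = %f (expected 5.0)\n", hostC[0]);

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
    return 0;
}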