Skip to content

Instantly share code, notes, and snippets.

@dniku
Last active August 29, 2015 14:14
Show Gist options
  • Save dniku/f168582fe98ba97346e4 to your computer and use it in GitHub Desktop.
Save dniku/f168582fe98ba97346e4 to your computer and use it in GitHub Desktop.
Tiled matrix multiplication with CUDA; the algorithm is from https://www.coursera.org/course/hetero
// Edge length, in elements, of the square shared-memory tiles used by the
// kernel in this file (each tile is TILE_WIDTH x TILE_WIDTH floats).
#define TILE_WIDTH 16
// NOTE: The number of tiles needed to cover a dimension of size n is computed
// as (n + TILE_WIDTH - 1) / TILE_WIDTH. For non-negative n this integer
// expression equals ceil(n / (double)TILE_WIDTH), but it stays in integer
// arithmetic and does not hard-code the tile width as a literal such as 16.0.
// Compute C = A * B with shared-memory tiling.
//
// A, B and C are dense row-major matrices: element (row, col) of a matrix
// with `cols` columns lives at index row * cols + col.
//
// Each thread accumulates one element of C. The inner dimension is walked in
// steps of TILE_WIDTH; at every step the block cooperatively stages one tile
// of A and one tile of B in shared memory, then every thread adds the partial
// dot product of its A-tile row and B-tile column to its running sum.
// Tile cells that fall outside A or B are filled with zero, so matrix sizes
// do not have to be multiples of TILE_WIDTH.
//
// Launch requirements (not checked by the kernel):
//   - blockDim.x and blockDim.y must both equal TILE_WIDTH, because the
//     shared tiles are indexed directly by threadIdx and the inner loop
//     always consumes TILE_WIDTH entries.
//   - The grid must cover C: at least ceil(numCColumns / TILE_WIDTH) blocks
//     in x and ceil(numCRows / TILE_WIDTH) blocks in y.
//   - numAColumns is expected to equal numBRows (the shared inner dimension).
//
// Parameters:
//   A, B                     input matrices (only read)
//   C                        output matrix (only written)
//   numARows, numAColumns    dimensions of A
//   numBRows, numBColumns    dimensions of B
//   numCRows, numCColumns    dimensions of C
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                     int numAColumns, int numBRows,
                                     int numBColumns, int numCRows,
                                     int numCColumns) {
  __shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
  __shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

  // Position of this thread inside its block (and inside each tile).
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Coordinates of the C element owned by this thread. The same row is used
  // to read A and the same column is used to read B.
  const int Cx = blockIdx.x * blockDim.x + tx;
  const int Cy = blockIdx.y * blockDim.y + ty;

  // These range checks do not depend on the tile index, so evaluate them once.
  const bool rowInA = Cy < numARows;
  const bool colInB = Cx < numBColumns;

  float sum = 0.0f;

  const int total_tiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;
  for (int tile_idx = 0; tile_idx < total_tiles; tile_idx++) {
    // Column of A / row of B that this thread stages for the current tile.
    const int Ax = tile_idx * TILE_WIDTH + tx;
    const int By = tile_idx * TILE_WIDTH + ty;

    // Flat indices are widened to size_t so that row * cols cannot overflow
    // 32-bit int for very large matrices. Out-of-range cells become zero and
    // therefore contribute nothing to the dot product.
    A_tile[ty][tx] = (rowInA && Ax < numAColumns)
                         ? A[(size_t)Cy * numAColumns + Ax]
                         : 0.0f;
    B_tile[ty][tx] = (colInB && By < numBRows)
                         ? B[(size_t)By * numBColumns + Cx]
                         : 0.0f;

    // Both tiles must be completely written before any thread reads them.
    __syncthreads();

    for (int i = 0; i < TILE_WIDTH; i++) {
      sum += A_tile[ty][i] * B_tile[i][tx];
    }

    // Keep the next iteration from overwriting the tiles while other threads
    // in the block are still reading them.
    __syncthreads();
  }

  // Threads that map outside C took part in loading tiles but store nothing.
  if (Cx < numCColumns && Cy < numCRows) {
    C[(size_t)Cy * numCColumns + Cx] = sum;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment