Last active
August 29, 2015 14:14
-
-
Save dniku/f168582fe98ba97346e4 to your computer and use it in GitHub Desktop.
Tiled matrix multiplication with CUDA, the algorithm is from https://www.coursera.org/course/hetero
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Edge length, in elements, of the square shared-memory tiles used by
// matrixMultiplyShared.
#define TILE_WIDTH 16
// NOTE: TILE_WIDTH is an integer constant, so the number of tiles is computed
// with the integer ceiling-division idiom (n + TILE_WIDTH - 1) / TILE_WIDTH
// instead of the floating-point form ceil(n / 16.0). For non-negative n the
// two expressions give the same result.
// Compute C = A * B
// Tiled matrix multiplication: C = A * B, with all three matrices stored as
// flat float arrays indexed row-major (element (row, col) of A is
// A[row * numAColumns + col], and likewise for B and C).
//
// Each thread accumulates one element of C. The block walks along the inner
// dimension one TILE_WIDTH-wide step at a time, staging one tile of A and one
// tile of B in shared memory per step.
//
// Parameters:
//   A, B         input matrices; only read by this kernel
//   C            output matrix; element (Cy, Cx) is written when in range
//   numARows, numAColumns   dimensions of A
//   numBRows, numBColumns   dimensions of B
//   numCRows, numCColumns   dimensions of C
//
// The inner dimension is taken from numAColumns. Out-of-range elements of
// either input (including rows of B at or beyond numBRows) are loaded as zero,
// so they do not contribute to the sum.
//
// Launch requirements:
//   * blockDim must be (TILE_WIDTH, TILE_WIDTH): threadIdx.x / threadIdx.y are
//     used directly as indices into the TILE_WIDTH x TILE_WIDTH shared tiles.
//   * The grid must cover C, i.e. at least ceil(numCColumns / TILE_WIDTH)
//     blocks in x and ceil(numCRows / TILE_WIDTH) blocks in y.
//   * Uses 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                     int numAColumns, int numBRows,
                                     int numBColumns, int numCRows,
                                     int numCColumns) {
  __shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
  __shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

  // Position of this thread inside its block (and inside each tile).
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Coordinates of the C element owned by this thread. The same row is used
  // when reading A and the same column when reading B, so both are computed
  // once, outside the tile loop.
  const int Cx = blockIdx.x * blockDim.x + tx;
  const int Cy = blockIdx.y * blockDim.y + ty;

  float sum = 0.0f;

  // Number of tiles needed to cover the inner dimension (ceiling division).
  const int total_tiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;

  for (int tile_idx = 0; tile_idx < total_tiles; tile_idx++) {
    // Tile top-left corners:
    //   A: row = blockIdx.y * blockDim.y, col = tile_idx * TILE_WIDTH
    //   B: row = tile_idx * TILE_WIDTH,   col = blockIdx.x * blockDim.x
    const int Ax = tile_idx * TILE_WIDTH + tx;  // column of A read by this thread
    const int By = tile_idx * TILE_WIDTH + ty;  // row of B read by this thread

    // Load one element of each tile; pad with zero outside the matrices so
    // partial edge tiles need no special handling in the multiply loop.
    // The row offset is widened to 64 bits so row * columns cannot overflow
    // a 32-bit int for large matrices.
    if (Ax < numAColumns && Cy < numARows) {
      A_tile[ty][tx] = A[static_cast<long long>(Cy) * numAColumns + Ax];
    } else {
      A_tile[ty][tx] = 0.0f;
    }

    if (Cx < numBColumns && By < numBRows) {
      B_tile[ty][tx] = B[static_cast<long long>(By) * numBColumns + Cx];
    } else {
      B_tile[ty][tx] = 0.0f;
    }

    // Every thread must finish writing before any thread reads the tiles.
    __syncthreads();

    // Partial dot product over this tile.
    for (int i = 0; i < TILE_WIDTH; i++) {
      sum += A_tile[ty][i] * B_tile[i][tx];
    }

    // Every thread must finish reading before the tiles are overwritten in
    // the next iteration.
    __syncthreads();
  }

  // Threads that fall outside C took part in loading and synchronising but
  // have nothing to store.
  if (Cx < numCColumns && Cy < numCRows) {
    C[static_cast<long long>(Cy) * numCColumns + Cx] = sum;
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment