Skip to content

Instantly share code, notes, and snippets.

@michaelfortunato
Last active April 28, 2025 14:29
Show Gist options
  • Save michaelfortunato/a057a9e1231925ec4455b0e901039d1c to your computer and use it in GitHub Desktop.
Save michaelfortunato/a057a9e1231925ec4455b0e901039d1c to your computer and use it in GitHub Desktop.
Matrix Multiplication Using Shared Memory
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <stdio.h>
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
// Plain container used on both host and device; `elements` may point at
// host heap memory (Matrix_init) or device memory (cudaMalloc'd in main),
// depending on context -- the struct itself does not record which.
struct Matrix {
  int width;   // number of columns
  int height;  // number of rows
  float *elements;  // row-major storage, width * height floats
};
// Element-wise addition C = A + B over row-major matrices of row length
// `width`. One thread per element.
// NOTE(review): there is no bounds guard, and the signature carries no
// height -- the launch grid must cover the matrix exactly; confirm at
// every call site.
__global__ void mat_add(float *A, float *B, float *C, int width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = row * width + col;
  C[idx] = A[idx] + B[idx];
}
// Largest supported tile edge. blockDim.x and blockDim.y must both be
// <= MAT_MUL_MAX_TILE and equal to each other.
// (BUG FIX: the original declared fixed 5x5 shared tiles while being
// launched with 10x10 blocks, writing past the shared arrays.)
#define MAT_MUL_MAX_TILE 32

// Tiled matrix multiply: C = A * B, row-major, one thread per element of C.
// Launch with a square 2D block (blockDim.x == blockDim.y <= MAT_MUL_MAX_TILE)
// and a grid covering C (ceil-div); out-of-range threads still participate
// in tile loads (with zero padding) so __syncthreads() stays uniform.
// All three Matrix pointers must be device-resident.
__global__ void mat_mul_tiled(struct Matrix *A, struct Matrix *B,
                              struct Matrix *C) {
  const int TILE = blockDim.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;  // row of C this thread owns
  int col = blockIdx.x * blockDim.x + threadIdx.x;  // col of C this thread owns
  // +1 padding on the inner dimension avoids shared-memory bank conflicts
  // on the column-wise reads of Bs.
  __shared__ float As[MAT_MUL_MAX_TILE][MAT_MUL_MAX_TILE + 1];
  __shared__ float Bs[MAT_MUL_MAX_TILE][MAT_MUL_MAX_TILE + 1];
  float sum = 0.0f;
  // Ceil-div so a ragged final tile is still processed.
  // (BUG FIX: the original truncated, dropping the tail when A->width is
  // not a multiple of the tile size.)
  int numTiles = (A->width + TILE - 1) / TILE;
  for (int m = 0; m < numTiles; ++m) {
    int aCol = m * TILE + threadIdx.x;  // column of A to stage
    int bRow = m * TILE + threadIdx.y;  // row of B to stage
    // Guard per-axis, not on the linear index: a linear index can be
    // in-range yet belong to the wrong row. Out-of-range slots are
    // zero-filled so the inner product below needs no extra guard.
    As[threadIdx.y][threadIdx.x] =
        (row < A->height && aCol < A->width)
            ? A->elements[row * A->width + aCol]
            : 0.0f;
    Bs[threadIdx.y][threadIdx.x] =
        (bRow < B->height && col < B->width)
            ? B->elements[bRow * B->width + col]
            : 0.0f;
    // Tiles fully staged before anyone reads them.
    __syncthreads();
    for (int q = 0; q < TILE; ++q) {
      sum += As[threadIdx.y][q] * Bs[q][threadIdx.x];
    }
    // Everyone done reading before the next iteration overwrites the tiles.
    __syncthreads();
  }
  // Guard the store: the grid may overhang C.
  if (row < C->height && col < C->width) {
    C->elements[row * C->width + col] = sum;
  }
}
// Naive matrix multiply: C = A * B, row-major, one thread per element of C.
// All three Matrix pointers must be device-resident. Launch with any 2D
// grid covering C; overhanging threads exit early.
__global__ void mat_mul(struct Matrix *A, struct Matrix *B, struct Matrix *C) {
  int i = threadIdx.y + blockIdx.y * blockDim.y;  // row of C
  int j = threadIdx.x + blockIdx.x * blockDim.x;  // col of C
  // BUG FIX: guard added -- the original wrote out of bounds whenever the
  // grid did not divide C's dimensions exactly.
  if (i >= C->height || j >= C->width)
    return;
  float acc = 0.0f;
  for (int k = 0; k < A->width; k++) {
    acc += A->elements[k + A->width * i] * B->elements[B->width * k + j];
  }
  C->elements[i * C->width + j] = acc;
}
// Allocate a host-side Matrix of the given dimensions (elements
// uninitialized). Returns NULL if either allocation fails; caller owns
// both the struct and its elements buffer.
// BUG FIX: the original allocated sizeof(float) (4 bytes) for the struct
// itself, so every field write ran past the heap block.
struct Matrix *Matrix_init(int height, int width) {
  struct Matrix *m = (struct Matrix *)malloc(sizeof(struct Matrix));
  if (m == NULL)
    return NULL;
  m->height = height;
  m->width = width;
  m->elements = (float *)malloc(sizeof(float) * width * height);
  if (m->elements == NULL) {
    free(m);
    return NULL;
  }
  return m;
}
// Set every element of m to `value`; returns m so calls can be chained.
struct Matrix *Matrix_fill(struct Matrix *m, float value) {
  int total = m->height * m->width;
  // Storage is contiguous row-major, so a single flat pass writes the
  // same cells in the same order as a row/column double loop.
  for (int idx = 0; idx < total; ++idx) {
    m->elements[idx] = value;
  }
  return m;
}
// Per-element visitor signature: receives the (row, col) coordinates and
// the matrix being traversed.
typedef void (*matrix_callback)(int i, int j, struct Matrix *m);
// Invoke cb(row, col, m) for every element in row-major order; returns m.
struct Matrix *Matrix_iter(struct Matrix *m, matrix_callback cb) {
  int row = 0;
  while (row < m->height) {
    int col = 0;
    while (col < m->width) {
      cb(row, col, m);
      ++col;
    }
    ++row;
  }
  return m;
}
// Dump every element of m to stdout, one "M[i][j] = v" line per element,
// in row-major order.
void Matrix_print(struct Matrix *m) {
  for (int row = 0; row < m->height; ++row)
    for (int col = 0; col < m->width; ++col)
      printf("M[%d][%d] = %f\n", row, col, m->elements[row * m->width + col]);
}
// Legacy visitor signature for flat n x n float arrays.
typedef void (*matrix_callback_old)(int i, int j, int n, float *A);
// Invoke cb(row, col, n, A) over every cell of an n x n array,
// row-major order.
void iterate_and_call(float *A, int n, matrix_callback_old cb) {
  for (int row = 0; row < n; ++row)
    for (int col = 0; col < n; ++col)
      cb(row, col, n, A);
}
// Print element (i, j) of a flat n x n row-major array. Matches the
// matrix_callback_old signature so it can be handed to iterate_and_call.
void print_matrix(int i, int j, int n, float *A) {
  float v = A[i * n + j];
  printf("X[%d][%d] = %f\n", i, j, v);
}
// Callback: set element (i, j) of a flat n x n array to 1.0f.
void init_matrix_1(int i, int j, int n, float *A) {
  A[i * n + j] = 1.0f;
}
// Callback: set element (i, j) of a flat n x n array to 2.0f.
void init_matrix_2(int i, int j, int n, float *A) {
  A[i * n + j] = 2.0f;
}
// dim3 threadsPerBlock(int x_dim, int y_dim) {
// dim3 shape;
// shape.x= x_dim;
// shape.y = x_dim;
// return shape;
// }
// void assert_cudaError(cudaError_t err) {
// if (err != cudaSuccess) {
// fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
// cudaGetErrorString(err));
// exit(EXIT_FAILURE);
// }
// }
// Report a failed CUDA call with the file/line that produced it, then
// exit (unless abort == false). Intended to be invoked through the
// assert_cudaError(...) macro, which supplies __FILE__ / __LINE__.
// BUG FIX: the original ignored `file`, `line`, and `abort`, and always
// printed a hard-coded message about "device vector A" no matter which
// call actually failed.
inline void _assert_cudaError(cudaError_t err, const char *file, int line,
                              bool abort = true) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line,
            cudaGetErrorString(err));
    if (abort)
      exit(EXIT_FAILURE);
  }
}
// Block the host until all queued device work completes, which also
// flushes any buffered device-side printf output to the console.
inline void _cudaDeviceFlushPrintf() { cudaDeviceSynchronize(); }
// Check a cudaError_t, reporting the call site on failure.
// BUG FIX: wrapped in do { } while (0) so the macro expands to exactly one
// statement -- the original bare-brace form breaks under
// `if (x) assert_cudaError(e); else ...` (dangling semicolon / else).
#define assert_cudaError(ans)                                                  \
  do {                                                                         \
    _assert_cudaError((ans), __FILE__, __LINE__);                              \
  } while (0)
// Compute Z = X * Y on the GPU (X: 10x100 of 3s, Y: 100x20 of 5s) and
// print the result. Every element of Z should come out as 1500.0.
int main(int argc, char *argv[]) {
  // Host matrices.
  struct Matrix *X = Matrix_init(10, 100);
  struct Matrix *Y = Matrix_init(100, 20);
  struct Matrix *Z = Matrix_init(10, 20);
  if (X == NULL || Y == NULL || Z == NULL) {
    fprintf(stderr, "host allocation failed\n");
    return EXIT_FAILURE;
  }
  Matrix_fill(X, 3.0f);
  Matrix_fill(Y, 5.0f);

  // Host-side staging copies of the headers whose `elements` fields will
  // point at device buffers.
  struct Matrix X_h = *X, Y_h = *Y, Z_h = *Z;
  assert_cudaError(cudaMalloc(&X_h.elements, sizeof(float) * X->width * X->height));
  assert_cudaError(cudaMalloc(&Y_h.elements, sizeof(float) * Y->width * Y->height));
  assert_cudaError(cudaMalloc(&Z_h.elements, sizeof(float) * Z->width * Z->height));

  // Copy input element data to the device. Z is NOT copied: its host
  // buffer is uninitialized and the kernel overwrites every element.
  assert_cudaError(cudaMemcpy(X_h.elements, X->elements,
                              sizeof(float) * X->width * X->height,
                              cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_h.elements, Y->elements,
                              sizeof(float) * Y->width * Y->height,
                              cudaMemcpyHostToDevice));

  // BUG FIX: the kernels take `struct Matrix *` and dereference it on the
  // device, so the structs themselves must live in device memory. The
  // original passed addresses of host stack structs (&X_d), which the GPU
  // cannot dereference (illegal address).
  struct Matrix *X_d, *Y_d, *Z_d;
  assert_cudaError(cudaMalloc(&X_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc(&Y_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc(&Z_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMemcpy(X_d, &X_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d, &Y_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Z_d, &Z_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));

  // One thread per element of Z; ceil-div so any Z shape is covered.
  dim3 block(10, 10);
  dim3 grid((Z->width + block.x - 1) / block.x,
            (Z->height + block.y - 1) / block.y);
  mat_mul_tiled<<<grid, block>>>(X_d, Y_d, Z_d);
  assert_cudaError(cudaGetLastError());        // launch-config errors
  assert_cudaError(cudaDeviceSynchronize());   // async execution errors

  assert_cudaError(cudaMemcpy(Z->elements, Z_h.elements,
                              sizeof(float) * Z->width * Z->height,
                              cudaMemcpyDeviceToHost));
  Matrix_print(Z);

  // Release device and host resources.
  cudaFree(X_h.elements);
  cudaFree(Y_h.elements);
  cudaFree(Z_h.elements);
  cudaFree(X_d);
  cudaFree(Y_d);
  cudaFree(Z_d);
  free(X->elements); free(X);
  free(Y->elements); free(Y);
  free(Z->elements); free(Z);
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment