@michaelfortunato
Created April 26, 2025 22:59
CUDA Matrix Multiplication Beginner
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h> // malloc, exit
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
struct Matrix {
  int width;
  int height;
  float *elements;
};
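// For example, in a 4x3 matrix (height 4, width 3), M(2, 1) lives at
// elements[2 * 3 + 1] = elements[7]: each row advances the index by `width`.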
__global__ void mat_add(float *A, float *B, float *C, int width) {
  int i = threadIdx.y + blockIdx.y * blockDim.y;
  int j = threadIdx.x + blockIdx.x * blockDim.x;
  C[i * width + j] = A[i * width + j] + B[i * width + j];
}
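// mat_add maps one thread to one output element: i indexes the row (y) and
// j the column (x). There is no bounds check, so the grid is expected to
// tile the matrix exactly.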
__global__ void mat_mul(struct Matrix *A, struct Matrix *B, struct Matrix *C) {
  int i = threadIdx.y + blockIdx.y * blockDim.y;
  int j = threadIdx.x + blockIdx.x * blockDim.x;
  float acc = 0;
  for (int k = 0; k < A->width; k++) {
    acc += A->elements[k + A->width * i] * B->elements[B->width * k + j];
  }
  C->elements[i * C->width + j] = acc;
}
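// mat_mul likewise assigns one thread per element of C: thread (i, j) takes
// the dot product of row i of A with column j of B. It assumes
// A->width == B->height and that the grid covers C exactly (no bounds check).
// The output is written with C's own row stride (C->width), which differs
// from A->width whenever the matrices are not square.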
struct Matrix *Matrix_init(int height, int width) {
  struct Matrix *m = (struct Matrix *)malloc(sizeof(struct Matrix));
  m->height = height;
  m->width = width;
  m->elements = (float *)malloc(sizeof(float) * width * height);
  return m;
}
struct Matrix *Matrix_fill(struct Matrix *m, float value) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      m->elements[i * m->width + j] = value;
    }
  }
  return m;
}
typedef void (*matrix_callback)(int i, int j, struct Matrix *m);
struct Matrix *Matrix_iter(struct Matrix *m, matrix_callback cb) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      cb(i, j, m);
    }
  }
  return m;
}
void Matrix_print(struct Matrix *m) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      printf("M[%d][%d] = %f\n", i, j, m->elements[i * m->width + j]);
    }
  }
}
typedef void (*matrix_callback_old)(int i, int j, int n, float *A);
void iterate_and_call(float *A, int n, matrix_callback_old cb) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      cb(i, j, n, A);
    }
  }
}
void print_matrix(int i, int j, int n, float *A) {
printf("X[%d][%d] = %f\n", i, j, A[i * n + j]);
}
void init_matrix_1(int i, int j, int n, float *A) { A[i * n + j] = 1.0f; }
void init_matrix_2(int i, int j, int n, float *A) { A[i * n + j] = 2.0f; }
// dim3 threadsPerBlock(int x_dim, int y_dim) {
// dim3 shape;
// shape.x= x_dim;
// shape.y = x_dim;
// return shape;
// }
// void assert_cudaError(cudaError_t err) {
// if (err != cudaSuccess) {
// fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
// cudaGetErrorString(err));
// exit(EXIT_FAILURE);
// }
// }
inline void _assert_cudaError(cudaError_t err, const char *file, int line,
                              bool abort = true) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line,
            cudaGetErrorString(err));
    if (abort)
      exit(EXIT_FAILURE);
  }
}
inline void _cudaDeviceFlushPrintf() { cudaDeviceSynchronize(); }
#define assert_cudaError(ans)                                                  \
  {                                                                            \
    _assert_cudaError((ans), __FILE__, __LINE__);                              \
  }
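// Usage: wrap a CUDA runtime call so failures report the call site, e.g.
//   assert_cudaError(cudaMalloc((void **)&A_d, bytes));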
int main(int argc, char *argv[]) {
  printf("Hello, World!\n");
  cudaError_t err = cudaSuccess;
  const int n = 1 << 10;
// float *A = (float *)malloc(sizeof(float) * n * n);
// float *B = (float *)malloc(sizeof(float) * n * n);
// float *C = (float *)malloc(sizeof(float) * n * n);
// float *A_d = NULL;
// float *B_d = NULL;
// float *C_d = NULL;
// // Error code to check return values for CUDA calls
// err = cudaMalloc((void **)&A_d, sizeof(float) * n * n);
// assert_cudaError(err);
// err = cudaMalloc((void **)&B_d, sizeof(float) * n * n);
// assert_cudaError(err);
// err = cudaMalloc((void **)&C_d, sizeof(float) * n * n);
// assert_cudaError(err);
// iterate_and_call(A, n, init_matrix_1);
// iterate_and_call(B, n, init_matrix_2);
// cudaMemcpy(A_d, A, sizeof(float) * n * n, cudaMemcpyHostToDevice);
// cudaMemcpy(B_d, B, sizeof(float) * n * n, cudaMemcpyHostToDevice);
// // cudaMemcpy(C_d, C, sizeof(float) * n * n, cudaMemcpyHostToDevice);
//
// dim3 threadsPerBlock(16, 16, 1);
// dim3 numBlocks(n / threadsPerBlock.x, n / threadsPerBlock.y, 1);
// mat_add<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, n);
// assert_cudaError(cudaPeekAtLastError());
// cudaMemcpy(C, C_d, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
// // iterate_and_call(C, n, print_matrix);
// // printf("Goodbye, World!\n");
// _cudaDeviceFlushPrintf();
  struct Matrix *X = Matrix_init(20, 10);
  X = Matrix_fill(X, 3.0f);
  struct Matrix *Y = Matrix_init(10, 5);
  Y = Matrix_fill(Y, 5.0f);
  struct Matrix *Z = Matrix_init(20, 5);
  // What's a good way to block this out?
  // The first obvious way is to block it over Z.
  // Let's say each block is 5 units high and 1 unit wide.
  // y is height and x is width.
  dim3 threadsPerBlock1(1, 5);
  dim3 blocksPerGrid(Z->width / threadsPerBlock1.x,
                     Z->height / threadsPerBlock1.y);
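  // With Z being 20x5, this yields a (5, 4) grid of (1, 5) blocks: 20 blocks
  // of 5 threads, i.e. exactly one thread per element of Z. This only works
  // because Z's dimensions divide evenly by the block dimensions.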
  // The structs on the device must carry *device* element pointers. Allocate
  // the element buffers, copy the input data, patch host-side copies of the
  // structs, and only then copy those structs to the device. (Dereferencing
  // X_d, a device pointer, on the host is not valid.)
  float *X_d_e, *Y_d_e, *Z_d_e;
  cudaMalloc((void **)&X_d_e, sizeof(float) * X->width * X->height);
  cudaMalloc((void **)&Y_d_e, sizeof(float) * Y->width * Y->height);
  cudaMalloc((void **)&Z_d_e, sizeof(float) * Z->width * Z->height);
  cudaMemcpy(X_d_e, X->elements, sizeof(float) * X->width * X->height,
             cudaMemcpyHostToDevice);
  cudaMemcpy(Y_d_e, Y->elements, sizeof(float) * Y->width * Y->height,
             cudaMemcpyHostToDevice);
  // Z is output-only, so its (uninitialized) host contents need not be copied.
  struct Matrix X_h = *X, Y_h = *Y, Z_h = *Z;
  X_h.elements = X_d_e;
  Y_h.elements = Y_d_e;
  Z_h.elements = Z_d_e;
  struct Matrix *X_d, *Y_d, *Z_d;
  cudaMalloc((void **)&X_d, sizeof(struct Matrix));
  cudaMalloc((void **)&Y_d, sizeof(struct Matrix));
  cudaMalloc((void **)&Z_d, sizeof(struct Matrix));
  cudaMemcpy(X_d, &X_h, sizeof(struct Matrix), cudaMemcpyHostToDevice);
  cudaMemcpy(Y_d, &Y_h, sizeof(struct Matrix), cudaMemcpyHostToDevice);
  cudaMemcpy(Z_d, &Z_h, sizeof(struct Matrix), cudaMemcpyHostToDevice);
  mat_mul<<<blocksPerGrid, threadsPerBlock1>>>(X_d, Y_d, Z_d);
  assert_cudaError(cudaPeekAtLastError());
  cudaMemcpy(Z->elements, Z_d_e, sizeof(float) * Z->width * Z->height,
             cudaMemcpyDeviceToHost);
  Matrix_print(Z);
  return 0;
}
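// Build and run (assuming the file is saved as, e.g., matmul.cu):
//   nvcc matmul.cu -o matmul && ./matmul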