CUDA Matrix Multiplication Beginner
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h> // malloc, free, exit
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
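// The same layout expressed as a helper macro. This is an illustrative sketch
// only; the code below indexes manually with the expression above.
#define ELEM(M, row, col) ((M)->elements[(row) * (M)->width + (col)])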
struct Matrix {
  int width;
  int height;
  float *elements;
};
// Element-wise addition of two width x width row-major matrices: C = A + B.
__global__ void mat_add(float *A, float *B, float *C, int width) {
  int i = threadIdx.y + blockIdx.y * blockDim.y; // row
  int j = threadIdx.x + blockIdx.x * blockDim.x; // column
  C[i * width + j] = A[i * width + j] + B[i * width + j];
}
// Naive matrix product C = A * B. C is assumed to be A->height x B->width,
// and the launch is assumed to cover C exactly (there is no bounds check).
__global__ void mat_mul(struct Matrix *A, struct Matrix *B, struct Matrix *C) {
  int i = threadIdx.y + blockIdx.y * blockDim.y; // row of C
  int j = threadIdx.x + blockIdx.x * blockDim.x; // column of C
  float acc = 0;
  for (int k = 0; k < A->width; k++) {
    acc += A->elements[i * A->width + k] * B->elements[k * B->width + j];
  }
  C->elements[i * C->width + j] = acc;
}
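// A plain CPU reference for spot-checking the kernel's output on small
// matrices. This is a sketch only; mat_mul_host is a hypothetical helper that
// nothing below calls, and it assumes C is A->height x B->width.
void mat_mul_host(const struct Matrix *A, const struct Matrix *B,
                  struct Matrix *C) {
  for (int i = 0; i < C->height; i++) {
    for (int j = 0; j < C->width; j++) {
      float acc = 0;
      for (int k = 0; k < A->width; k++) {
        acc += A->elements[i * A->width + k] * B->elements[k * B->width + j];
      }
      C->elements[i * C->width + j] = acc;
    }
  }
}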
// Allocate a height x width matrix (the struct plus its element buffer) on
// the host.
struct Matrix *Matrix_init(int height, int width) {
  struct Matrix *m = (struct Matrix *)malloc(sizeof(struct Matrix));
  m->height = height;
  m->width = width;
  m->elements = (float *)malloc(sizeof(float) * width * height);
  return m;
}
struct Matrix *Matrix_fill(struct Matrix *m, float value) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      m->elements[i * m->width + j] = value;
    }
  }
  return m;
}
typedef void (*matrix_callback)(int i, int j, struct Matrix *m);
struct Matrix *Matrix_iter(struct Matrix *m, matrix_callback cb) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      cb(i, j, m);
    }
  }
  return m;
}
void Matrix_print(struct Matrix *m) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      printf("M[%d][%d] = %f\n", i, j, m->elements[i * m->width + j]);
    }
  }
}
typedef void (*matrix_callback_old)(int i, int j, int n, float *A);
void iterate_and_call(float *A, int n, matrix_callback_old cb) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      cb(i, j, n, A);
    }
  }
}
void print_matrix(int i, int j, int n, float *A) {
  printf("X[%d][%d] = %f\n", i, j, A[i * n + j]);
}
void init_matrix_1(int i, int j, int n, float *A) { A[i * n + j] = 1.0f; }
void init_matrix_2(int i, int j, int n, float *A) { A[i * n + j] = 2.0f; }
// dim3 threadsPerBlock(int x_dim, int y_dim) {
//   dim3 shape;
//   shape.x = x_dim;
//   shape.y = y_dim;
//   return shape;
// }
// void assert_cudaError(cudaError_t err) {
//   if (err != cudaSuccess) {
//     fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
//             cudaGetErrorString(err));
//     exit(EXIT_FAILURE);
//   }
// }
inline void _assert_cudaError(cudaError_t err, const char *file, int line,
                              bool abort = true) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line,
            cudaGetErrorString(err));
    if (abort)
      exit(EXIT_FAILURE);
  }
}
inline void _cudaDeviceFlushPrintf() { cudaDeviceSynchronize(); }
#define assert_cudaError(ans)                                                  \
  {                                                                            \
    _assert_cudaError((ans), __FILE__, __LINE__);                              \
  }
int main(int argc, char *argv[]) {
  printf("Hello, World!\n");
  // `err` and `n` are only used by the commented-out mat_add experiment below.
  cudaError_t err = cudaSuccess;
  const int n = 1 << 10;
  // float *A = (float *)malloc(sizeof(float) * n * n);
  // float *B = (float *)malloc(sizeof(float) * n * n);
  // float *C = (float *)malloc(sizeof(float) * n * n);
  // float *A_d = NULL;
  // float *B_d = NULL;
  // float *C_d = NULL;
  // // Error code to check return values for CUDA calls
  // err = cudaMalloc((void **)&A_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // err = cudaMalloc((void **)&B_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // err = cudaMalloc((void **)&C_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // iterate_and_call(A, n, init_matrix_1);
  // iterate_and_call(B, n, init_matrix_2);
  // cudaMemcpy(A_d, A, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  // cudaMemcpy(B_d, B, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  // // cudaMemcpy(C_d, C, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  //
  // dim3 threadsPerBlock(16, 16, 1);
  // dim3 numBlocks(n / threadsPerBlock.x, n / threadsPerBlock.y, 1);
  // mat_add<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, n);
  // assert_cudaError(cudaPeekAtLastError());
  // cudaMemcpy(C, C_d, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
  // // iterate_and_call(C, n, print_matrix);
  // // printf("Goodbye, World!\n");
  // _cudaDeviceFlushPrintf();
  struct Matrix *X = Matrix_init(20, 10);
  X = Matrix_fill(X, 3.0f);
  struct Matrix *Y = Matrix_init(10, 5);
  Y = Matrix_fill(Y, 5.0f);
  struct Matrix *Z = Matrix_init(20, 5);
  // What's a good way to block this out?
  // The first obvious way is to block it over Z.
  // Let's say each block is 5 threads high and 1 thread wide
  // (y is the height dimension, x is the width dimension).
  dim3 threadsPerBlock1(1, 5);
  dim3 blocksPerGrid(Z->width / threadsPerBlock1.x,
                     Z->height / threadsPerBlock1.y);
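  // If Z's shape did not divide the block shape evenly, a ceil-division launch
  // would be needed instead (a sketch, not required for the 20x5 case here):
  //   dim3 blocksPerGridCeil((Z->width + threadsPerBlock1.x - 1) / threadsPerBlock1.x,
  //                          (Z->height + threadsPerBlock1.y - 1) / threadsPerBlock1.y);
  // together with an `if (i >= C->height || j >= C->width) return;` guard in mat_mul.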
  // X_d, Y_d, and Z_d live in device memory, so their members cannot be
  // assigned through on the host. Allocate the element buffers separately,
  // fill host-side shadow structs whose `elements` point at those device
  // buffers, and copy the shadow structs over.
  struct Matrix *X_d, *Y_d, *Z_d;
  assert_cudaError(cudaMalloc((void **)&X_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc((void **)&Y_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc((void **)&Z_d, sizeof(struct Matrix)));
  float *X_d_e, *Y_d_e, *Z_d_e;
  assert_cudaError(cudaMalloc((void **)&X_d_e, sizeof(float) * X->width * X->height));
  assert_cudaError(cudaMalloc((void **)&Y_d_e, sizeof(float) * Y->width * Y->height));
  assert_cudaError(cudaMalloc((void **)&Z_d_e, sizeof(float) * Z->width * Z->height));
  assert_cudaError(cudaMemcpy(X_d_e, X->elements,
                              sizeof(float) * X->width * X->height,
                              cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d_e, Y->elements,
                              sizeof(float) * Y->width * Y->height,
                              cudaMemcpyHostToDevice));
  // Z is overwritten by the kernel, so its host contents need not be copied.
  struct Matrix X_h = {X->width, X->height, X_d_e};
  struct Matrix Y_h = {Y->width, Y->height, Y_d_e};
  struct Matrix Z_h = {Z->width, Z->height, Z_d_e};
  assert_cudaError(cudaMemcpy(X_d, &X_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d, &Y_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Z_d, &Z_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  mat_mul<<<blocksPerGrid, threadsPerBlock1>>>(X_d, Y_d, Z_d);
  assert_cudaError(cudaPeekAtLastError());
  // Z_d is a device pointer, so read the result back via Z_d_e and host sizes.
  assert_cudaError(cudaMemcpy(Z->elements, Z_d_e,
                              sizeof(float) * Z->width * Z->height,
                              cudaMemcpyDeviceToHost));
  Matrix_print(Z);
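  // Optional cleanup (everything below is reclaimed at process exit anyway):
  // release the device buffers and the host-side matrices.
  cudaFree(X_d_e);
  cudaFree(Y_d_e);
  cudaFree(Z_d_e);
  cudaFree(X_d);
  cudaFree(Y_d);
  cudaFree(Z_d);
  free(X->elements);
  free(Y->elements);
  free(Z->elements);
  free(X);
  free(Y);
  free(Z);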
  return 0;
}