Matrix Multiplication Using Shared Memory
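Two CUDA kernels for computing C = A x B on row-major matrices: a naive version (mat_mul), where each thread walks a full row of A and a full column of B in global memory, and a tiled version (mat_mul_tiled), where each thread block stages TILE_SIZE x TILE_SIZE sub-tiles of A and B in __shared__ memory and accumulates C_sub += A_tile x B_tile across the inner dimension, so each global element is loaded once per tile instead of once per output element.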
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

// Tile edge length; must match the blockDim used to launch mat_mul_tiled,
// since the shared-memory tiles are sized statically.
#define TILE_SIZE 10

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
struct Matrix {
  int width;
  int height;
  float *elements;
};
// Element-wise addition: one thread per element. Assumes the grid exactly
// covers a square width x width matrix. (Unused in main below.)
__global__ void mat_add(float *A, float *B, float *C, int width) {
  int i = threadIdx.y + blockIdx.y * blockDim.y;
  int j = threadIdx.x + blockIdx.x * blockDim.x;
  C[i * width + j] = A[i * width + j] + B[i * width + j];
}
// Tiled matrix multiply: each block computes one TILE_SIZE x TILE_SIZE
// sub-matrix of C by marching tiles of A and B through shared memory.
// The Matrix structs are passed by value so the device can read their
// width/height fields directly; only `elements` points to device memory.
__global__ void mat_mul_tiled(struct Matrix A, struct Matrix B,
                              struct Matrix C) {
  int row = threadIdx.y + blockIdx.y * blockDim.y;
  int col = threadIdx.x + blockIdx.x * blockDim.x;
  __shared__ float As[TILE_SIZE][TILE_SIZE];
  __shared__ float Bs[TILE_SIZE][TILE_SIZE];
  float sum = 0.0f;
  // Round up so a partial tile at the edge is still processed.
  int numTiles = (A.width + TILE_SIZE - 1) / TILE_SIZE;
  for (int m = 0; m < numTiles; ++m) {
    // Load one element of the current A and B tiles, padding with zeros
    // where the tile hangs off the edge of the matrix.
    int aCol = m * TILE_SIZE + threadIdx.x;
    int bRow = m * TILE_SIZE + threadIdx.y;
    As[threadIdx.y][threadIdx.x] =
        (row < A.height && aCol < A.width) ? A.elements[row * A.width + aCol]
                                           : 0.0f;
    Bs[threadIdx.y][threadIdx.x] =
        (bRow < B.height && col < B.width) ? B.elements[bRow * B.width + col]
                                           : 0.0f;
    // Make sure the whole tile is loaded before any thread reads it.
    __syncthreads();
    for (int q = 0; q < TILE_SIZE; ++q) {
      sum += As[threadIdx.y][q] * Bs[q][threadIdx.x];
    }
    // Synchronize to make sure the preceding computation is done before
    // the next iteration overwrites the shared tiles.
    __syncthreads();
  }
  if (row < C.height && col < C.width) {
    C.elements[row * C.width + col] = sum;
  }
}
// Naive matrix multiply: one thread per output element, reading A and B
// straight from global memory.
__global__ void mat_mul(struct Matrix A, struct Matrix B, struct Matrix C) {
  int i = threadIdx.y + blockIdx.y * blockDim.y;
  int j = threadIdx.x + blockIdx.x * blockDim.x;
  if (i >= C.height || j >= C.width)
    return;
  float acc = 0.0f;
  for (int k = 0; k < A.width; k++) {
    acc += A.elements[i * A.width + k] * B.elements[k * B.width + j];
  }
  C.elements[i * C.width + j] = acc;
}
struct Matrix *Matrix_init(int height, int width) {
  // Allocate the struct itself (not just a single float) plus its buffer.
  struct Matrix *m = (struct Matrix *)malloc(sizeof(struct Matrix));
  m->height = height;
  m->width = width;
  m->elements = (float *)malloc(sizeof(float) * width * height);
  return m;
}
struct Matrix *Matrix_fill(struct Matrix *m, float value) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      m->elements[i * m->width + j] = value;
    }
  }
  return m;
}
typedef void (*matrix_callback)(int i, int j, struct Matrix *m);
struct Matrix *Matrix_iter(struct Matrix *m, matrix_callback cb) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      cb(i, j, m);
    }
  }
  return m;
}
void Matrix_print(struct Matrix *m) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      printf("M[%d][%d] = %f\n", i, j, m->elements[i * m->width + j]);
    }
  }
}
// Older float*-based helpers (unused below).
typedef void (*matrix_callback_old)(int i, int j, int n, float *A);
void iterate_and_call(float *A, int n, matrix_callback_old cb) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      cb(i, j, n, A);
    }
  }
}
void print_matrix(int i, int j, int n, float *A) {
  printf("X[%d][%d] = %f\n", i, j, A[i * n + j]);
}
void init_matrix_1(int i, int j, int n, float *A) { A[i * n + j] = 1.0f; }
void init_matrix_2(int i, int j, int n, float *A) { A[i * n + j] = 2.0f; }
// Report the failing file/line along with the CUDA error string.
inline void _assert_cudaError(cudaError_t err, const char *file, int line,
                              bool abort = true) {
  if (err != cudaSuccess) {
    fprintf(stderr, "%s:%d: CUDA error: %s\n", file, line,
            cudaGetErrorString(err));
    if (abort)
      exit(EXIT_FAILURE);
  }
}
// Flush any device-side printf output.
inline void _cudaDeviceFlushPrintf() { cudaDeviceSynchronize(); }
#define assert_cudaError(ans)                                                  \
  {                                                                            \
    _assert_cudaError((ans), __FILE__, __LINE__);                              \
  }
int main(int argc, char *argv[]) {
  cudaError_t err = cudaSuccess;
  // X is 10x100 and Y is 100x20, so Z = X * Y is 10x20.
  struct Matrix *X = Matrix_init(10, 100);
  X = Matrix_fill(X, 3.0f);
  struct Matrix *Y = Matrix_init(100, 20);
  Y = Matrix_fill(Y, 5.0f);
  struct Matrix *Z = Matrix_init(10, 20);
  // Device-side descriptors: same shapes, but `elements` points to device
  // memory. The structs themselves are passed to the kernel by value, so
  // no device allocation is needed for them.
  struct Matrix X_d, Y_d, Z_d;
  X_d.width = X->width;
  X_d.height = X->height;
  Y_d.width = Y->width;
  Y_d.height = Y->height;
  Z_d.width = Z->width;
  Z_d.height = Z->height;
  err = cudaMalloc(&X_d.elements, sizeof(float) * X->width * X->height);
  assert_cudaError(err);
  err = cudaMalloc(&Y_d.elements, sizeof(float) * Y->width * Y->height);
  assert_cudaError(err);
  err = cudaMalloc(&Z_d.elements, sizeof(float) * Z->width * Z->height);
  assert_cudaError(err);
  assert_cudaError(cudaMemcpy(X_d.elements, X->elements,
                              sizeof(float) * X->width * X->height,
                              cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d.elements, Y->elements,
                              sizeof(float) * Y->width * Y->height,
                              cudaMemcpyHostToDevice));
  // Z is written entirely by the kernel, so no host-to-device copy is needed.
  // Block the computation over Z: one thread per output element, with
  // y indexing height and x indexing width. The block shape must match
  // TILE_SIZE for the tiled kernel; round the grid up so the whole of Z
  // is covered even when its dimensions are not multiples of the tile.
  dim3 threadsPerBlock1(TILE_SIZE, TILE_SIZE);
  dim3 blocksPerGrid((Z->width + threadsPerBlock1.x - 1) / threadsPerBlock1.x,
                     (Z->height + threadsPerBlock1.y - 1) / threadsPerBlock1.y);
  mat_mul_tiled<<<blocksPerGrid, threadsPerBlock1>>>(X_d, Y_d, Z_d);
  assert_cudaError(cudaGetLastError());
  assert_cudaError(cudaMemcpy(Z->elements, Z_d.elements,
                              sizeof(float) * Z_d.width * Z_d.height,
                              cudaMemcpyDeviceToHost));
  Matrix_print(Z);
  cudaFree(X_d.elements);
  cudaFree(Y_d.elements);
  cudaFree(Z_d.elements);
  free(X->elements);
  free(X);
  free(Y->elements);
  free(Y);
  free(Z->elements);
  free(Z);
  return 0;
}
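To try it out (a sketch; the filename matmul.cu and a standard CUDA toolchain are assumptions, not part of the gist):

nvcc matmul.cu -o matmul
./matmul

Since X is filled with 3.0, Y with 5.0, and the inner dimension is 100, every entry printed for Z should be 3.0 * 5.0 * 100 = 1500.0.

A minimal host-side verification sketch along the same lines (the Matrix_check function is hypothetical, not part of the gist) recomputes the product on the CPU and compares within a small tolerance:

#include <math.h>
// Hypothetical CPU reference: recompute Z = X * Y in row-major order and
// compare against the GPU result.
static int Matrix_check(const struct Matrix *X, const struct Matrix *Y,
                        const struct Matrix *Z) {
  for (int i = 0; i < Z->height; i++) {
    for (int j = 0; j < Z->width; j++) {
      float acc = 0.0f;
      for (int k = 0; k < X->width; k++)
        acc += X->elements[i * X->width + k] * Y->elements[k * Y->width + j];
      if (fabsf(acc - Z->elements[i * Z->width + j]) > 1e-3f)
        return 0; // mismatch
    }
  }
  return 1; // all entries match
}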