Skip to content

Instantly share code, notes, and snippets.

@ekzhang
Last active February 2, 2025 19:24
Show Gist options
  • Save ekzhang/5cc261b39d607bae3dbd2a817b897a04 to your computer and use it in GitHub Desktop.
Save ekzhang/5cc261b39d607bae3dbd2a817b897a04 to your computer and use it in GitHub Desktop.
Get started with matmul on CUDA — following https://siboehm.com/articles/22/CUDA-MMM
// nvcc hello_matmul.cu -o hello_matmul
// ./hello_matmul
// Quick setup for cloud GPU with VS Code (Modal):
//
// $ pip install modal && modal setup
// $ modal volume create learn-cuda
// $ modal launch vscode --gpu t4 --volume learn-cuda --image nvidia/cuda:12.4.0-devel-ubuntu22.04
#include <cstdio>
#include <cuda_runtime.h>
#include <string>
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define CUDA_CHECK(call) \
{ \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error in %s at line %d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
const float *B, float beta, float *C) {
// compute position in C that this thread is responsible for
const uint x = blockIdx.x * blockDim.x + threadIdx.x;
const uint y = blockIdx.y * blockDim.y + threadIdx.y;
// `if` condition is necessary for when M or N aren't multiples of 32.
if (x < M && y < N) {
float tmp = 0.0;
for (int i = 0; i < K; ++i) {
tmp += A[x * K + i] * B[i * N + y];
}
// C = α*(A@B)+β*C
C[x * N + y] = alpha * tmp + beta * C[x * N + y];
}
}
int main(int argc, const char* argv[]) {
if (argc != 4) {
fprintf(stderr, "usage: %s <M> <N> <K>\n", argv[0]);
return 1;
}
int M = std::stoi(argv[1]);
int N = std::stoi(argv[2]);
int K = std::stoi(argv[3]);
float alpha = 1.0;
float beta = 1.0;
// Allocate memory on the host
float *h_A = new float[M * K];
float *h_B = new float[K * N];
float *h_C = new float[M * N];
// Initialize matrices with random values
for (int i = 0; i < M * K; ++i) h_A[i] = static_cast<float>(rand()) / RAND_MAX;
for (int i = 0; i < K * N; ++i) h_B[i] = static_cast<float>(rand()) / RAND_MAX;
for (int i = 0; i < M * N; ++i) h_C[i] = 0.0f;
// Allocate memory on the GPU
float *d_A, *d_B, *d_C;
CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
// Copy data from host to device
CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice));
// Set up timing
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
// create as many blocks as necessary to map all of C
dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
dim3 blockDim(32, 32, 1);
// Start recording time
CUDA_CHECK(cudaEventRecord(start));
// launch the kernel
sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
CUDA_CHECK(cudaGetLastError()); // Check for launch errors
// Stop recording time
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
float milliseconds = 0;
CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
// Copy result back to host
CUDA_CHECK(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
const float gflops = ((float) M * N * (2 * K - 1)) / (1e6 * milliseconds);
printf("SGEMM execution time: %.2f ms, %.1f GFLOPs/s\n", milliseconds, gflops);
// Cleanup
delete[] h_A;
delete[] h_B;
delete[] h_C;
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment