Get started with matmul on CUDA — following https://siboehm.com/articles/22/CUDA-MMM
// nvcc hello_matmul.cu -o hello_matmul
// ./hello_matmul

// Quick setup for cloud GPU with VS Code (Modal):
//
// $ pip install modal && modal setup
// $ modal volume create learn-cuda
// $ modal launch vscode --gpu t4 --volume learn-cuda --image nvidia/cuda:12.4.0-devel-ubuntu22.04

#include <cstdio>
#include <cstdlib> // rand(), RAND_MAX
#include <cuda_runtime.h>
#include <string>

#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
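// Example: CEIL_DIV(1000, 32) == 32, i.e. one extra, partially-filled block
// when the dimension isn't a multiple of 32.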

#define CUDA_CHECK(call)                                                       \
  {                                                                            \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error in %s at line %d: %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(err));                                        \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }

__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
                            const float *B, float beta, float *C) {
  // compute position in C that this thread is responsible for
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // `if` condition is necessary for when M or N aren't multiples of 32.
  if (x < M && y < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[x * K + i] * B[i * N + y];
    }
    // C = α*(A@B)+β*C
    C[x * N + y] = alpha * tmp + beta * C[x * N + y];
  }
}

int main(int argc, const char *argv[]) {
  if (argc != 4) {
    fprintf(stderr, "usage: %s <M> <N> <K>\n", argv[0]);
    return 1;
  }

  int M = std::stoi(argv[1]);
  int N = std::stoi(argv[2]);
  int K = std::stoi(argv[3]);
  float alpha = 1.0;
  float beta = 1.0;

  // Allocate memory on the host
  float *h_A = new float[M * K];
  float *h_B = new float[K * N];
  float *h_C = new float[M * N];

  // Initialize matrices with random values
  for (int i = 0; i < M * K; ++i) h_A[i] = static_cast<float>(rand()) / RAND_MAX;
  for (int i = 0; i < K * N; ++i) h_B[i] = static_cast<float>(rand()) / RAND_MAX;
  for (int i = 0; i < M * N; ++i) h_C[i] = 0.0f;

  // Allocate memory on the GPU
  float *d_A, *d_B, *d_C;
  CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));

  // Copy data from host to device
  CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice));

  // Set up timing
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  // create as many blocks as necessary to map all of C
  dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
  dim3 blockDim(32, 32, 1);
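  // Each block is 32 x 32 = 1024 threads, the maximum threads per block on
  // current NVIDIA GPUs; the grid covers all of C with
  // CEIL_DIV(M, 32) x CEIL_DIV(N, 32) blocks.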

  // Start recording time
  CUDA_CHECK(cudaEventRecord(start));

  // launch the kernel
  sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
  CUDA_CHECK(cudaGetLastError()); // Check for launch errors
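  // Note: the launch above is asynchronous, and cudaGetLastError() only catches
  // launch/configuration errors. cudaEventSynchronize(stop) below waits until
  // the kernel has finished, so the elapsed time between the two events covers it.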

  // Stop recording time
  CUDA_CHECK(cudaEventRecord(stop));
  CUDA_CHECK(cudaEventSynchronize(stop));
  float milliseconds = 0;
  CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

  // Copy result back to host
  CUDA_CHECK(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));

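  // Each of the M*N outputs takes K multiplies and K-1 adds, so the kernel does
  // M*N*(2K-1) floating-point ops; dividing by (milliseconds * 1e6) gives GFLOP/s.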
  const float gflops = ((float) M * N * (2 * K - 1)) / (1e6 * milliseconds);
  printf("SGEMM execution time: %.2f ms, %.1f GFLOPs/s\n", milliseconds, gflops);

  // Cleanup
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));
  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
  return 0;
}
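
A quick way to sanity-check the kernel output (not part of the gist above) is to compare a few entries of h_C against a straightforward CPU reference. The helper name sgemm_ref and the tolerance below are illustrative assumptions, not from the original code.

// Hypothetical CPU reference for spot-checking the GPU result (sketch only).
void sgemm_ref(int M, int N, int K, float alpha, const float *A,
               const float *B, float beta, float *C) {
  for (int x = 0; x < M; ++x) {
    for (int y = 0; y < N; ++y) {
      float tmp = 0.0f;
      for (int i = 0; i < K; ++i) {
        tmp += A[x * K + i] * B[i * N + y];
      }
      C[x * N + y] = alpha * tmp + beta * C[x * N + y];
    }
  }
}
// Usage sketch: run sgemm_ref on a copy of the original host C, then compare a
// handful of entries against h_C with a small tolerance (e.g. 1e-3f * K), since
// float accumulation order differs between CPU and GPU.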