ekzhang · February 2, 2025 19:24
diff --git a/hello_matmul.cu b/hello_matmul.cu
 // nvcc hello_matmul.cu -o hello_matmul
 // ./hello_matmul

 // Quick setup for cloud GPU with VS Code (Modal):
 //
 // $ pip install modal && modal setup
 // $ modal volume create learn-cuda
 // $ modal launch vscode --gpu t4 --volume learn-cuda --image nvidia/cuda:12.4.0-devel-ubuntu22.04

 #include <cstdio>
 #include <cuda_runtime.h>
 #include <string>

 #define CEIL_DIV(x, y) (((x) + (y)-1) / (y))

 #define CUDA_CHECK(call)                                                        \
  {                                                                             \
    cudaError_t err = call;                                                     \
    if (err != cudaSuccess) {                                                   \
      fprintf(stderr, "CUDA error in %s at line %d: %s\n", __FILE__, __LINE__,  \
              cudaGetErrorString(err));                                         \
      exit(EXIT_FAILURE);                                                        \
    }                                                                           \
  }

 __global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
                            const float *B, float beta, float *C) {
  // compute position in C that this thread is responsible for
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // `if` condition is necessary for when M or N aren't multiples of 32.
  if (x < M && y < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[x * K + i] * B[i * N + y];
    }
    // C = α*(A@B)+β*C
    C[x * N + y] = alpha * tmp + beta * C[x * N + y];
  }
 }

 int main(int argc, const char* argv[]) {
  if (argc != 4) {
    fprintf(stderr, "usage: %s <M> <N> <K>\n", argv[0]);
    return 1;
  }

  int M = std::stoi(argv[1]);
  int N = std::stoi(argv[2]);
  int K = std::stoi(argv[3]);

  float alpha = 1.0;
  float beta = 1.0;

  // Allocate memory on the host
  float *h_A = new float[M * K];
  float *h_B = new float[K * N];
  float *h_C = new float[M * N];

  // Initialize matrices with random values
  for (int i = 0; i < M * K; ++i) h_A[i] = static_cast<float>(rand()) / RAND_MAX;
  for (int i = 0; i < K * N; ++i) h_B[i] = static_cast<float>(rand()) / RAND_MAX;
  for (int i = 0; i < M * N; ++i) h_C[i] = 0.0f;

  // Allocate memory on the GPU
  float *d_A, *d_B, *d_C;
  CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));

  // Copy data from host to device
  CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice));

  // Set up timing
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  // create as many blocks as necessary to map all of C
  dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
  dim3 blockDim(32, 32, 1);

  // Start recording time
  CUDA_CHECK(cudaEventRecord(start));

  // launch the kernel
  sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
  CUDA_CHECK(cudaGetLastError());  // Check for launch errors

  // Stop recording time
  CUDA_CHECK(cudaEventRecord(stop));
  CUDA_CHECK(cudaEventSynchronize(stop));

  float milliseconds = 0;
  CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

  // Copy result back to host
  CUDA_CHECK(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));

  const float gflops = ((float) M * N * (2 * K - 1)) / (1e6 * milliseconds);
  printf("SGEMM execution time: %.2f ms, %.1f GFLOPs/s\n", milliseconds, gflops);

  // Cleanup
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));
  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));

  return 0;
 }
	// nvcc hello_matmul.cu -o hello_matmul
	// ./hello_matmul

	// Quick setup for cloud GPU with VS Code (Modal):
	//
	// $ pip install modal && modal setup
	// $ modal volume create learn-cuda
	// $ modal launch vscode --gpu t4 --volume learn-cuda --image nvidia/cuda:12.4.0-devel-ubuntu22.04

	#include <cstdio>
	#include <cuda_runtime.h>
	#include <string>

	#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))

	#define CUDA_CHECK(call) \
	{ \
	cudaError_t err = call; \
	if (err != cudaSuccess) { \
	fprintf(stderr, "CUDA error in %s at line %d: %s\n", __FILE__, __LINE__, \
	cudaGetErrorString(err)); \
	exit(EXIT_FAILURE); \
	} \
	}

	__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
	const float B, float beta, float C) {
	// compute position in C that this thread is responsible for
	const uint x = blockIdx.x * blockDim.x + threadIdx.x;
	const uint y = blockIdx.y * blockDim.y + threadIdx.y;

	// `if` condition is necessary for when M or N aren't multiples of 32.
	if (x < M && y < N) {
	float tmp = 0.0;
	for (int i = 0; i < K; ++i) {
	tmp += A[x * K + i] * B[i * N + y];
	}
	// C = α(A@B)+βC
	C[x * N + y] = alpha * tmp + beta * C[x * N + y];
	}
	}

	int main(int argc, const char* argv[]) {
	if (argc != 4) {
	fprintf(stderr, "usage: %s <M> <N> <K>\n", argv[0]);
	return 1;
	}

	int M = std::stoi(argv[1]);
	int N = std::stoi(argv[2]);
	int K = std::stoi(argv[3]);

	float alpha = 1.0;
	float beta = 1.0;

	// Allocate memory on the host
	float h_A = new float[M K];
	float h_B = new float[K N];
	float h_C = new float[M N];

	// Initialize matrices with random values
	for (int i = 0; i < M * K; ++i) h_A[i] = static_cast<float>(rand()) / RAND_MAX;
	for (int i = 0; i < K * N; ++i) h_B[i] = static_cast<float>(rand()) / RAND_MAX;
	for (int i = 0; i < M * N; ++i) h_C[i] = 0.0f;

	// Allocate memory on the GPU
	float d_A, d_B, *d_C;
	CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
	CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
	CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));

	// Copy data from host to device
	CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice));

	// Set up timing
	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	// create as many blocks as necessary to map all of C
	dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
	dim3 blockDim(32, 32, 1);

	// Start recording time
	CUDA_CHECK(cudaEventRecord(start));

	// launch the kernel
	sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
	CUDA_CHECK(cudaGetLastError()); // Check for launch errors

	// Stop recording time
	CUDA_CHECK(cudaEventRecord(stop));
	CUDA_CHECK(cudaEventSynchronize(stop));

	float milliseconds = 0;
	CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

	// Copy result back to host
	CUDA_CHECK(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));

	const float gflops = ((float) M * N * (2 * K - 1)) / (1e6 * milliseconds);
	printf("SGEMM execution time: %.2f ms, %.1f GFLOPs/s\n", milliseconds, gflops);

	// Cleanup
	delete[] h_A;
	delete[] h_B;
	delete[] h_C;
	CUDA_CHECK(cudaFree(d_A));
	CUDA_CHECK(cudaFree(d_B));
	CUDA_CHECK(cudaFree(d_C));
	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));

	return 0;
	}