

@crrapi
Created December 17, 2024 02:23
Optimal matrix multiplication as a CUDA kernel
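A minimal build-and-run sketch (assuming the source below is saved as matmul.cu, nvcc is on the PATH, and the -arch flag matches the target GPU):

nvcc -O3 -arch=sm_70 -o matmul matmul.cu
./matmul                 # default N=1024, K=512, M=2048
./matmul 512 512 512     # custom N K M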
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <chrono>
#include <iostream>
#include <vector>
#include <cassert>
// Precision selection: set USE_HALF_PRECISION to 1 to compute in half precision
// (device-side half arithmetic requires compute capability 5.3 or higher)
#define USE_HALF_PRECISION 0
#if USE_HALF_PRECISION
#include <cuda_fp16.h>
typedef half FloatType;
#else
typedef float FloatType;
#endif
// Macro for CUDA error checking
#define CUDA_CHECK_ERROR(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error at %s:%d - %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Constants for default matrix sizes
const int DEFAULT_N = 1024; // Rows of A and C
const int DEFAULT_K = 512; // Columns of A and Rows of B
const int DEFAULT_M = 2048; // Columns of B and C
// Utility function to get current time for profiling
inline double getCurrentTime() {
    using namespace std::chrono;
    return duration<double, std::milli>(steady_clock::now().time_since_epoch()).count();
}
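// Note: steady_clock's epoch is arbitrary, so only differences between two
// getCurrentTime() calls are meaningful. An alternative sketch using CUDA
// events, which time the work as seen by the device itself:
//   cudaEvent_t t0, t1;
//   cudaEventCreate(&t0); cudaEventCreate(&t1);
//   cudaEventRecord(t0, stream);
//   /* ... enqueue kernel ... */
//   cudaEventRecord(t1, stream);
//   cudaEventSynchronize(t1);
//   float ms; cudaEventElapsedTime(&ms, t0, t1);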
// Utility function to initialize matrices with random values
void initializeMatrix(FloatType* mat, int rows, int cols) {
    for (int i = 0; i < rows * cols; ++i)
        // Generate in float, then convert, so the half-precision path works too
        mat[i] = static_cast<FloatType>(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
}
// Utility function to verify the correctness of GPU results against CPU computation
bool verifyResult(const FloatType* h_A, const FloatType* h_B, const FloatType* h_C,
                  int N, int M, int K, float epsilon = 1e-2f) {
    // Note: this CPU reference is O(N*M*K); it exits early on the first mismatch.
    bool correct = true;
    for (int row = 0; row < N && correct; ++row) {
        for (int col = 0; col < M && correct; ++col) {
            // Accumulate the reference result in float so the comparison is
            // meaningful even when FloatType is half
            float cpu_val = 0.0f;
            for (int e = 0; e < K; ++e)
                cpu_val += static_cast<float>(h_A[row * K + e]) * static_cast<float>(h_B[e * M + col]);
            float gpu_val = static_cast<float>(h_C[row * M + col]);
            float diff = fabsf(gpu_val - cpu_val);
            if (diff > epsilon) {
                printf("Mismatch at C[%d][%d]: GPU = %f, CPU = %f, Diff = %f\n",
                       row, col, gpu_val, cpu_val, diff);
                correct = false;
            }
        }
    }
    return correct;
}
// CUDA kernel for advanced tiled matrix multiplication
template <int TILE_WIDTH>
__global__ void matrixMulAdvancedKernel(const FloatType* __restrict__ d_A,
                                        const FloatType* __restrict__ d_B,
                                        FloatType* __restrict__ d_C,
                                        int N, int M, int K) {
    // Shared memory for tiles of A and B (dynamically sized at launch)
    extern __shared__ FloatType shared_mem[];
    FloatType* ds_A = shared_mem;
    FloatType* ds_B = shared_mem + TILE_WIDTH * TILE_WIDTH;
    // Calculate global row and column indices
    int global_row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int global_col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    FloatType Cvalue = 0.0f;
    // Loop over tiles of A and B
    for (int tile = 0; tile < (K + TILE_WIDTH - 1) / TILE_WIDTH; ++tile) {
        // Load data into shared memory with boundary checks
        if (global_row < N && (tile * TILE_WIDTH + threadIdx.x) < K)
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_A[global_row * K + tile * TILE_WIDTH + threadIdx.x];
        else
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;
        if (global_col < M && (tile * TILE_WIDTH + threadIdx.y) < K)
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_B[(tile * TILE_WIDTH + threadIdx.y) * M + global_col];
        else
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;
        __syncthreads();
        // Compute partial dot products from the current tiles
        #pragma unroll
        for (int i = 0; i < TILE_WIDTH; ++i)
            Cvalue += ds_A[threadIdx.y * TILE_WIDTH + i] * ds_B[i * TILE_WIDTH + threadIdx.x];
        __syncthreads();
    }
    // Write the result to global memory
    if (global_row < N && global_col < M)
        d_C[global_row * M + global_col] = Cvalue;
}
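// Why tiling helps: in a naive kernel, every thread computing C[row][col]
// streams K elements of A and K elements of B straight from global memory, so
// each element of A is re-fetched by all TILE_WIDTH threads in its block row
// (and likewise for B). Here each tile element is loaded into shared memory
// once and reused TILE_WIDTH times, cutting global-memory traffic by roughly
// a factor of TILE_WIDTH.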
// Device Manager: Initializes CUDA device and retrieves properties
struct DeviceManager {
    int device_id;
    cudaDeviceProp device_prop;

    DeviceManager() {
        CUDA_CHECK_ERROR(cudaGetDevice(&device_id));
        CUDA_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
        printf("Using CUDA Device [%d]: %s\n", device_id, device_prop.name);
        printf("  Compute Capability: %d.%d\n", device_prop.major, device_prop.minor);
        printf("  Total Global Memory: %.2f GB\n", (float)device_prop.totalGlobalMem / (1 << 30));
        printf("  Shared Memory per Block: %.2f KB\n", (float)device_prop.sharedMemPerBlock / 1024.0f);
        printf("  CUDA Cores: %d\n", device_prop.multiProcessorCount *
               _ConvertSMVer2Cores(device_prop.major, device_prop.minor));
    }

private:
    // Map SM version to FP32 cores per SM (mirrors the table in NVIDIA's helper_cuda.h)
    int _ConvertSMVer2Cores(int major, int minor) {
        typedef struct {
            int SM;
            int Cores;
        } sSMtoCores;
        sSMtoCores nGpuArchCoresPerSM[] = {
            { 0x30, 192 }, // Kepler
            { 0x32, 192 },
            { 0x35, 192 },
            { 0x37, 192 },
            { 0x50, 128 }, // Maxwell
            { 0x52, 128 },
            { 0x53, 128 },
            { 0x60,  64 }, // Pascal
            { 0x61, 128 },
            { 0x62, 128 },
            { 0x70,  64 }, // Volta
            { 0x72,  64 },
            { 0x75,  64 }, // Turing
            { 0x80,  64 }, // Ampere (GA100)
            { 0x86, 128 }, // Ampere (GA10x)
            { 0x87, 128 },
            { 0x89, 128 }, // Ada
            { 0x90, 128 }, // Hopper
            {   -1,  -1 }
        };
        int index = 0;
        while (nGpuArchCoresPerSM[index].SM != -1) {
            if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
                return nGpuArchCoresPerSM[index].Cores;
            }
            index++;
        }
        printf("MapSMtoCores undefined SM version %d.%d!\n", major, minor);
        return -1;
    }
};
// Memory Manager: Handles memory allocations and data transfers
struct MemoryManager {
    FloatType *h_A, *h_B, *h_C;
    FloatType *d_A, *d_B, *d_C;
    size_t size_A, size_B, size_C;

    MemoryManager(int N, int M, int K) {
        // Use size_t arithmetic so large dimensions do not overflow int
        size_A = (size_t)N * K * sizeof(FloatType);
        size_B = (size_t)K * M * sizeof(FloatType);
        size_C = (size_t)N * M * sizeof(FloatType);
        // Allocate host memory
        h_A = (FloatType*)malloc(size_A);
        h_B = (FloatType*)malloc(size_B);
        h_C = (FloatType*)malloc(size_C);
        if (!h_A || !h_B || !h_C) {
            fprintf(stderr, "Failed to allocate host matrices.\n");
            exit(EXIT_FAILURE);
        }
        // Initialize host matrices
        initializeMatrix(h_A, N, K);
        initializeMatrix(h_B, K, M);
        // Allocate device memory
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_A, size_A));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_B, size_B));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_C, size_C));
        // Initialize device output to zero
        CUDA_CHECK_ERROR(cudaMemset(d_C, 0, size_C));
    }

    ~MemoryManager() {
        // Free host memory
        free(h_A);
        free(h_B);
        free(h_C);
        // Free device memory
        CUDA_CHECK_ERROR(cudaFree(d_A));
        CUDA_CHECK_ERROR(cudaFree(d_B));
        CUDA_CHECK_ERROR(cudaFree(d_C));
    }

    // Asynchronous data transfer to device on the given stream
    void transferToDevice(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_A, h_A, size_A, cudaMemcpyHostToDevice, stream));
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_B, h_B, size_B, cudaMemcpyHostToDevice, stream));
    }

    // Asynchronous data transfer from device on the given stream
    void transferToHost(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(h_C, d_C, size_C, cudaMemcpyDeviceToHost, stream));
    }
};
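// Note: cudaMemcpyAsync only truly overlaps with other work when the host
// buffers are pinned; with the pageable malloc buffers used above, the copies
// are staged through an internal pinned buffer and behave mostly synchronously.
// A sketch of the pinned-allocation variant (replacing malloc/free):
//   CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_A, size_A));
//   ...
//   CUDA_CHECK_ERROR(cudaFreeHost(h_A));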
// Function to calculate GFLOPS
double calculateGFLOPS(int N, int M, int K, double time_ms) {
    double flops = 2.0 * N * M * K;
    double gflops = flops / (time_ms * 1e6);
    return gflops;
}
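// Worked example with the defaults: flops = 2 * 1024 * 2048 * 512
// = 2,147,483,648 (about 2.15 GFLOP); a 2 ms kernel would therefore
// report roughly 1074 GFLOPS.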
// Function to select optimal TILE_WIDTH based on shared memory
int selectOptimalTileWidth(const cudaDeviceProp& prop) {
    // Choose the largest TILE_WIDTH whose two tiles fit in shared memory and
    // whose TILE_WIDTH x TILE_WIDTH block stays within the thread-per-block
    // limit; candidates are ordered largest first so the best fit wins
    std::vector<int> possible_tile_sizes = {32, 16, 8};
    for (auto tile_size : possible_tile_sizes) {
        size_t required_shared_mem = 2 * tile_size * tile_size * sizeof(FloatType);
        if (required_shared_mem <= prop.sharedMemPerBlock &&
            tile_size * tile_size <= prop.maxThreadsPerBlock)
            return tile_size;
    }
    // Fall back to 16 if no candidate fits
    return 16;
}
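// In practice the best tile size is usually found empirically by benchmarking
// each candidate on the target GPU; this heuristic only checks resource limits
// and ignores occupancy effects.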
// Execution Engine: Manages kernel launches and performance measurements
struct ExecutionEngine {
    int N, M, K;
    int TILE_WIDTH;
    size_t shared_mem_size;
    MemoryManager* mem_manager;
    cudaStream_t stream;

    ExecutionEngine(int n, int m, int k, int tile_width, MemoryManager* mm)
        : N(n), M(m), K(k), TILE_WIDTH(tile_width), mem_manager(mm), stream(0) {
        // Dynamic shared memory: 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType)
        shared_mem_size = 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType);
    }

    // Initialize CUDA stream for asynchronous operations
    void initializeStream() {
        CUDA_CHECK_ERROR(cudaStreamCreate(&stream));
    }

    // Destroy CUDA stream
    void destroyStream() {
        CUDA_CHECK_ERROR(cudaStreamDestroy(stream));
    }

    // Launch the matrix multiplication kernel and return elapsed time in ms
    double launchKernel() {
        // Define grid and block dimensions
        dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
        dim3 dimGrid((M + TILE_WIDTH - 1) / TILE_WIDTH,
                     (N + TILE_WIDTH - 1) / TILE_WIDTH, 1);
        // Record start time
        double start_time = getCurrentTime();
        // Launch the kernel with dynamic shared memory
        switch (TILE_WIDTH) {
            case 8:
                matrixMulAdvancedKernel<8><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 16:
                matrixMulAdvancedKernel<16><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 32:
                matrixMulAdvancedKernel<32><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            default:
                fprintf(stderr, "Unsupported TILE_WIDTH: %d\n", TILE_WIDTH);
                exit(EXIT_FAILURE);
        }
        // Check for kernel launch errors
        CUDA_CHECK_ERROR(cudaGetLastError());
        // Wait for the kernel to finish before recording the end time
        CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));
        double end_time = getCurrentTime();
        double elapsed_time = end_time - start_time;
        printf("Kernel execution time: %.3f ms\n", elapsed_time);
        return elapsed_time;
    }
};
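// Note on the switch in launchKernel: TILE_WIDTH is a template parameter, so
// it must be known at compile time; that lets the compiler fully unroll the
// "#pragma unroll" loop and fold the shared-memory indexing into constants.
// Supporting another tile size means adding both a case above and a candidate
// in selectOptimalTileWidth.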
int main(int argc, char** argv) {
    // Initialize CUDA device and retrieve properties
    DeviceManager device_manager;
    // Define matrix dimensions
    int N = DEFAULT_N; // Rows of A and C
    int K = DEFAULT_K; // Columns of A and rows of B
    int M = DEFAULT_M; // Columns of B and C
    // Allow custom dimensions via command-line arguments
    if (argc == 4) {
        N = atoi(argv[1]);
        K = atoi(argv[2]);
        M = atoi(argv[3]);
        printf("Custom matrix dimensions set: N=%d, K=%d, M=%d\n", N, K, M);
    } else {
        printf("Using default matrix dimensions: N=%d, K=%d, M=%d\n", N, K, M);
    }
    // Select TILE_WIDTH based on device limits
    int TILE_WIDTH = selectOptimalTileWidth(device_manager.device_prop);
    printf("Selected TILE_WIDTH: %d\n", TILE_WIDTH);
    // Initialize Memory Manager
    MemoryManager mem_manager(N, M, K);
    // Initialize Execution Engine
    ExecutionEngine exec_engine(N, M, K, TILE_WIDTH, &mem_manager);
    exec_engine.initializeStream();
    // Transfer data to device; synchronize before stopping the timer so the
    // measurement covers the copy itself, not just the asynchronous enqueue
    double transfer_to_device_start = getCurrentTime();
    mem_manager.transferToDevice(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_device_end = getCurrentTime();
    printf("Data transfer to device time: %.3f ms\n", transfer_to_device_end - transfer_to_device_start);
    // Launch kernel and measure execution time
    double kernel_time = exec_engine.launchKernel();
    // Transfer result back to host
    double transfer_to_host_start = getCurrentTime();
    mem_manager.transferToHost(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_host_end = getCurrentTime();
    printf("Data transfer to host time: %.3f ms\n", transfer_to_host_end - transfer_to_host_start);
    // Destroy CUDA stream
    exec_engine.destroyStream();
    // Validate the result
    printf("Verifying results...\n");
    bool is_correct = verifyResult(mem_manager.h_A, mem_manager.h_B, mem_manager.h_C, N, M, K);
    if (is_correct)
        printf("Matrix multiplication successful and verified.\n");
    else
        printf("Matrix multiplication failed verification.\n");
    // Calculate and display GFLOPS
    double gflops = calculateGFLOPS(N, M, K, kernel_time);
    printf("Performance: %.3f GFLOPS\n", gflops);
    return 0;
}