Optimized tiled matrix multiplication as a CUDA kernel
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <chrono>
#include <iostream>
#include <vector>
#include <cassert>

// Define constants for precision
#define USE_HALF_PRECISION 0 // Set to 1 to use half precision

#if USE_HALF_PRECISION
#include <cuda_fp16.h>
typedef half FloatType;
#else
typedef float FloatType;
#endif

// Macro for CUDA error checking
#define CUDA_CHECK_ERROR(call)                                        \
    do {                                                              \
        cudaError_t err = call;                                       \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA Error at %s:%d - %s\n",             \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Constants for default matrix sizes
const int DEFAULT_N = 1024; // Rows of A and C
const int DEFAULT_K = 512;  // Columns of A and rows of B
const int DEFAULT_M = 2048; // Columns of B and C

// Utility function to get the current time in milliseconds for profiling
inline double getCurrentTime() {
    using namespace std::chrono;
    return duration<double, std::milli>(steady_clock::now().time_since_epoch()).count();
}
// Utility function to initialize a matrix with random values in [0, 1].
// The division is done in float so this also compiles when FloatType is half
// (host-side half arithmetic is not available on all toolchains).
void initializeMatrix(FloatType* mat, int rows, int cols) {
    for(int i = 0; i < rows * cols; ++i)
        mat[i] = static_cast<FloatType>(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
}
// Utility function to verify the GPU result against a CPU reference.
// The reference is accumulated in float so verification also works when
// FloatType is half.
bool verifyResult(const FloatType* h_A, const FloatType* h_B, const FloatType* h_C,
                  int N, int M, int K, float epsilon = 1e-2f) {
    bool correct = true;
    for(int row = 0; row < N && correct; ++row) {
        for(int col = 0; col < M && correct; ++col) {
            float cpu_val = 0.0f;
            for(int e = 0; e < K; ++e)
                cpu_val += static_cast<float>(h_A[row*K + e]) * static_cast<float>(h_B[e*M + col]);
            float gpu_val = static_cast<float>(h_C[row*M + col]);
            float diff = fabsf(gpu_val - cpu_val);
            if (diff > epsilon) {
                printf("Mismatch at C[%d][%d]: GPU = %f, CPU = %f, Diff = %f\n",
                       row, col, gpu_val, cpu_val, diff);
                correct = false;
            }
        }
    }
    return correct;
}
// CUDA kernel for tiled matrix multiplication using shared memory
template <int TILE_WIDTH>
__global__ void matrixMulAdvancedKernel(const FloatType* __restrict__ d_A,
                                        const FloatType* __restrict__ d_B,
                                        FloatType* __restrict__ d_C,
                                        int N, int M, int K) {
    // Shared memory for tiles of A and B
    extern __shared__ FloatType shared_mem[];
    FloatType* ds_A = shared_mem;
    FloatType* ds_B = shared_mem + TILE_WIDTH * TILE_WIDTH;

    // Calculate global row and column indices
    int global_row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int global_col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    FloatType Cvalue = 0.0f;

    // Loop over tiles of A and B
    for(int tile = 0; tile < (K + TILE_WIDTH - 1) / TILE_WIDTH; ++tile) {
        // Load data into shared memory with boundary checks
        if (global_row < N && (tile*TILE_WIDTH + threadIdx.x) < K)
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_A[global_row * K + tile * TILE_WIDTH + threadIdx.x];
        else
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;

        if (global_col < M && (tile*TILE_WIDTH + threadIdx.y) < K)
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_B[(tile * TILE_WIDTH + threadIdx.y) * M + global_col];
        else
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;

        __syncthreads();

        // Compute partial products
        #pragma unroll
        for(int i = 0; i < TILE_WIDTH; ++i)
            Cvalue += ds_A[threadIdx.y * TILE_WIDTH + i] * ds_B[i * TILE_WIDTH + threadIdx.x];

        __syncthreads();
    }

    // Write the result to global memory
    if (global_row < N && global_col < M)
        d_C[global_row * M + global_col] = Cvalue;
}
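
// Worked example of the launch geometry (an illustrative note, not part of
// the original code): with the defaults N=1024, K=512, M=2048 and
// TILE_WIDTH=16, the grid is (2048/16) x (1024/16) = 128 x 64 blocks of
// 16x16 threads, each thread produces one element of C, and the tile loop
// runs 512/16 = 32 iterations.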
// Device Manager: initializes the CUDA device and retrieves its properties
struct DeviceManager {
    int device_id;
    cudaDeviceProp device_prop;

    DeviceManager() {
        CUDA_CHECK_ERROR(cudaGetDevice(&device_id));
        CUDA_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
        printf("Using CUDA Device [%d]: %s\n", device_id, device_prop.name);
        printf("  Compute Capability: %d.%d\n", device_prop.major, device_prop.minor);
        printf("  Total Global Memory: %.2f GB\n", (float)device_prop.totalGlobalMem / (1 << 30));
        printf("  Shared Memory per Block: %.2f KB\n", (float)device_prop.sharedMemPerBlock / 1024.0f);
        printf("  CUDA Cores: %d\n", device_prop.multiProcessorCount *
               _ConvertSMVer2Cores(device_prop.major, device_prop.minor));
    }

private:
    // Utility to convert SM version to the number of CUDA cores per SM
    int _ConvertSMVer2Cores(int major, int minor) {
        typedef struct {
            int SM;
            int Cores;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] = {
            { 0x30, 192 }, // Kepler
            { 0x32, 192 },
            { 0x35, 192 },
            { 0x37, 192 },
            { 0x50, 128 }, // Maxwell
            { 0x52, 128 },
            { 0x53, 128 },
            { 0x60,  64 }, // Pascal
            { 0x61, 128 },
            { 0x62, 128 },
            { 0x70,  64 }, // Volta
            { 0x72,  64 },
            { 0x75,  64 }, // Turing
            { 0x80,  64 }, // Ampere (GA100)
            { 0x86, 128 }, // Ampere (GA10x)
            { 0x87, 128 },
            { 0x89, 128 }, // Ada Lovelace
            { 0x90, 128 }, // Hopper
            { -1, -1 }
        };
        int index = 0;
        while(nGpuArchCoresPerSM[index].SM != -1) {
            if(nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
                return nGpuArchCoresPerSM[index].Cores;
            }
            index++;
        }
        printf("MapSMtoCores undefined SM version %d.%d!\n", major, minor);
        return -1;
    }
};
// Memory Manager: handles memory allocations and data transfers
struct MemoryManager {
    FloatType *h_A, *h_B, *h_C;
    FloatType *d_A, *d_B, *d_C;
    size_t size_A, size_B, size_C;

    MemoryManager(int N, int M, int K) {
        size_A = N * K * sizeof(FloatType);
        size_B = K * M * sizeof(FloatType);
        size_C = N * M * sizeof(FloatType);
        // Allocate pinned host memory so the asynchronous copies below can
        // genuinely overlap with other work (cudaMemcpyAsync falls back to a
        // synchronous copy for pageable malloc'd memory)
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_A, size_A));
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_B, size_B));
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_C, size_C));
        // Initialize host matrices
        initializeMatrix(h_A, N, K);
        initializeMatrix(h_B, K, M);

        // Allocate device memory
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_A, size_A));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_B, size_B));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_C, size_C));

        // Initialize device output to zero
        CUDA_CHECK_ERROR(cudaMemset(d_C, 0, size_C));
    }
    ~MemoryManager() {
        // Free pinned host memory
        CUDA_CHECK_ERROR(cudaFreeHost(h_A));
        CUDA_CHECK_ERROR(cudaFreeHost(h_B));
        CUDA_CHECK_ERROR(cudaFreeHost(h_C));
        // Free device memory
        CUDA_CHECK_ERROR(cudaFree(d_A));
        CUDA_CHECK_ERROR(cudaFree(d_B));
        CUDA_CHECK_ERROR(cudaFree(d_C));
    }
    // Asynchronous data transfer to device using streams
    void transferToDevice(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_A, h_A, size_A, cudaMemcpyHostToDevice, stream));
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_B, h_B, size_B, cudaMemcpyHostToDevice, stream));
    }

    // Asynchronous data transfer from device using streams
    void transferToHost(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(h_C, d_C, size_C, cudaMemcpyDeviceToHost, stream));
    }
};
// Function to calculate GFLOPS: a matrix multiply performs 2*N*M*K
// floating-point operations (one multiply and one add per term)
double calculateGFLOPS(int N, int M, int K, double time_ms) {
    double flops = 2.0 * N * M * K;
    double gflops = flops / (time_ms * 1e6);
    return gflops;
}
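
// Worked example (illustrative, not part of the original code): the default
// 1024 x 512 x 2048 problem performs 2 * 1024 * 2048 * 512 ≈ 2.15e9
// floating-point operations, so a kernel that takes 5 ms would report about
// 2.15e9 / (5 * 1e6) ≈ 430 GFLOPS.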
// Function to select a TILE_WIDTH based on shared memory capacity
int selectOptimalTileWidth(const cudaDeviceProp& prop) {
    // Prefer the largest tile that fits the per-block shared memory budget
    // (checking smallest-first would always return 8); this can be enhanced
    // with empirical tuning or heuristics
    std::vector<int> possible_tile_sizes = {32, 16, 8};
    for(auto tile_size : possible_tile_sizes) {
        size_t required_shared_mem = 2 * tile_size * tile_size * sizeof(FloatType);
        if(required_shared_mem <= prop.sharedMemPerBlock)
            return tile_size;
    }
    // Default to 16 if no candidate fits
    return 16;
}
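
// Shared-memory arithmetic for the candidates above (an illustrative note):
// with float data the largest tile needs 2 * 32 * 32 * 4 = 8192 bytes per
// block, well under the 48 KB per-block default of current GPUs, so
// TILE_WIDTH = 32 is normally selected.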
// Execution Engine: manages kernel launches and performance measurements
struct ExecutionEngine {
    int N, M, K;
    int TILE_WIDTH;
    size_t shared_mem_size;
    MemoryManager* mem_manager;
    cudaStream_t stream;

    ExecutionEngine(int n, int m, int k, int tile_width, MemoryManager* mm)
        : N(n), M(m), K(k), TILE_WIDTH(tile_width), mem_manager(mm), stream(0) {
        // Shared memory size: 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType)
        shared_mem_size = 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType);
    }

    // Initialize CUDA stream for asynchronous operations
    void initializeStream() {
        CUDA_CHECK_ERROR(cudaStreamCreate(&stream));
    }

    // Destroy CUDA stream
    void destroyStream() {
        CUDA_CHECK_ERROR(cudaStreamDestroy(stream));
    }

    // Launch the matrix multiplication kernel and return the elapsed time in ms
    double launchKernel() {
        // Define grid and block dimensions
        dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
        dim3 dimGrid((M + TILE_WIDTH - 1) / TILE_WIDTH,
                     (N + TILE_WIDTH - 1) / TILE_WIDTH, 1);

        // Record start time
        double start_time = getCurrentTime();

        // Launch the kernel with dynamic shared memory; the template argument
        // must be a compile-time constant, hence the switch
        switch(TILE_WIDTH) {
            case 8:
                matrixMulAdvancedKernel<8><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 16:
                matrixMulAdvancedKernel<16><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 32:
                matrixMulAdvancedKernel<32><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            default:
                fprintf(stderr, "Unsupported TILE_WIDTH: %d\n", TILE_WIDTH);
                exit(EXIT_FAILURE);
        }

        // Check for kernel launch errors
        CUDA_CHECK_ERROR(cudaGetLastError());

        // Record end time after kernel execution completes
        CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));
        double end_time = getCurrentTime();

        double elapsed_time = end_time - start_time;
        printf("Kernel execution time: %.3f ms\n", elapsed_time);
        return elapsed_time;
    }
};
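
// Optional: device-side timing with CUDA events (a hedged sketch, not part of
// the original flow and not called by main()). getCurrentTime() above measures
// host wall-clock time, which includes launch overhead; cudaEvent_t records
// timestamps on the GPU stream itself.
template <int TILE_WIDTH>
float timeKernelWithEvents(const FloatType* d_A, const FloatType* d_B, FloatType* d_C,
                           int N, int M, int K, size_t shared_mem_size, cudaStream_t stream) {
    cudaEvent_t start, stop;
    CUDA_CHECK_ERROR(cudaEventCreate(&start));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop));

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 dimGrid((M + TILE_WIDTH - 1) / TILE_WIDTH,
                 (N + TILE_WIDTH - 1) / TILE_WIDTH, 1);

    // Bracket the launch with events recorded on the same stream
    CUDA_CHECK_ERROR(cudaEventRecord(start, stream));
    matrixMulAdvancedKernel<TILE_WIDTH><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
        d_A, d_B, d_C, N, M, K);
    CUDA_CHECK_ERROR(cudaEventRecord(stop, stream));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsed_ms, start, stop));
    CUDA_CHECK_ERROR(cudaEventDestroy(start));
    CUDA_CHECK_ERROR(cudaEventDestroy(stop));
    return elapsed_ms;
}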
int main(int argc, char** argv) {
    // Initialize CUDA device and retrieve properties
    DeviceManager device_manager;

    // Define matrix dimensions
    int N = DEFAULT_N; // Rows of A and C
    int K = DEFAULT_K; // Columns of A and rows of B
    int M = DEFAULT_M; // Columns of B and C

    // Allow custom dimensions via command-line arguments
    if(argc == 4) {
        N = atoi(argv[1]);
        K = atoi(argv[2]);
        M = atoi(argv[3]);
        printf("Custom matrix dimensions set: N=%d, K=%d, M=%d\n", N, K, M);
    } else {
        printf("Using default matrix dimensions: N=%d, K=%d, M=%d\n", N, K, M);
    }

    // Select TILE_WIDTH based on shared memory
    int TILE_WIDTH = selectOptimalTileWidth(device_manager.device_prop);
    printf("Selected TILE_WIDTH: %d\n", TILE_WIDTH);

    // Initialize Memory Manager
    MemoryManager mem_manager(N, M, K);

    // Initialize Execution Engine
    ExecutionEngine exec_engine(N, M, K, TILE_WIDTH, &mem_manager);
    exec_engine.initializeStream();
    // Transfer data to device asynchronously; synchronize before reading the
    // clock so the measurement covers the copy itself, not just the enqueue
    double transfer_to_device_start = getCurrentTime();
    mem_manager.transferToDevice(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_device_end = getCurrentTime();
    printf("Data transfer to device time: %.3f ms\n", transfer_to_device_end - transfer_to_device_start);
    // Launch kernel and measure execution time
    double kernel_time = exec_engine.launchKernel();

    // Transfer result back to host asynchronously
    double transfer_to_host_start = getCurrentTime();
    mem_manager.transferToHost(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_host_end = getCurrentTime();
    printf("Data transfer to host time: %.3f ms\n", transfer_to_host_end - transfer_to_host_start);

    // Destroy CUDA stream
    exec_engine.destroyStream();

    // Validate the result
    printf("Verifying results...\n");
    bool is_correct = verifyResult(mem_manager.h_A, mem_manager.h_B, mem_manager.h_C, N, M, K);
    if(is_correct)
        printf("Matrix multiplication successful and verified.\n");
    else
        printf("Matrix multiplication failed verification.\n");

    // Calculate and display GFLOPS
    double gflops = calculateGFLOPS(N, M, K, kernel_time);
    printf("Performance: %.3f GFLOPS\n", gflops);

    return 0;
}
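
To build and run (a minimal sketch; the file name matmul.cu and the -arch flag are assumptions, adjust them to your GPU and toolkit):

nvcc -O3 -std=c++17 -arch=sm_70 matmul.cu -o matmul
./matmul                  # default dimensions: N=1024, K=512, M=2048
./matmul 2048 1024 4096   # custom dimensions, passed as N K M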