Optimized tiled matrix multiplication as a CUDA kernel
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <chrono>
#include <iostream>
#include <vector>
#include <cassert>

// Define constants for precision
#define USE_HALF_PRECISION 0 // Set to 1 to use half precision

#if USE_HALF_PRECISION
#include <cuda_fp16.h>
typedef half FloatType;
#else
typedef float FloatType;
#endif

// Macro for CUDA error checking
#define CUDA_CHECK_ERROR(call)                                        \
    do {                                                              \
        cudaError_t err = call;                                       \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA Error at %s:%d - %s\n",             \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Constants for default matrix sizes
const int DEFAULT_N = 1024; // Rows of A and C
const int DEFAULT_K = 512;  // Columns of A and rows of B
const int DEFAULT_M = 2048; // Columns of B and C

// Utility function to get the current time in milliseconds for profiling
inline double getCurrentTime() {
    using namespace std::chrono;
    return duration<double, std::milli>(steady_clock::now().time_since_epoch()).count();
}
// Utility function to initialize a matrix with random values in [0, 1].
// The division is done in float so this also compiles when FloatType is half
// (host-side half arithmetic is not available on all toolchains).
void initializeMatrix(FloatType* mat, int rows, int cols) {
    for(int i = 0; i < rows * cols; ++i)
        mat[i] = static_cast<FloatType>(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
}
// Utility function to verify the GPU result against a CPU reference.
// The reference is accumulated in float so verification also works when
// FloatType is half.
bool verifyResult(const FloatType* h_A, const FloatType* h_B, const FloatType* h_C,
                  int N, int M, int K, float epsilon = 1e-2f) {
    bool correct = true;
    for(int row = 0; row < N && correct; ++row) {
        for(int col = 0; col < M && correct; ++col) {
            float cpu_val = 0.0f;
            for(int e = 0; e < K; ++e)
                cpu_val += static_cast<float>(h_A[row*K + e]) * static_cast<float>(h_B[e*M + col]);
            float gpu_val = static_cast<float>(h_C[row*M + col]);
            float diff = fabsf(gpu_val - cpu_val);
            if (diff > epsilon) {
                printf("Mismatch at C[%d][%d]: GPU = %f, CPU = %f, Diff = %f\n",
                       row, col, gpu_val, cpu_val, diff);
                correct = false;
            }
        }
    }
    return correct;
}
// CUDA kernel for tiled matrix multiplication using shared memory
template <int TILE_WIDTH>
__global__ void matrixMulAdvancedKernel(const FloatType* __restrict__ d_A,
                                        const FloatType* __restrict__ d_B,
                                        FloatType* __restrict__ d_C,
                                        int N, int M, int K) {
    // Shared memory for tiles of A and B
    extern __shared__ FloatType shared_mem[];
    FloatType* ds_A = shared_mem;
    FloatType* ds_B = shared_mem + TILE_WIDTH * TILE_WIDTH;

    // Calculate global row and column indices
    int global_row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int global_col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    FloatType Cvalue = 0.0f;

    // Loop over tiles of A and B
    for(int tile = 0; tile < (K + TILE_WIDTH - 1) / TILE_WIDTH; ++tile) {
        // Load data into shared memory with boundary checks
        if (global_row < N && (tile*TILE_WIDTH + threadIdx.x) < K)
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_A[global_row * K + tile * TILE_WIDTH + threadIdx.x];
        else
            ds_A[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;

        if (global_col < M && (tile*TILE_WIDTH + threadIdx.y) < K)
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = d_B[(tile * TILE_WIDTH + threadIdx.y) * M + global_col];
        else
            ds_B[threadIdx.y * TILE_WIDTH + threadIdx.x] = 0.0f;

        __syncthreads();

        // Compute partial products
        #pragma unroll
        for(int i = 0; i < TILE_WIDTH; ++i)
            Cvalue += ds_A[threadIdx.y * TILE_WIDTH + i] * ds_B[i * TILE_WIDTH + threadIdx.x];

        __syncthreads();
    }

    // Write the result to global memory
    if (global_row < N && global_col < M)
        d_C[global_row * M + global_col] = Cvalue;
}
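
// Worked example of the launch geometry (an illustrative note, not part of
// the original code): with the defaults N=1024, K=512, M=2048 and
// TILE_WIDTH=16, the grid is (2048/16) x (1024/16) = 128 x 64 blocks of
// 16x16 threads, each thread produces one element of C, and the tile loop
// runs 512/16 = 32 iterations.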
// Device Manager: initializes the CUDA device and retrieves its properties
struct DeviceManager {
    int device_id;
    cudaDeviceProp device_prop;

    DeviceManager() {
        CUDA_CHECK_ERROR(cudaGetDevice(&device_id));
        CUDA_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
        printf("Using CUDA Device [%d]: %s\n", device_id, device_prop.name);
        printf("  Compute Capability: %d.%d\n", device_prop.major, device_prop.minor);
        printf("  Total Global Memory: %.2f GB\n", (float)device_prop.totalGlobalMem / (1 << 30));
        printf("  Shared Memory per Block: %.2f KB\n", (float)device_prop.sharedMemPerBlock / 1024.0f);
        printf("  CUDA Cores: %d\n", device_prop.multiProcessorCount *
               _ConvertSMVer2Cores(device_prop.major, device_prop.minor));
    }

private:
    // Utility to convert SM version to the number of CUDA cores per SM
    int _ConvertSMVer2Cores(int major, int minor) {
        typedef struct {
            int SM;
            int Cores;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] = {
            { 0x30, 192 }, // Kepler
            { 0x32, 192 },
            { 0x35, 192 },
            { 0x37, 192 },
            { 0x50, 128 }, // Maxwell
            { 0x52, 128 },
            { 0x53, 128 },
            { 0x60,  64 }, // Pascal
            { 0x61, 128 },
            { 0x62, 128 },
            { 0x70,  64 }, // Volta
            { 0x72,  64 },
            { 0x75,  64 }, // Turing
            { 0x80,  64 }, // Ampere (GA100)
            { 0x86, 128 }, // Ampere (GA10x)
            { 0x87, 128 },
            { 0x89, 128 }, // Ada Lovelace
            { 0x90, 128 }, // Hopper
            { -1, -1 }
        };
        int index = 0;
        while(nGpuArchCoresPerSM[index].SM != -1) {
            if(nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
                return nGpuArchCoresPerSM[index].Cores;
            }
            index++;
        }
        printf("MapSMtoCores undefined SM version %d.%d!\n", major, minor);
        return -1;
    }
};
// Memory Manager: handles memory allocations and data transfers
struct MemoryManager {
    FloatType *h_A, *h_B, *h_C;
    FloatType *d_A, *d_B, *d_C;
    size_t size_A, size_B, size_C;

    MemoryManager(int N, int M, int K) {
        size_A = N * K * sizeof(FloatType);
        size_B = K * M * sizeof(FloatType);
        size_C = N * M * sizeof(FloatType);
        // Allocate pinned host memory so the asynchronous copies below can
        // genuinely overlap with other work (cudaMemcpyAsync falls back to a
        // synchronous copy for pageable malloc'd memory)
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_A, size_A));
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_B, size_B));
        CUDA_CHECK_ERROR(cudaMallocHost((void**)&h_C, size_C));
        // Initialize host matrices
        initializeMatrix(h_A, N, K);
        initializeMatrix(h_B, K, M);

        // Allocate device memory
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_A, size_A));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_B, size_B));
        CUDA_CHECK_ERROR(cudaMalloc((void**)&d_C, size_C));

        // Initialize device output to zero
        CUDA_CHECK_ERROR(cudaMemset(d_C, 0, size_C));
    }
    ~MemoryManager() {
        // Free pinned host memory
        CUDA_CHECK_ERROR(cudaFreeHost(h_A));
        CUDA_CHECK_ERROR(cudaFreeHost(h_B));
        CUDA_CHECK_ERROR(cudaFreeHost(h_C));
        // Free device memory
        CUDA_CHECK_ERROR(cudaFree(d_A));
        CUDA_CHECK_ERROR(cudaFree(d_B));
        CUDA_CHECK_ERROR(cudaFree(d_C));
    }
    // Asynchronous data transfer to device using streams
    void transferToDevice(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_A, h_A, size_A, cudaMemcpyHostToDevice, stream));
        CUDA_CHECK_ERROR(cudaMemcpyAsync(d_B, h_B, size_B, cudaMemcpyHostToDevice, stream));
    }

    // Asynchronous data transfer from device using streams
    void transferToHost(cudaStream_t stream = 0) {
        CUDA_CHECK_ERROR(cudaMemcpyAsync(h_C, d_C, size_C, cudaMemcpyDeviceToHost, stream));
    }
};
// Function to calculate GFLOPS: a matrix multiply performs 2*N*M*K
// floating-point operations (one multiply and one add per term)
double calculateGFLOPS(int N, int M, int K, double time_ms) {
    double flops = 2.0 * N * M * K;
    double gflops = flops / (time_ms * 1e6);
    return gflops;
}
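
// Worked example (illustrative, not part of the original code): the default
// 1024 x 512 x 2048 problem performs 2 * 1024 * 2048 * 512 ≈ 2.15e9
// floating-point operations, so a kernel that takes 5 ms would report about
// 2.15e9 / (5 * 1e6) ≈ 430 GFLOPS.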
// Function to select a TILE_WIDTH based on shared memory capacity
int selectOptimalTileWidth(const cudaDeviceProp& prop) {
    // Prefer the largest tile that fits the per-block shared memory budget
    // (checking smallest-first would always return 8); this can be enhanced
    // with empirical tuning or heuristics
    std::vector<int> possible_tile_sizes = {32, 16, 8};
    for(auto tile_size : possible_tile_sizes) {
        size_t required_shared_mem = 2 * tile_size * tile_size * sizeof(FloatType);
        if(required_shared_mem <= prop.sharedMemPerBlock)
            return tile_size;
    }
    // Default to 16 if no candidate fits
    return 16;
}
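
// Shared-memory arithmetic for the candidates above (an illustrative note):
// with float data the largest tile needs 2 * 32 * 32 * 4 = 8192 bytes per
// block, well under the 48 KB per-block default of current GPUs, so
// TILE_WIDTH = 32 is normally selected.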
// Execution Engine: manages kernel launches and performance measurements
struct ExecutionEngine {
    int N, M, K;
    int TILE_WIDTH;
    size_t shared_mem_size;
    MemoryManager* mem_manager;
    cudaStream_t stream;

    ExecutionEngine(int n, int m, int k, int tile_width, MemoryManager* mm)
        : N(n), M(m), K(k), TILE_WIDTH(tile_width), mem_manager(mm), stream(0) {
        // Shared memory size: 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType)
        shared_mem_size = 2 * TILE_WIDTH * TILE_WIDTH * sizeof(FloatType);
    }

    // Initialize CUDA stream for asynchronous operations
    void initializeStream() {
        CUDA_CHECK_ERROR(cudaStreamCreate(&stream));
    }

    // Destroy CUDA stream
    void destroyStream() {
        CUDA_CHECK_ERROR(cudaStreamDestroy(stream));
    }

    // Launch the matrix multiplication kernel and return the elapsed time in ms
    double launchKernel() {
        // Define grid and block dimensions
        dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
        dim3 dimGrid((M + TILE_WIDTH - 1) / TILE_WIDTH,
                     (N + TILE_WIDTH - 1) / TILE_WIDTH, 1);

        // Record start time
        double start_time = getCurrentTime();

        // Launch the kernel with dynamic shared memory; the template argument
        // must be a compile-time constant, hence the switch
        switch(TILE_WIDTH) {
            case 8:
                matrixMulAdvancedKernel<8><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 16:
                matrixMulAdvancedKernel<16><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            case 32:
                matrixMulAdvancedKernel<32><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
                    mem_manager->d_A, mem_manager->d_B, mem_manager->d_C, N, M, K);
                break;
            default:
                fprintf(stderr, "Unsupported TILE_WIDTH: %d\n", TILE_WIDTH);
                exit(EXIT_FAILURE);
        }

        // Check for kernel launch errors
        CUDA_CHECK_ERROR(cudaGetLastError());

        // Record end time after kernel execution completes
        CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));
        double end_time = getCurrentTime();

        double elapsed_time = end_time - start_time;
        printf("Kernel execution time: %.3f ms\n", elapsed_time);
        return elapsed_time;
    }
};
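
// Optional: device-side timing with CUDA events (a hedged sketch, not part of
// the original flow and not called by main()). getCurrentTime() above measures
// host wall-clock time, which includes launch overhead; cudaEvent_t records
// timestamps on the GPU stream itself.
template <int TILE_WIDTH>
float timeKernelWithEvents(const FloatType* d_A, const FloatType* d_B, FloatType* d_C,
                           int N, int M, int K, size_t shared_mem_size, cudaStream_t stream) {
    cudaEvent_t start, stop;
    CUDA_CHECK_ERROR(cudaEventCreate(&start));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop));

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 dimGrid((M + TILE_WIDTH - 1) / TILE_WIDTH,
                 (N + TILE_WIDTH - 1) / TILE_WIDTH, 1);

    // Bracket the launch with events recorded on the same stream
    CUDA_CHECK_ERROR(cudaEventRecord(start, stream));
    matrixMulAdvancedKernel<TILE_WIDTH><<<dimGrid, dimBlock, shared_mem_size, stream>>>(
        d_A, d_B, d_C, N, M, K);
    CUDA_CHECK_ERROR(cudaEventRecord(stop, stream));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsed_ms, start, stop));
    CUDA_CHECK_ERROR(cudaEventDestroy(start));
    CUDA_CHECK_ERROR(cudaEventDestroy(stop));
    return elapsed_ms;
}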
int main(int argc, char** argv) {
    // Initialize CUDA device and retrieve properties
    DeviceManager device_manager;

    // Define matrix dimensions
    int N = DEFAULT_N; // Rows of A and C
    int K = DEFAULT_K; // Columns of A and rows of B
    int M = DEFAULT_M; // Columns of B and C

    // Allow custom dimensions via command-line arguments
    if(argc == 4) {
        N = atoi(argv[1]);
        K = atoi(argv[2]);
        M = atoi(argv[3]);
        printf("Custom matrix dimensions set: N=%d, K=%d, M=%d\n", N, K, M);
    } else {
        printf("Using default matrix dimensions: N=%d, K=%d, M=%d\n", N, K, M);
    }

    // Select TILE_WIDTH based on shared memory
    int TILE_WIDTH = selectOptimalTileWidth(device_manager.device_prop);
    printf("Selected TILE_WIDTH: %d\n", TILE_WIDTH);

    // Initialize Memory Manager
    MemoryManager mem_manager(N, M, K);

    // Initialize Execution Engine
    ExecutionEngine exec_engine(N, M, K, TILE_WIDTH, &mem_manager);
    exec_engine.initializeStream();
    // Transfer data to device asynchronously; synchronize before reading the
    // clock so the measurement covers the copy itself, not just the enqueue
    double transfer_to_device_start = getCurrentTime();
    mem_manager.transferToDevice(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_device_end = getCurrentTime();
    printf("Data transfer to device time: %.3f ms\n", transfer_to_device_end - transfer_to_device_start);
    // Launch kernel and measure execution time
    double kernel_time = exec_engine.launchKernel();

    // Transfer result back to host asynchronously
    double transfer_to_host_start = getCurrentTime();
    mem_manager.transferToHost(exec_engine.stream);
    CUDA_CHECK_ERROR(cudaStreamSynchronize(exec_engine.stream));
    double transfer_to_host_end = getCurrentTime();
    printf("Data transfer to host time: %.3f ms\n", transfer_to_host_end - transfer_to_host_start);

    // Destroy CUDA stream
    exec_engine.destroyStream();

    // Validate the result
    printf("Verifying results...\n");
    bool is_correct = verifyResult(mem_manager.h_A, mem_manager.h_B, mem_manager.h_C, N, M, K);
    if(is_correct)
        printf("Matrix multiplication successful and verified.\n");
    else
        printf("Matrix multiplication failed verification.\n");

    // Calculate and display GFLOPS
    double gflops = calculateGFLOPS(N, M, K, kernel_time);
    printf("Performance: %.3f GFLOPS\n", gflops);

    return 0;
}
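
To build and run (a minimal sketch; the file name matmul.cu and the -arch flag are assumptions, adjust them to your GPU and toolkit):

nvcc -O3 -std=c++17 -arch=sm_70 matmul.cu -o matmul
./matmul                  # default dimensions: N=1024, K=512, M=2048
./matmul 2048 1024 4096   # custom dimensions, passed as N K M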