Skip to content

Instantly share code, notes, and snippets.

@Bhavya031
Created September 19, 2023 14:53
Show Gist options
  • Save Bhavya031/735a07178ed79d3f72b52a99d0aa8f68 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <cuda_runtime.h>
const int N = 256; // Matrix dimension: all matrices are N x N, stored row-major in flat arrays
// Kernel: naive dense matrix multiplication C = A * B (one thread per output element)
// Computes C = A * B for square n x n row-major matrices.
//
// Expected launch: a 2D grid of 2D blocks covering at least n x n threads;
// each thread produces one element of C. Threads outside the matrix are
// masked out by the bounds check, so n need not divide the block size.
// No shared memory is used.
//
// A and B are read-only here; marking them const __restrict__ lets the
// compiler route their loads through the read-only data cache. Passing
// plain float* from the host caller still works (qualification is
// backward-compatible).
__global__ void matrixMultiply(const float* __restrict__ A,
                               const float* __restrict__ B,
                               float* __restrict__ C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0.0f;
        // Dot product of row `row` of A with column `col` of B.
        for (int k = 0; k < n; ++k) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}
// Error-check helper: prints the CUDA error for a failed call and returns
// false so the caller can skip the remaining GPU work but still clean up.
static bool checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(err) << "\n";
        return false;
    }
    return true;
}

// Host driver: allocate N x N matrices, multiply them on the GPU with
// matrixMultiply, copy the result back, verify it, and release all memory.
// Returns 0 on success, 1 on any CUDA failure or result mismatch.
int main() {
    const size_t bytes = static_cast<size_t>(N) * N * sizeof(float);

    // Host matrices.
    float* h_A = new float[N * N];
    float* h_B = new float[N * N];
    float* h_C = new float[N * N];

    // Initialize the inputs. (The original code left them uninitialized —
    // the "Initialize matrices" step was a comment only — so the product
    // was garbage.) With A = 1 and B = 2 everywhere, every element of C
    // must equal 2*N, which gives a cheap exact correctness check below.
    for (int i = 0; i < N * N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Device matrices.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    bool ok = checkCuda(cudaMalloc((void**)&d_A, bytes), "cudaMalloc A")
           && checkCuda(cudaMalloc((void**)&d_B, bytes), "cudaMalloc B")
           && checkCuda(cudaMalloc((void**)&d_C, bytes), "cudaMalloc C");

    // Copy inputs host -> device.
    if (ok) {
        ok = checkCuda(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "copy A")
          && checkCuda(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "copy B");
    }

    if (ok) {
        // One thread per output element; round the grid up so N need not
        // be a multiple of the block edge (16).
        dim3 blockDim(16, 16);
        dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
                     (N + blockDim.y - 1) / blockDim.y);
        matrixMultiply<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
        // Kernel launches are asynchronous and do not return a status;
        // launch-configuration errors must be fetched explicitly.
        ok = checkCuda(cudaGetLastError(), "kernel launch");
    }

    // Copy result device -> host. A blocking cudaMemcpy waits for the
    // kernel to finish, so no separate cudaDeviceSynchronize is needed.
    if (ok) {
        ok = checkCuda(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "copy C");
    }

    // Verify against the known closed-form answer (exact in float:
    // 2*N = 512 is representable and the sum accumulates exactly).
    if (ok) {
        const float expected = 2.0f * N;
        for (int i = 0; i < N * N && ok; ++i) {
            if (h_C[i] != expected) {
                std::cerr << "Mismatch at index " << i << ": " << h_C[i]
                          << " != " << expected << "\n";
                ok = false;
            }
        }
    }

    // Free device memory. cudaFree(nullptr) is a no-op, so this is safe
    // even when an earlier allocation failed.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory.
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment