Created
January 7, 2025 18:48
-
-
Save f0ster/65f03a880c112bf06b4a81613074ea7a to your computer and use it in GitHub Desktop.
Custom CUDA Kernel Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cuda.h> | |
// CUDA kernel: element-wise sum, c[i] = a[i] + b[i] for i in [0, n).
// Expects a 1D grid of 1D blocks with at least n total threads; surplus
// threads are guarded out below. `a` and `b` are read-only and must not
// alias `c` (__restrict__ lets the compiler use the read-only data cache).
__global__ void add_arrays(const float* __restrict__ a,
                           const float* __restrict__ b,
                           float* __restrict__ c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds check: the grid rarely divides n evenly.
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
// Aborts with a file:line diagnostic if a CUDA runtime call fails.
// Every runtime call returns cudaError_t; ignoring it hides sticky errors.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Driver: fills two host arrays, adds them on the GPU via add_arrays,
// and verifies the full result against the CPU reference.
// Returns 0 on success, 1 on a verification mismatch.
int main() {
    const int N = 1 << 20;                 // 2^20 (~1M) elements
    const size_t size = N * sizeof(float); // bytes per array

    // Allocate and initialize host arrays.
    float *h_a = new float[N];
    float *h_b = new float[N];
    float *h_c = new float[N];
    for (int i = 0; i < N; i++) {
        h_a[i] = static_cast<float>(i);
        h_b[i] = static_cast<float>(i * 2);
    }

    // Allocate device memory.
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // Copy inputs to the device.
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // Launch: ceil-divide so the grid covers all N elements.
    const int threads_per_block = 256;
    const int blocks_per_grid =
        (N + threads_per_block - 1) / threads_per_block;
    add_arrays<<<blocks_per_grid, threads_per_block>>>(d_a, d_b, d_c, N);
    CUDA_CHECK(cudaGetLastError());       // catches bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // surfaces async kernel faults here

    // Copy the result back to the host.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Verify every element against the CPU reference. Exact float equality
    // is valid here: both sides perform the same single-precision addition.
    bool ok = true;
    for (int i = 0; i < N; i++) {
        const float expected = h_a[i] + h_b[i];
        if (h_c[i] != expected) {
            std::cerr << "Mismatch at " << i << ": got " << h_c[i]
                      << ", expected " << expected << std::endl;
            ok = false;
            break;
        }
    }
    std::cout << (ok ? "Result verified." : "Verification FAILED.")
              << std::endl;

    // Free host and device memory.
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return ok ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nvcc -ccbin=/usr/bin/gcc-13.2 add_arrays.cu -o add_arrays -std=c++17 -lstdc++