Skip to content

Instantly share code, notes, and snippets.

@f0ster
Created January 7, 2025 18:48
Show Gist options
  • Save f0ster/65f03a880c112bf06b4a81613074ea7a to your computer and use it in GitHub Desktop.
Custom CUDA Kernel Example
#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
// CUDA Kernel
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
//
// Expects a 1D grid of 1D blocks with gridDim.x * blockDim.x >= n; threads
// whose flat index falls past n exit via the bounds check, so n need not be
// a multiple of the block size. The read-only inputs are marked
// `const __restrict__` so the compiler may route them through the
// read-only data cache and reorder loads.
__global__ void add_arrays(const float* __restrict__ a,
                           const float* __restrict__ b,
                           float* __restrict__ c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
// Host driver: fills two N-element arrays, adds them on the GPU, and
// verifies the result against the CPU reference (a[i] + b[i] == 3*i).
// Returns EXIT_SUCCESS on success; aborts with a message on any CUDA
// error or verification mismatch.
int main() {
    const int N = 1 << 20;  // 1,048,576 elements
    // Widen before multiplying so the byte count cannot overflow int
    // for larger N.
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    // Every CUDA runtime call returns a cudaError_t; kernel launches are
    // asynchronous and report config errors via cudaGetLastError() and
    // execution faults at the next synchronizing call — so check all of it.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << "CUDA error (" << what << "): "
                      << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Allocate host memory
    float *h_a = new float[N];
    float *h_b = new float[N];
    float *h_c = new float[N];

    // Initialize host arrays: a[i] = i, b[i] = 2i, so c[i] should be 3i.
    for (int i = 0; i < N; i++) {
        h_a[i] = static_cast<float>(i);
        h_b[i] = static_cast<float>(i * 2);
    }

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    check(cudaMalloc(&d_a, size), "cudaMalloc d_a");
    check(cudaMalloc(&d_b, size), "cudaMalloc d_b");
    check(cudaMalloc(&d_c, size), "cudaMalloc d_c");

    // Copy inputs to device
    check(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "H2D a");
    check(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "H2D b");

    // Launch kernel: ceil-div so the grid covers all N elements even when
    // N is not a multiple of the block size.
    int threads_per_block = 256;
    int blocks_per_grid = (N + threads_per_block - 1) / threads_per_block;
    add_arrays<<<blocks_per_grid, threads_per_block>>>(d_a, d_b, d_c, N);
    check(cudaGetLastError(), "kernel launch");
    // The blocking cudaMemcpy below would also synchronize, but an explicit
    // sync attributes in-kernel faults to the kernel rather than the copy.
    check(cudaDeviceSynchronize(), "kernel execution");

    // Copy result back to host
    check(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "D2H c");

    // Print a small sample of the output (kept from the original).
    for (int i = 0; i < 10; i++) {
        std::cout << h_c[i] << std::endl;
    }

    // Actually verify against the CPU reference: h_c[i] must equal
    // h_a[i] + h_b[i] exactly (integer-valued floats; addition is exact
    // here up to 2^24, and 3*(N-1) < 2^24 for N = 2^20).
    for (int i = 0; i < N; i++) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            std::cerr << "Mismatch at " << i << ": got " << h_c[i]
                      << ", expected " << h_a[i] + h_b[i] << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }

    // Free memory
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    check(cudaFree(d_a), "cudaFree d_a");
    check(cudaFree(d_b), "cudaFree d_b");
    check(cudaFree(d_c), "cudaFree d_c");
    return 0;
}
@f0ster
Copy link
Author

f0ster commented Jan 7, 2025

nvcc -ccbin=/usr/bin/gcc-13.2 add_arrays.cu -o add_arrays -std=c++17 -lstdc++

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment