#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the source location plus the runtime's error string to stderr and terminates
// the process with EXIT_FAILURE. Device allocations are not released on this
// path; the process is exiting anyway.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t cuda_check_err_ = (call);                          \
        if (cuda_check_err_ != cudaSuccess) {                                \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(cuda_check_err_)         \
                      << std::endl;                                          \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Element-wise addition kernel: c[i] = a[i] + b[i] for every 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The launch must
// supply at least n threads in total; surplus threads in the last block are
// masked off by the bounds check. No shared memory is used.
//
// Parameters:
//   a, b  device pointers to the n input elements (read only)
//   c     device pointer to the n output elements
//   n     number of elements in each array
__global__ void add_arrays(const float *a, const float *b, float *c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Host driver: fills two arrays, adds them on the GPU, prints the first ten
// results, and checks every element against a CPU reference.
// Returns 0 on success and EXIT_FAILURE if any element mismatches; any CUDA
// error terminates the process via CUDA_CHECK.
int main() {
    const int N = 1 << 20;  // 1,048,576 elements
    // Byte count kept in size_t: that is what the CUDA APIs take, and it avoids
    // narrowing the result of sizeof arithmetic into an int.
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);

    // Host buffers; std::vector releases them on every exit path.
    std::vector<float> h_a(N);
    std::vector<float> h_b(N);
    std::vector<float> h_c(N);

    // Initialize host arrays: a[i] = i, b[i] = 2 * i.
    for (int i = 0; i < N; i++) {
        h_a[i] = static_cast<float>(i);
        h_b[i] = static_cast<float>(i * 2);
    }

    // Allocate device memory.
    float *d_a = nullptr;
    float *d_b = nullptr;
    float *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // Copy inputs to the device.
    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice));

    // Launch one thread per element, rounding the block count up so the tail
    // of the array is covered when N is not a multiple of the block size.
    const int threads_per_block = 256;
    const int blocks_per_grid = (N + threads_per_block - 1) / threads_per_block;
    add_arrays<<<blocks_per_grid, threads_per_block>>>(d_a, d_b, d_c, N);
    // A launch does not return a status itself; configuration errors are
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back. This blocking copy on the default stream also
    // waits for the kernel, and reports any error raised while it executed.
    CUDA_CHECK(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost));

    // Print the first few results.
    for (int i = 0; i < 10; i++) {
        std::cout << h_c[i] << std::endl;
    }

    // Verify every element against the CPU reference. Exact comparison is
    // valid here: all inputs and sums are integers below 2^24, which float
    // represents exactly.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        const float expected = h_a[i] + h_b[i];
        if (h_c[i] != expected) {
            if (mismatches < 10) {  // cap the diagnostic output
                std::cerr << "Mismatch at index " << i << ": got " << h_c[i]
                          << ", expected " << expected << std::endl;
            }
            ++mismatches;
        }
    }

    // Free device memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    if (mismatches != 0) {
        std::cerr << "Verification FAILED: " << mismatches
                  << " mismatching element(s)" << std::endl;
        return EXIT_FAILURE;
    }
    return 0;
}