A minimal CUDA program that naively performs vector addition, component-wise!
// Compile and run with:
// nvcc add1.cu -o add1 && ./add1
// This minimal CUDA program performs vector addition, component-wise!
#include <stdio.h>
typedef unsigned int uint;  // `uint` isn't standard C/C++; this typedef keeps the program self-contained!
#define N_ELEMENTS 8
// ------------------------------------------------------------------------------------------------
// __global__ functions run on the device, so their arguments must be pointers to device memory!
__global__ void cuda_vec_add(uint* a, uint* b, uint* c){
  uint i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < N_ELEMENTS)  // Bounds guard, in case more threads are launched than there are elements!
    c[i] = a[i] + b[i];
}
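// Worked example of the index above: this program launches the kernel as <<<1, N_ELEMENTS>>>,
// i.e. one block of 8 threads, so blockIdx.x is always 0, blockDim.x is 8, and threadIdx.x
// runs over 0..7, giving each thread a unique i in 0..7: one thread per vector component!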
// ------------------------------------------------------------------------------------------------
__host__ int main(){
  puts("MSG This minimal CUDA program performs vector addition, component-wise!");
  // ----------------------------
  uint *cpu_a, *cpu_b, *cpu_c;  // Host copies of a, b, c!
  uint *gpu_a, *gpu_b, *gpu_c;  // Device copies of a, b, c!
  // ----------------------------
  puts("MALLOC Data on host!");
  cpu_a = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  cpu_b = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  cpu_c = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  puts("RUN cudaMalloc() Allocate memory for device copies of a, b, c!");
  cudaMalloc((void**)&gpu_a, N_ELEMENTS * sizeof(uint));  // We pass &gpu_a, not gpu_a: cudaMalloc() writes the device address INTO our pointer, so it needs the pointer's address!
  cudaMalloc((void**)&gpu_b, N_ELEMENTS * sizeof(uint));
  cudaMalloc((void**)&gpu_c, N_ELEMENTS * sizeof(uint));
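  // A minimal sanity-check sketch (optional): every CUDA runtime call returns a cudaError_t,
  // so a failed allocation can be caught right away!
  if(cudaGetLastError() != cudaSuccess)
    puts("ERROR A cudaMalloc() call failed!");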
  // ----------------------------
  puts("INIT Data on host!");
  for(uint i=0; i<N_ELEMENTS; ++i){
    cpu_a[i] = i;
    cpu_b[i] = 2 * i;
  }
  puts("RUN cudaMemcpy() Copy data from host to device!");
  cudaMemcpy(gpu_a, cpu_a, N_ELEMENTS * sizeof(uint), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_b, cpu_b, N_ELEMENTS * sizeof(uint), cudaMemcpyHostToDevice);
  // ----------------------------
  puts("RUN cuda_vec_add() Launch CUDA kernel on the device!");
  cuda_vec_add<<<1, N_ELEMENTS>>>(gpu_a, gpu_b, gpu_c);
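  // The <<<1, N_ELEMENTS>>> launch configuration means: 1 block of N_ELEMENTS threads!
  // In general, <<<n_blocks, threads_per_block>>> launches n_blocks * threads_per_block threads;
  // 8 elements fit easily in a single block (current GPUs allow up to 1024 threads per block)!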
puts("RUN cudaMemcpy() Copy results back to the host"); | |
cudaMemcpy(cpu_c, gpu_c, N_ELEMENTS * sizeof(uint), cudaMemcpyDeviceToHost); // Store resulting matrix `gpu_c` (GPU-side) in `cpu_c` (CPU-side) | |
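  // No explicit cudaDeviceSynchronize() is needed here: the kernel and this cudaMemcpy() run
  // on the default stream, which serializes them, and cudaMemcpy() blocks the host until the
  // copy (and therefore the kernel) has finished!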
  // ----------------------------
  puts("\nSHOW Data on host (regardless of where it was computed)!");
  printf("cpu_a\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_a[i]);
  puts("");
  printf("cpu_b\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_b[i]);
  puts("");
  printf("cpu_c (from gpu_c, computed on GPU)\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_c[i]);
  puts("");
  // ----------------------------
  puts("\nRUN cudaFree() Free device memory");
  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);
  puts("RUN free() Free host memory");
  free(cpu_a);
  free(cpu_b);
  free(cpu_c);
  // ----------------------------
puts("\nExit success!"); | |
} |
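For vectors with more elements than a single block allows (1024 threads per block on current GPUs), the standard pattern is to launch enough blocks to cover the whole vector and let each thread guard its own index. Below is a minimal sketch of that pattern, assuming a hypothetical kernel variant `cuda_vec_add_n` that takes the element count `n` as an argument; the name and the choice of 256 threads per block are illustrative, not part of the gist above.

// Kernel variant that takes the element count as an argument (hypothetical name)!
__global__ void cuda_vec_add_n(uint* a, uint* b, uint* c, uint n){
  uint i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n)  // The last block may be partially full, so each thread guards its own index!
    c[i] = a[i] + b[i];
}

// Launch with enough blocks to cover all n elements, rounding up!
uint threads_per_block = 256;  // A common starting point; tune per GPU!
uint n_blocks = (n + threads_per_block - 1) / threads_per_block;
cuda_vec_add_n<<<n_blocks, threads_per_block>>>(gpu_a, gpu_b, gpu_c, n);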