Matrix sum in CUDA
#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Example from "Introduction to CUDA C" from NVIDIA website:
   https://developer.nvidia.com/cuda-education
   Compile with:
   $ nvcc example_intro.cu */

const int N = 16384;
const int THREADS_PER_BLOCK = 512;
/* Running one thread in each block (slides 24-32) */
__global__ void add_blocks (int *a, int *b, int *c) {
    /* blockIdx.x gives each block ID */
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
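/* For illustration (not in the original slides): with a launch such as
   add_blocks<<<N,1>>>(...), block 7 runs a single thread whose
   blockIdx.x is 7, so it computes c[7] = a[7] + b[7]. */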
/* Running multiple threads in one block (slides 33-36) */
__global__ void add_threads (int *a, int *b, int *c) {
    /* threadIdx.x gives the thread ID in each block */
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
/* Running multiple threads in multiple blocks (slides 37-45).
   While this may seem unnecessary, in some cases we need threads
   because they have communication (__shared__ variables) and
   synchronization (__syncthreads()) mechanisms; the block_sum
   sketch after this kernel illustrates both. */
__global__ void add_threads_blocks (int *a, int *b, int *c, int n) {
    /* blockDim.x gives the number of threads per block; combining it
       with threadIdx.x and blockIdx.x gives the global index of each
       thread on the device */
    int index = threadIdx.x + blockIdx.x * blockDim.x;
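    /* For example (assuming THREADS_PER_BLOCK == 512): thread 3 of
       block 2 gets index 3 + 2 * 512 == 1027 */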
    /* Typical problems are not friendly multiples of blockDim.x.
       Avoid accessing data beyond the end of the arrays */
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}
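/* Illustrative sketch only, not called from main(): a minimal example of
   the block-level cooperation mentioned above, assuming a launch with
   THREADS_PER_BLOCK threads per block and an input length that is a
   multiple of THREADS_PER_BLOCK. The kernel name and parameters are made
   up for this example. Each block sums its slice of `in` into one entry
   of `out` using __shared__ memory and __syncthreads(). */
__global__ void block_sum (int *in, int *out) {
    __shared__ int cache[THREADS_PER_BLOCK]; /* shared by all threads in the block */
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    cache[threadIdx.x] = in[i];
    __syncthreads(); /* wait until every thread has stored its element */
    /* Tree reduction: halve the number of active threads at each step */
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) {
            cache[threadIdx.x] += cache[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        out[blockIdx.x] = cache[0]; /* one partial sum per block */
    }
}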
int main(void) {
    int *a, *b, *c;       /* Host (CPU) copies of a, b, c */
    int *d_a, *d_b, *d_c; /* Device (GPU) copies of a, b, c */
    size_t size = N * sizeof(int);

    srand(1);

    /* Allocate memory on the device */
    cudaMalloc((void **) &d_a, size);
    cudaMalloc((void **) &d_b, size);
    cudaMalloc((void **) &d_c, size);

    /* Allocate memory on the host */
    a = (int *) malloc(size);
    b = (int *) malloc(size);
    c = (int *) malloc(size);
    /* Fill vectors a and b with random data (on the host) */
    for (int i = 0; i < N; ++i) {
        a[i] = rand();
        b[i] = rand();
    }
    /* Copy data to the device */
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    /* Launch the add_blocks() kernel on the device with N blocks of one
       thread each */
    add_blocks<<<N,1>>>(d_a, d_b, d_c);
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    /* Check if everything is alright */
    for (int i = 0; i < N; ++i) {
        assert(c[i] == a[i] + b[i]);
    }
    printf("Version with %d blocks executed successfully!\n", N);
    /* Launch the add_threads() kernel on the device with a single block.
       A single block cannot hold more than 1024 threads (512 on older
       GPUs), so we launch THREADS_PER_BLOCK threads instead of N */
    add_threads<<<1,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    /* Check if everything is alright (only the first THREADS_PER_BLOCK
       elements are computed by this launch) */
    for (int i = 0; i < THREADS_PER_BLOCK; ++i) {
        assert(c[i] == a[i] + b[i]);
    }
    printf("Version with %d threads executed successfully!\n", THREADS_PER_BLOCK);
    /* Launch the add_threads_blocks() kernel on the device with enough
       blocks of THREADS_PER_BLOCK threads to cover all N elements */
    add_threads_blocks<<<(N + (THREADS_PER_BLOCK - 1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
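    /* The division above rounds up: with N == 16384 and 512 threads per
       block it launches 32 full blocks; with N == 16385 it would launch
       33, and the bounds check inside the kernel keeps the extra threads
       from writing past the end of the arrays */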
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    /* Check if everything is alright */
    for (int i = 0; i < N; ++i) {
        assert(c[i] == a[i] + b[i]);
    }
    printf("Version with %d threads in multiple blocks executed successfully!\n", N);
    /* Clean-up */
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
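None of the CUDA calls above are error-checked, so a failed allocation or kernel launch would go unnoticed and the asserts could end up validating stale data. Below is a minimal sketch of how a launch could be checked using the standard cudaGetLastError(), cudaDeviceSynchronize() and cudaGetErrorString() calls; the helper name check_last_cuda_call is made up for this example.

/* Call right after a kernel launch: reports both launch-configuration
   errors and errors raised during kernel execution, then aborts */
static void check_last_cuda_call (const char *label) {
    cudaError_t err = cudaGetLastError(); /* launch-time errors */
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize(); /* execution-time errors */
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", label, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

For example, calling check_last_cuda_call("add_blocks") right after the add_blocks<<<N,1>>>(d_a, d_b, d_c) launch would stop the program with a message instead of silently checking stale results.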