Last active
April 24, 2020 18:58
-
-
Save AymenFJA/e6814a7ee84a239681673b559c38cbed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <math.h> | |
// CUDA kernel: element-wise vector addition, c[i] = a[i] + b[i] for 0 <= i < n.
//
// Parameters:
//   a, b - input vectors, each holding at least n doubles
//   c    - output vector, holding at least n doubles
//   n    - number of elements to process; n <= 0 is a no-op
//
// Launch layout: 1D only (just the .x components of the grid and block are
// used). Threads walk the arrays in a grid-stride loop, so every element is
// computed for any grid size; when the grid has at least n threads, each
// thread handles at most one element.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Reject empty/negative lengths up front; this also keeps the size_t
    // conversion below from turning a negative n into a huge count.
    if (n <= 0)
        return;

    const size_t count = (size_t)n;

    // Do the index math in size_t: the 32-bit product blockIdx.x*blockDim.x
    // stored in an int can wrap negative on very large launches and slip
    // past the bounds check.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t id = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         id < count;
         id += stride)
    {
        c[id] = a[id] + b[id];
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status - return code of the CUDA call being checked
//   what   - short description of the operation, used in the error message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two vectors of n doubles on the GPU and prints the mean of the result.
// The inputs are sin^2(i) and cos^2(i), so each output element is 1 up to
// rounding and the printed mean should equal 1 within error.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main( int argc, char* argv[] )
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    // Size of vectors
    const int n = 100000;

    // Size, in bytes, of each vector
    const size_t bytes = (size_t)n * sizeof(double);

    // Allocate host input vectors (h_a, h_b) and host output vector (h_c)
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Allocate device input vectors (d_a, d_b) and device output vector (d_c)
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

    // Initialize vectors on host
    for (int i = 0; i < n; i++) {
        const double s = sin((double)i);
        const double k = cos((double)i);
        h_a[i] = s*s;
        h_b[i] = k*k;
    }

    // Copy host vectors to device
    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy h_a to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy h_b to device");

    // Number of threads in each thread block
    const int blockSize = 1024;

    // Number of thread blocks in grid: integer ceiling division, which stays
    // exact for any n (a float-based ceil loses precision for large n)
    const int gridSize = (n + blockSize - 1) / blockSize;

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    // A launch does not return a status itself: a bad configuration shows up
    // in cudaGetLastError(), a fault during execution at the next sync.
    checkCuda(cudaGetLastError(), "vecAdd launch");
    checkCuda(cudaDeviceSynchronize(), "vecAdd execution");

    // Copy array back to host
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy d_c to host");

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment