#include <stdio.h>
#include <stdlib.h>

/*
 * Device kernel to initialize vector elements. It simply sets
 * every element of the vector to `num`.
 */
__global__
void initWith(float num, float *a, int N)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;

  if (i < N)
  {
    a[i] = num;
  }
}

/*
 * Device kernel that stores into `result` the sum of each
 * same-indexed value of `a` and `b`. The grid-stride loop lets
 * any grid size cover all N elements.
 */
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/*
 * Host function to confirm values in `vector`. This function
 * assumes all values are the same `target` value.
 */
void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  const int N = 2<<24;              // 2^25 elements, ~128 MiB per vector
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  // Unified (managed) memory is accessible from both host and device;
  // the CUDA runtime migrates pages on demand.
  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);
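
  // Launch the initialization kernels with enough blocks to cover all N elements.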
  int t_n = 32;
  int b_n = (N + t_n - 1) / t_n;    // round up

  initWith<<<b_n, t_n>>>(3, a, N);
  initWith<<<b_n, t_n>>>(4, b, N);
  initWith<<<b_n, t_n>>>(0, c, N);

  cudaError_t initErr = cudaDeviceSynchronize();
  if(initErr != cudaSuccess) printf("Init Error: %s\n", cudaGetErrorString(initErr));

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  /*
   * nvprof should register performance changes when execution configuration
   * is updated.
   */
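  /*
   * For example (assuming the source is saved as vector-add.cu):
   *   nvcc -o vector-add vector-add.cu
   *   nvprof ./vector-add
   */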

  // Query the device; its properties can inform the execution configuration.
  int deviceId;
  cudaGetDevice(&deviceId);

  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, deviceId);

  threadsPerBlock = 128;
  // Oversized grid (roughly warpSize threads per element); the grid-stride
  // loop in addVectorsInto tolerates any grid size.
  numberOfBlocks = N / threadsPerBlock * props.warpSize;

  cudaError_t addVectorsErr;
  cudaError_t asyncErr;
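
  // Prefetch the managed allocations to the GPU so the kernel does not pay
  // for on-demand page migrations.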
  cudaMemPrefetchAsync(c, size, deviceId);
  cudaMemPrefetchAsync(a, size, deviceId);
  cudaMemPrefetchAsync(b, size, deviceId);

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  addVectorsErr = cudaGetLastError();
  if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
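
  // Prefetch the result back to host memory before verifying it on the CPU.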
  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

  checkElementsAre(7, c, N);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}