Last active
November 1, 2019 10:11
-
-
Save hayunjong83/98f10d644092b3c5f81f95f4d0e59726 to your computer and use it in GitHub Desktop.
cuda sdk sample code : 0_Simple/asyncAPI.cu
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
/**
 * Adds inc_value to one element of g_data per thread.
 *
 * Each thread computes a flat 1D index from blockIdx.x / blockDim.x /
 * threadIdx.x and updates exactly that element.
 *
 * Precondition: there is no bounds check, so the launch must not create more
 * threads than g_data has elements (gridDim.x * blockDim.x <= element count).
 *
 * @param g_data     array to update in place; must be dereferenceable from device code
 * @param inc_value  amount added to every element touched
 */
__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}
/**
 * Verifies that the first n entries of data all equal x.
 *
 * Scans from index 0 upward. On the first entry that differs from x, prints
 * its index, its value and the reference value, then stops.
 *
 * @param data  host array to inspect
 * @param n     number of entries to check
 * @param x     expected value of every entry
 * @return true if every checked entry equals x (including when n <= 0),
 *         false as soon as a mismatch is found
 */
bool correct_output(int *data, const int n, const int x)
{
    int pos = 0;
    while (pos < n)
    {
        const int observed = data[pos];
        if (observed != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", pos, observed, x);
            return false;
        }
        ++pos;
    }
    return true;
}
/**
 * Issues a host-to-device copy, a kernel launch and a device-to-host copy
 * asynchronously on stream 0, bracketed by two CUDA events, and lets the CPU
 * spin on a counter until the GPU reports the final event as complete.
 *
 * The host buffer starts zeroed and the kernel adds `value` to every element,
 * so after the round trip every element is expected to equal `value`.
 *
 * Exits with EXIT_SUCCESS when the result check passes, EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // device selection is delegated to the helper from helper_cuda.h
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;

    // allocate host memory with cudaMallocHost and zero it
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory; the 0xFF byte fill is overwritten by the
    // host-to-device copy below before the kernel reads it
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration; n is an exact multiple of threads.x,
    // so the grid covers the array with no leftover threads
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0);
    // the host timer only measures the time spent issuing these calls
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    // a kernel launch returns no status itself; query the launch error explicitly
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    checkCudaErrors(cudaEventRecord(stop, 0));
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    // any status other than cudaErrorNotReady ends the loop above; a failure
    // is reported by the checked call below
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent execution by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    sdkDeleteTimer(&timer);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment