Created
March 5, 2020 07:59
-
-
Save hayunjong83/332fd88438dc3628ea2d737ea672cbf1 to your computer and use it in GitHub Desktop.
CUDA dynamic parallelism example 1) cdpSimplePrint
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cstdio> | |
#include <cstdlib> | |
#include <helper_cuda.h> | |
#include <helper_string.h> | |
// Device-global counter used to hand out block ids: cdp_kernel has thread 0 of
// every block atomically increment it and uses the previous value as that
// block's unique id.
__device__ int g_uids = 0;
/**
 * Prints one line describing how the calling block was launched.
 *
 * Only thread 0 of the block prints; every thread of the block must call this
 * function because it ends with a block-wide __syncthreads() barrier.
 *
 * @param depth       Nesting level of the block. 0 means "launched by the
 *                    host"; otherwise the line is prefixed with one "|  "
 *                    marker per level.
 * @param thread      Index of the parent thread that launched this block.
 * @param uid         Unique id of this block.
 * @param parent_uid  Unique id of the block that launched this one.
 */
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        if (depth == 0)
        {
            printf("BLOCK %d launched by the host\n", uid);
        }
        else
        {
            // Each level costs 3 characters and one byte is reserved for the
            // terminating '\0', so the buffer holds at most 10 levels.
            const int kBufferSize = 32;
            const int kCharsPerLevel = 3;
            const int kMaxLevels = (kBufferSize - 1) / kCharsPerLevel;

            // Clamp so that an out-of-range depth can never write outside the
            // buffer (deeper levels simply share the maximum indentation).
            int levels = depth;
            if (levels < 0)
            {
                levels = 0;
            }
            if (levels > kMaxLevels)
            {
                levels = kMaxLevels;
            }

            char buffer[kBufferSize];
            for (int i = 0; i < levels; ++i)
            {
                buffer[kCharsPerLevel * i + 0] = '|';
                buffer[kCharsPerLevel * i + 1] = ' ';
                buffer[kCharsPerLevel * i + 2] = ' ';
            }
            buffer[kCharsPerLevel * levels] = '\0';

            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    // Reached by all threads of the block, outside the thread-0 branch.
    __syncthreads();
}
/**
 * Self-launching kernel: every block reports itself, then each of its threads
 * launches a child grid with the same grid/block dimensions until the next
 * nesting level would reach max_depth.
 *
 * @param max_depth   Number of nesting levels to produce (levels 0 .. max_depth-1).
 * @param depth       Nesting level of this grid (0 for the host launch).
 * @param thread      Index of the parent thread that launched this grid.
 * @param parent_uid  Unique id of the parent block.
 */
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // One id per block, obtained by thread 0 and shared through shared memory.
    __shared__ int s_uid;

    const bool is_first_thread = (threadIdx.x == 0);
    if (is_first_thread)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }

    // Make s_uid visible to all threads of the block before it is read.
    __syncthreads();

    print_info(depth, thread, s_uid, parent_uid);

    // Launch children only while the next level is still below max_depth.
    const int child_depth = depth + 1;
    if (child_depth < max_depth)
    {
        cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_uid);
    }
}
/**
 * Host entry point of the dynamic-parallelism print sample.
 *
 * Parses the optional "depth" command-line argument (1..8, default 2), checks
 * that the selected GPU has compute capability 3.5 or higher, prints how many
 * blocks will be launched in total, and then launches cdp_kernel with 2 blocks
 * of 2 threads.
 *
 * Exits with EXIT_SUCCESS on success or after printing help, EXIT_FAILURE on
 * an invalid depth, and EXIT_WAIVED when the GPU is below compute
 * capability 3.5.
 */
int main(int argc, char **argv)
{
    printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

    // Default nesting depth when no depth argument is given.
    int max_depth = 2;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

        // Accept exactly the documented range 1..8.
        if (max_depth < 1 || max_depth > 8)
        {
            printf("depth parameter has to be between 1 and 8\n");
            exit(EXIT_FAILURE);
        }
    }

    int device = -1;
    cudaDeviceProp deviceProp;
    device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

    // The sample requires compute capability 3.5 or higher.
    if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
    {
        printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
        exit(EXIT_WAIVED);
    }

    printf("***************************************************************************\n");
    printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
    printf("launch 2 blocks of 2 threads each. The GPU will do that recursively\n");
    printf("until it reaches max_depth=%d\n\n", max_depth);

    // Every block has 2 threads and every thread launches 2 blocks, so each
    // nesting level has 4 times as many blocks as the previous one.
    printf("In total 2");
    int num_blocks = 2, sum = 2;
    for (int i = 1; i < max_depth; ++i)
    {
        num_blocks *= 4;
        printf("+%d", num_blocks);
        sum += num_blocks;
    }
    printf("=%d blocks are launched!!!! (%d from the GPU)\n", sum, sum - 2);
    printf("*****************************************************************************\n\n");

    // Request a device-runtime sync depth matching max_depth. The kernel in
    // this file never synchronizes on its children, so a rejected limit is
    // reported but not treated as fatal.
    // NOTE(review): newer toolkits may reject this limit - confirm for the
    // CUDA version in use.
    cudaError_t limit_status = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
    if (limit_status != cudaSuccess)
    {
        fprintf(stderr, "Warning: cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth) failed: %s\n",
                cudaGetErrorString(limit_status));
        // Clear the recorded error so the launch check below reports only
        // errors caused by the kernel launch itself.
        cudaGetLastError();
    }

    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
    cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
    checkCudaErrors(cudaGetLastError());        // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());   // errors raised while the kernels run

    exit(EXIT_SUCCESS);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment