hayunjong83 · March 5, 2020 07:59
diff --git a/cdpSimplePrint.cu b/cdpSimplePrint.cu
 #include <iostream>
 #include <cstdio>
 #include <cstdlib>
 #include <helper_cuda.h>
 #include <helper_string.h>

 // Device-global counter: thread 0 of every cdp_kernel block atomically
 // increments it to obtain that block's unique ID.
 __device__ int g_uids = 0;

 // Prints one line describing the calling block. Only thread 0 of the block
 // prints; afterwards every thread of the block meets at a block-wide barrier.
 //   depth      - nesting level of this block (0 means launched by the host)
 //   thread     - index of the parent thread that launched this block
 //   uid        - unique ID of this block
 //   parent_uid - unique ID of the parent block
 __device__ void print_info(int depth, int thread, int uid, int parent_uid)
 {
   const bool is_printer = (threadIdx.x == 0);

   if (is_printer && depth == 0)
   {
     printf("BLOCK %d launched by the host\n", uid);
   }
   else if (is_printer)
   {
     // Indentation prefix: the three characters "|  " once per nesting level.
     char prefix[32];
     const int prefix_len = 3 * depth;

     for (int pos = 0; pos < prefix_len; pos += 3)
     {
       prefix[pos] = '|';
       prefix[pos + 1] = ' ';
       prefix[pos + 2] = ' ';
     }

     prefix[prefix_len] = '\0';
     printf("%sBLOCK %d launched by thread %d of block %d\n", prefix, uid, thread, parent_uid);
   }

   // Reached by all threads of the block, printing or not.
   __syncthreads();
 }

 // Recursive kernel: each block obtains a unique ID, reports itself through
 // print_info(), and, while the next nesting level is still below max_depth,
 // every thread launches a child grid with the same grid/block dimensions.
 //   max_depth  - nesting level at which no further child grids are launched
 //   depth      - nesting level of this grid
 //   thread     - index of the parent thread that launched this grid
 //   parent_uid - unique ID of the parent block
 __global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
 {
   // Unique ID of this block; written by thread 0, read by all threads.
   __shared__ int s_uid;

   if (0 == threadIdx.x)
     s_uid = atomicAdd(&g_uids, 1);

   // Barrier between thread 0's write of s_uid and the reads below.
   __syncthreads();

   print_info(depth, thread, s_uid, parent_uid);

   const int child_depth = depth + 1;

   if (child_depth < max_depth)
   {
     // Not guarded by threadIdx, so each thread launches its own child grid.
     cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_uid);
   }
 }


 // Host entry point. Parses the optional "depth" argument, checks that the
 // selected GPU supports CUDA Dynamic Parallelism (compute capability >= 3.5),
 // prints how many blocks will be launched, then launches cdp_kernel with
 // 2 blocks of 2 threads and waits for the whole launch tree to finish.
 // Command line:
 //   help / h          - print usage and exit successfully
 //   depth=<max_depth> - recursion depth, between 1 and 8 (default 2)
 // Exits with EXIT_FAILURE on an out-of-range depth and EXIT_WAIVED when the
 // device lacks Dynamic Parallelism support.
 int main(int argc, char **argv)
 {
   printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

   int max_depth = 2;

   if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
       checkCmdLineFlag(argc, (const char **)argv, "h"))
   {
     printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
     exit(EXIT_SUCCESS);
   }

   if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
   {
     max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

     // Accept only the range advertised in the usage text (1..8).
     if (max_depth < 1 || max_depth > 8)
     {
       printf("depth parameter has to be between 1 and 8\n");
       exit(EXIT_FAILURE);
     }
   }

   cudaDeviceProp deviceProp;
   int device = findCudaDevice(argc, (const char **)argv);
   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

   // Dynamic Parallelism requires compute capability 3.5 or higher.
   if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
   {
     printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
     exit(EXIT_WAIVED);
   }

   printf("***************************************************************************\n");
   printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
   printf("launch 2 blocks of 2 threads each. The GPU will do that recursively\n");
   printf("until it reaches max_depth=%d\n\n", max_depth);
   printf("In total 2");
   int num_blocks = 2, sum = 2;

   // Each block has 2 threads and each thread launches 2 blocks, so the
   // number of blocks grows by a factor of 4 per nesting level.
   for (int i = 1; i < max_depth; ++i)
   {
     num_blocks *= 4;
     printf("+%d", num_blocks);
     sum += num_blocks;
   }

   printf("=%d blocks are launched!!!! (%d from the GPU)\n", sum, sum-2);
   printf("*****************************************************************************\n\n");

   // Best-effort: a failure here is reported but not fatal. The error is
   // cleared so the post-launch cudaGetLastError() check below cannot
   // misattribute it to the kernel launch.
   // NOTE(review): cudaLimitDevRuntimeSyncDepth is deprecated in recent CUDA
   // toolkits - confirm whether this call is still wanted.
   cudaError_t limit_status = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
   if (limit_status != cudaSuccess)
   {
     fprintf(stderr, "Warning: cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth) failed: %s\n",
             cudaGetErrorString(limit_status));
     (void)cudaGetLastError();
   }

   printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
   cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
   // Launch-configuration errors surface here ...
   checkCudaErrors(cudaGetLastError());

   // ... and execution errors surface at this synchronization.
   checkCudaErrors(cudaDeviceSynchronize());

   exit(EXIT_SUCCESS);
 }
	#include <iostream>
	#include <cstdio>
	#include <cstdlib>
	#include <helper_cuda.h>
	#include <helper_string.h>

	// Device-global counter: thread 0 of every cdp_kernel block atomically
	// increments it to obtain that block's unique ID.
	__device__ int g_uids = 0;

	// Prints one line describing the calling block. Only thread 0 of the block
	// prints; afterwards every thread of the block meets at a block-wide barrier.
	//   depth      - nesting level of this block (0 means launched by the host)
	//   thread     - index of the parent thread that launched this block
	//   uid        - unique ID of this block
	//   parent_uid - unique ID of the parent block
	__device__ void print_info(int depth, int thread, int uid, int parent_uid)
	{
	  if (threadIdx.x == 0)
	  {
	    if (depth == 0)
	    {
	      printf("BLOCK %d launched by the host\n", uid);
	    }
	    else
	    {
	      // Indentation prefix: the three characters "|  " once per nesting level.
	      char buffer[32];

	      for (int i = 0; i < depth; ++i)
	      {
	        buffer[3*i+0] = '|';  // plain pipe; the escaped form was a paste artifact
	        buffer[3*i+1] = ' ';
	        buffer[3*i+2] = ' ';
	      }

	      buffer[3*depth] = '\0';
	      printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
	    }
	  }

	  // Reached by all threads of the block, printing or not.
	  __syncthreads();
	}

	// Recursive kernel: each block obtains a unique ID, reports itself through
	// print_info(), and, while the next nesting level is still below max_depth,
	// every thread launches a child grid with the same grid/block dimensions.
	//   max_depth  - nesting level at which no further child grids are launched
	//   depth      - nesting level of this grid
	//   thread     - index of the parent thread that launched this grid
	//   parent_uid - unique ID of the parent block
	__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
	{
	  // Unique ID of this block; written by thread 0, read by all threads.
	  __shared__ int s_uid;

	  if (0 == threadIdx.x)
	    s_uid = atomicAdd(&g_uids, 1);

	  // Barrier between thread 0's write of s_uid and the reads below.
	  __syncthreads();

	  print_info(depth, thread, s_uid, parent_uid);

	  const int child_depth = depth + 1;

	  if (child_depth < max_depth)
	  {
	    // Not guarded by threadIdx, so each thread launches its own child grid.
	    cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_uid);
	  }
	}


	// Host entry point. Parses the optional "depth" argument, checks that the
	// selected GPU supports CUDA Dynamic Parallelism (compute capability >= 3.5),
	// prints how many blocks will be launched, then launches cdp_kernel with
	// 2 blocks of 2 threads and waits for the whole launch tree to finish.
	// Command line:
	//   help / h          - print usage and exit successfully
	//   depth=<max_depth> - recursion depth, between 1 and 8 (default 2)
	// Exits with EXIT_FAILURE on an out-of-range depth and EXIT_WAIVED when the
	// device lacks Dynamic Parallelism support.
	int main(int argc, char **argv)
	{
	  printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

	  int max_depth = 2;

	  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
	      checkCmdLineFlag(argc, (const char **)argv, "h"))
	  {
	    printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
	    exit(EXIT_SUCCESS);
	  }

	  if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
	  {
	    max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

	    // Accept only the range advertised in the usage text (1..8).
	    if (max_depth < 1 || max_depth > 8)
	    {
	      printf("depth parameter has to be between 1 and 8\n");
	      exit(EXIT_FAILURE);
	    }
	  }

	  cudaDeviceProp deviceProp;
	  int device = findCudaDevice(argc, (const char **)argv);
	  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

	  // Dynamic Parallelism requires compute capability 3.5 or higher.
	  if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
	  {
	    printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
	    exit(EXIT_WAIVED);
	  }

	  printf("***************************************************************************\n");
	  printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
	  printf("launch 2 blocks of 2 threads each. The GPU will do that recursively\n");
	  printf("until it reaches max_depth=%d\n\n", max_depth);
	  printf("In total 2");
	  int num_blocks = 2, sum = 2;

	  // Each block has 2 threads and each thread launches 2 blocks, so the
	  // number of blocks grows by a factor of 4 per nesting level.
	  for (int i = 1; i < max_depth; ++i)
	  {
	    num_blocks *= 4;
	    printf("+%d", num_blocks);
	    sum += num_blocks;
	  }

	  printf("=%d blocks are launched!!!! (%d from the GPU)\n", sum, sum-2);
	  printf("*****************************************************************************\n\n");

	  // Best-effort: a failure here is reported but not fatal. The error is
	  // cleared so the post-launch cudaGetLastError() check below cannot
	  // misattribute it to the kernel launch.
	  // NOTE(review): cudaLimitDevRuntimeSyncDepth is deprecated in recent CUDA
	  // toolkits - confirm whether this call is still wanted.
	  cudaError_t limit_status = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
	  if (limit_status != cudaSuccess)
	  {
	    fprintf(stderr, "Warning: cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth) failed: %s\n",
	            cudaGetErrorString(limit_status));
	    (void)cudaGetLastError();
	  }

	  printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
	  cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
	  // Launch-configuration errors surface here ...
	  checkCudaErrors(cudaGetLastError());

	  // ... and execution errors surface at this synchronization.
	  checkCudaErrors(cudaDeviceSynchronize());

	  exit(EXIT_SUCCESS);
	}