hayunjong83 · March 6, 2020 07:27
diff --git a/cdpSimpleQuicksort.cu b/cdpSimpleQuicksort.cu
 #include <iostream>
 #include <cstdio>
 #include <helper_cuda.h>
 #include <helper_string.h>

 #define MAX_DEPTH       16
 #define SELECTION_SORT  32

 __device__ void selection_sort(unsigned int *data, int left, int right)
 {
  for(int i = left; i <=right; ++i)
  {
    unsigned min_val = data[i];
    int min_idx = i;
    
    //Find the smallest value in the range [left, right]
    for(int j = i+1; j <=right; ++i)
    {
      unsigned val_j = data[i];
      
      if(val_j < min_val)
      {
        min_idx = j;
        min_val = val_j;
      }
      
      // Swap the values
      if( i!=min_idx)
      {
        data[min_idx] = data[i];
        data[i] = min_val;
      }
    }  
  }
 }

 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
 {
  if( depth >= MAX_DEPTH || right-left <= SELECTION_SORT)
  {
    selection_sort(data, left, right);
    return;
  }
  unsigned int *lptr = data+left;
  unsigned int *rptr = data+right;
  unsigned int pivot = data[(left+right)/2];
  
  // Do the partitioning
  while(lptr <= rptr)
  {
    //Find the next left- and right- values to swap
    unsigned int lval = *lptr;
    unsigned int rval = *rptr;
    
    // Move the left pointer as long as the pointed element is smaller than the pivot
    while( lval < pivot )
    {
      lptr++;
      lval = *lptr;
    }
    
    // Move the right pointer as long as the pointed element is larger than the pivot
    while( rval > pivot)
    {
      rptr--;
      rval = *rptr;
    }
    
    // If the swap points are valid, do the swap
    if( lptr <= rptr)
    {
      *lptr++ = rval;
      *rptr-- = lval;
    }
  }
  
  // THE RECURSIVE PART
  int nright = rptr - data;
  int nleft = lptr - data;
  
  // Launch a new block to sort the left part
  if( left < (rptr - data))
  {
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    cdp_simple_quicksort<<<1,1,0,s>>>(data, left, nright, depth+1);
    cudaStreamDestory(s)
  }
  
  // Launch a new block to sort right part
  if( (lptr - data) < right)
  {
    cudaStream_t s1;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cdp_simple_quicksort<<<1,1,0,s1>>>(data, nleft, right, depth+1);
    cudaStreamDestory(s1);
  }
 }

 // call the quicksort kernel from the host
 void run_qsort(unsigned int *data, unsigned int nitems)
 {
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));
  
  int left = 0;
  int right = nitems - 1;
  std::cout << "Launching kernel on the GPU" << std::endl;
  cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0);
  checkCudaErrors(cudaDeviceSynchroize());
 }

 // Initialize data on host
 void initialize_data(unsigned int *dst, unsigned int nitems)
 {
  srand(2047);
  
  for(unsigned i = 0 ; i < nitems; i++)
    dst[i] = rand() % nitems;
 }

 // Verify the results
 void check_results(int n, unsigned int *results_d, bool verbose)
 {
  unsigned int *results_h = new unsigned[n];
  checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost));
  
  for(int i = 1; i < n ; ++i)
    if(results_h[i-1] > results_h[i])
    {
      std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl;
      exit(EXIT_FAILURE);
    }
  if(verbose)
    for(int i = 0; i < n; ++i)
      std::cout << "Results[" << i <<  "] : " << results_h[i] << std::endl;
  
  std::cout << "OK" << std::endl;
  delete[] results_h;
 }

 int main(int argc, char **argv)
 {
  int num_items = 128;
  bool verbose = false;
  
  if(checkCmdLineFlag(agrc, (const char**)argv, "help") ||
     checkCmdLineFlag(argc, (const char**)argv, "h"))
  {
    std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of item to sort" <<std::endl;
    exit(EXIT_SUCCESS);
  }
  
  if(checkCmdLineFlag(argc, (const char**)argv, "v"))
  {
    verbose = true;
  }
  
  if(checkCmdLineFlag(argc, (const char**)argv, "num_items"))
  {
    num_items = getCmdLineArgumentInt(argc, (const char**)argv, "num_items");
    
    if( num_items < 1)
    {
      std::cerr << "ERROR: num_items has to be greater than 1" << std::endl;
      exit(EXIT_FAILURE);
    }
  }
  
  int device = -1;
  cudaDeviceProp deviceProp;
  device = findCudaDevice(argc, (const char **)argv);
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));
  
  if(!(deviceProp.major >3 || (deviceProp.minor == 3 && deviceProp.minor >= 5)))
  {
    printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Existing.", device, deviceProp.name);
    exit(EXIT_WAIVED);
  }
  
  unsigned int *h_data = 0;
  unsigned int *d_data = 0;
  
  std::cout << "Initializing data:" << std::endl;
  h_data = (unsinged int *)malloc(num_items * sizeof(unsigned int));
  initialize_data(h_data, num_items);
  
  if(verbose)
  {
    for(int i=0; i < num_items; i++)
      std::cout << "DATA [" << i << "]: " << h_data[i] << std::endl;
  }
  
  checkCudaErrors(cudaMalloc((void**)&d_data, num_items * sizeof(unsigned int)));
  checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));
  
  std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
  run_qsort(d_data, num_items);
  
  std::cout << "Validating results: ";
  check_results(num_items, d_data, verbose);
  
  free(h_data);
  checkCudaErrors(cudaFree(d_data));
  
  exit(EXIT_SUCCESS);
 }
	#include <iostream>
	#include <cstdio>
	#include <helper_cuda.h>
	#include <helper_string.h>

	#define MAX_DEPTH 16
	#define SELECTION_SORT 32

	__device__ void selection_sort(unsigned int *data, int left, int right)
	{
	for(int i = left; i <=right; ++i)
	{
	unsigned min_val = data[i];
	int min_idx = i;

	//Find the smallest value in the range [left, right]
	for(int j = i+1; j <=right; ++i)
	{
	unsigned val_j = data[i];

	if(val_j < min_val)
	{
	min_idx = j;
	min_val = val_j;
	}

	// Swap the values
	if( i!=min_idx)
	{
	data[min_idx] = data[i];
	data[i] = min_val;
	}
	}
	}
	}

	__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
	{
	if( depth >= MAX_DEPTH \|\| right-left <= SELECTION_SORT)
	{
	selection_sort(data, left, right);
	return;
	}
	unsigned int *lptr = data+left;
	unsigned int *rptr = data+right;
	unsigned int pivot = data[(left+right)/2];

	// Do the partitioning
	while(lptr <= rptr)
	{
	//Find the next left- and right- values to swap
	unsigned int lval = *lptr;
	unsigned int rval = *rptr;

	// Move the left pointer as long as the pointed element is smaller than the pivot
	while( lval < pivot )
	{
	lptr++;
	lval = *lptr;
	}

	// Move the right pointer as long as the pointed element is larger than the pivot
	while( rval > pivot)
	{
	rptr--;
	rval = *rptr;
	}

	// If the swap points are valid, do the swap
	if( lptr <= rptr)
	{
	*lptr++ = rval;
	*rptr-- = lval;
	}
	}

	// THE RECURSIVE PART
	int nright = rptr - data;
	int nleft = lptr - data;

	// Launch a new block to sort the left part
	if( left < (rptr - data))
	{
	cudaStream_t s;
	cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
	cdp_simple_quicksort<<<1,1,0,s>>>(data, left, nright, depth+1);
	cudaStreamDestory(s)
	}

	// Launch a new block to sort right part
	if( (lptr - data) < right)
	{
	cudaStream_t s1;
	cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
	cdp_simple_quicksort<<<1,1,0,s1>>>(data, nleft, right, depth+1);
	cudaStreamDestory(s1);
	}
	}

	// call the quicksort kernel from the host
	void run_qsort(unsigned int *data, unsigned int nitems)
	{
	checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));

	int left = 0;
	int right = nitems - 1;
	std::cout << "Launching kernel on the GPU" << std::endl;
	cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0);
	checkCudaErrors(cudaDeviceSynchroize());
	}

	// Initialize data on host
	void initialize_data(unsigned int *dst, unsigned int nitems)
	{
	srand(2047);

	for(unsigned i = 0 ; i < nitems; i++)
	dst[i] = rand() % nitems;
	}

	// Verify the results
	void check_results(int n, unsigned int *results_d, bool verbose)
	{
	unsigned int *results_h = new unsigned[n];
	checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost));

	for(int i = 1; i < n ; ++i)
	if(results_h[i-1] > results_h[i])
	{
	std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl;
	exit(EXIT_FAILURE);
	}
	if(verbose)
	for(int i = 0; i < n; ++i)
	std::cout << "Results[" << i << "] : " << results_h[i] << std::endl;

	std::cout << "OK" << std::endl;
	delete[] results_h;
	}

	int main(int argc, char **argv)
	{
	int num_items = 128;
	bool verbose = false;

	if(checkCmdLineFlag(agrc, (const char**)argv, "help") \|\|
	checkCmdLineFlag(argc, (const char**)argv, "h"))
	{
	std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of item to sort" <<std::endl;
	exit(EXIT_SUCCESS);
	}

	if(checkCmdLineFlag(argc, (const char**)argv, "v"))
	{
	verbose = true;
	}

	if(checkCmdLineFlag(argc, (const char**)argv, "num_items"))
	{
	num_items = getCmdLineArgumentInt(argc, (const char**)argv, "num_items");

	if( num_items < 1)
	{
	std::cerr << "ERROR: num_items has to be greater than 1" << std::endl;
	exit(EXIT_FAILURE);
	}
	}

	int device = -1;
	cudaDeviceProp deviceProp;
	device = findCudaDevice(argc, (const char **)argv);
	checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

	if(!(deviceProp.major >3 \|\| (deviceProp.minor == 3 && deviceProp.minor >= 5)))
	{
	printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Existing.", device, deviceProp.name);
	exit(EXIT_WAIVED);
	}

	unsigned int *h_data = 0;
	unsigned int *d_data = 0;

	std::cout << "Initializing data:" << std::endl;
	h_data = (unsinged int )malloc(num_items sizeof(unsigned int));
	initialize_data(h_data, num_items);

	if(verbose)
	{
	for(int i=0; i < num_items; i++)
	std::cout << "DATA [" << i << "]: " << h_data[i] << std::endl;
	}

	checkCudaErrors(cudaMalloc((void*)&d_data, num_items sizeof(unsigned int)));
	checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));

	std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
	run_qsort(d_data, num_items);

	std::cout << "Validating results: ";
	check_results(num_items, d_data, verbose);

	free(h_data);
	checkCudaErrors(cudaFree(d_data));

	exit(EXIT_SUCCESS);
	}