Last active
March 6, 2020 07:27
-
-
Save hayunjong83/8b04e29285a658311ba9da993c5208c6 to your computer and use it in GitHub Desktop.
quick sort using CUDA dynamic parallelism
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cstdio> | |
#include <helper_cuda.h> | |
#include <helper_string.h> | |
#define MAX_DEPTH 16 | |
#define SELECTION_SORT 32 | |
__device__ void selection_sort(unsigned int *data, int left, int right) | |
{ | |
for(int i = left; i <=right; ++i) | |
{ | |
unsigned min_val = data[i]; | |
int min_idx = i; | |
//Find the smallest value in the range [left, right] | |
for(int j = i+1; j <=right; ++i) | |
{ | |
unsigned val_j = data[i]; | |
if(val_j < min_val) | |
{ | |
min_idx = j; | |
min_val = val_j; | |
} | |
// Swap the values | |
if( i!=min_idx) | |
{ | |
data[min_idx] = data[i]; | |
data[i] = min_val; | |
} | |
} | |
} | |
} | |
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) | |
{ | |
if( depth >= MAX_DEPTH || right-left <= SELECTION_SORT) | |
{ | |
selection_sort(data, left, right); | |
return; | |
} | |
unsigned int *lptr = data+left; | |
unsigned int *rptr = data+right; | |
unsigned int pivot = data[(left+right)/2]; | |
// Do the partitioning | |
while(lptr <= rptr) | |
{ | |
//Find the next left- and right- values to swap | |
unsigned int lval = *lptr; | |
unsigned int rval = *rptr; | |
// Move the left pointer as long as the pointed element is smaller than the pivot | |
while( lval < pivot ) | |
{ | |
lptr++; | |
lval = *lptr; | |
} | |
// Move the right pointer as long as the pointed element is larger than the pivot | |
while( rval > pivot) | |
{ | |
rptr--; | |
rval = *rptr; | |
} | |
// If the swap points are valid, do the swap | |
if( lptr <= rptr) | |
{ | |
*lptr++ = rval; | |
*rptr-- = lval; | |
} | |
} | |
// THE RECURSIVE PART | |
int nright = rptr - data; | |
int nleft = lptr - data; | |
// Launch a new block to sort the left part | |
if( left < (rptr - data)) | |
{ | |
cudaStream_t s; | |
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); | |
cdp_simple_quicksort<<<1,1,0,s>>>(data, left, nright, depth+1); | |
cudaStreamDestory(s) | |
} | |
// Launch a new block to sort right part | |
if( (lptr - data) < right) | |
{ | |
cudaStream_t s1; | |
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); | |
cdp_simple_quicksort<<<1,1,0,s1>>>(data, nleft, right, depth+1); | |
cudaStreamDestory(s1); | |
} | |
} | |
// call the quicksort kernel from the host | |
void run_qsort(unsigned int *data, unsigned int nitems) | |
{ | |
checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH)); | |
int left = 0; | |
int right = nitems - 1; | |
std::cout << "Launching kernel on the GPU" << std::endl; | |
cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0); | |
checkCudaErrors(cudaDeviceSynchroize()); | |
} | |
// Initialize data on host | |
void initialize_data(unsigned int *dst, unsigned int nitems) | |
{ | |
srand(2047); | |
for(unsigned i = 0 ; i < nitems; i++) | |
dst[i] = rand() % nitems; | |
} | |
// Verify the results | |
void check_results(int n, unsigned int *results_d, bool verbose) | |
{ | |
unsigned int *results_h = new unsigned[n]; | |
checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost)); | |
for(int i = 1; i < n ; ++i) | |
if(results_h[i-1] > results_h[i]) | |
{ | |
std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl; | |
exit(EXIT_FAILURE); | |
} | |
if(verbose) | |
for(int i = 0; i < n; ++i) | |
std::cout << "Results[" << i << "] : " << results_h[i] << std::endl; | |
std::cout << "OK" << std::endl; | |
delete[] results_h; | |
} | |
int main(int argc, char **argv) | |
{ | |
int num_items = 128; | |
bool verbose = false; | |
if(checkCmdLineFlag(agrc, (const char**)argv, "help") || | |
checkCmdLineFlag(argc, (const char**)argv, "h")) | |
{ | |
std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of item to sort" <<std::endl; | |
exit(EXIT_SUCCESS); | |
} | |
if(checkCmdLineFlag(argc, (const char**)argv, "v")) | |
{ | |
verbose = true; | |
} | |
if(checkCmdLineFlag(argc, (const char**)argv, "num_items")) | |
{ | |
num_items = getCmdLineArgumentInt(argc, (const char**)argv, "num_items"); | |
if( num_items < 1) | |
{ | |
std::cerr << "ERROR: num_items has to be greater than 1" << std::endl; | |
exit(EXIT_FAILURE); | |
} | |
} | |
int device = -1; | |
cudaDeviceProp deviceProp; | |
device = findCudaDevice(argc, (const char **)argv); | |
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); | |
if(!(deviceProp.major >3 || (deviceProp.minor == 3 && deviceProp.minor >= 5))) | |
{ | |
printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Existing.", device, deviceProp.name); | |
exit(EXIT_WAIVED); | |
} | |
unsigned int *h_data = 0; | |
unsigned int *d_data = 0; | |
std::cout << "Initializing data:" << std::endl; | |
h_data = (unsinged int *)malloc(num_items * sizeof(unsigned int)); | |
initialize_data(h_data, num_items); | |
if(verbose) | |
{ | |
for(int i=0; i < num_items; i++) | |
std::cout << "DATA [" << i << "]: " << h_data[i] << std::endl; | |
} | |
checkCudaErrors(cudaMalloc((void**)&d_data, num_items * sizeof(unsigned int))); | |
checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice)); | |
std::cout << "Running quicksort on " << num_items << " elements" << std::endl; | |
run_qsort(d_data, num_items); | |
std::cout << "Validating results: "; | |
check_results(num_items, d_data, verbose); | |
free(h_data); | |
checkCudaErrors(cudaFree(d_data)); | |
exit(EXIT_SUCCESS); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment