Created
June 17, 2021 22:27
-
-
Save Lokno/8f3bcd3b53921585b43be964943057cb to your computer and use it in GitHub Desktop.
Generates a buffer of Gaussian noise using cuRand and writes it to a binary file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdint.h> | |
| #include <assert.h> | |
| #include <random> | |
| #include <limits> | |
| #ifdef __unix__ | |
| #include <pthread.h> | |
| #endif | |
| #include <curand_kernel.h> | |
| #define CUDA_CHECK(err) if( (err) != cudaSuccess ) { fprintf(stderr, "ERROR %d: %s in %s at line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); exit(EXIT_FAILURE); } | |
| typedef struct | |
| { | |
| int minGridSize; | |
| int gridSize; | |
| int blockSize; | |
| int numBlocks; | |
| }cuda_params; | |
| typedef unsigned long long u64; | |
| typedef struct | |
| { | |
| size_t n; | |
| size_t seed; | |
| int device_id; | |
| size_t seed_bytes; | |
| size_t buffer_bytes; | |
| u64* seeds_host; | |
| u64* seeds_device; | |
| float* buffer_host; | |
| float* buffer_device; | |
| curandStatePhilox4_32_10_t* devPHILOXStates; | |
| cudaDeviceProp deviceProp; | |
| std::default_random_engine* generator; | |
| std::uniform_int_distribution<int>* dist; | |
| }curand_test_data; | |
| void str_add_int_suffix(char* dst, const char* src, int suffix, const char* extension ) | |
| { | |
| int slen = strlen(src); | |
| int elen = strlen(extension) + 1; | |
| if( slen < elen || strcmp(src + (slen - elen + 1), extension) != 0 ) | |
| { | |
| sprintf(dst,"%s_%d.%s",src, suffix, extension ); | |
| } | |
| else | |
| { | |
| strncpy(dst, src, slen-elen); | |
| sprintf(dst+slen-elen,"_%d.%s", suffix, extension ); | |
| } | |
| } | |
| void write_bytes(const char* filename, void* ptr, size_t size, size_t count) | |
| { | |
| FILE* pfile = fopen(filename,"wb"); | |
| if( pfile != NULL ) | |
| { | |
| fwrite(ptr, size, count, pfile); | |
| fclose(pfile); | |
| } | |
| } | |
| void set_cuda_seeds(curand_test_data* rsd, unsigned long long* seeds, size_t offset, size_t n) | |
| { | |
| for( size_t i = 0; i < offset; i++ ) | |
| { | |
| int x = (*rsd->dist)(*rsd->generator); | |
| } | |
| for( size_t i = 0; i < n; i++ ) | |
| { | |
| seeds[i] = (unsigned long long)(*rsd->dist)(*rsd->generator); | |
| } | |
| } | |
| __global__ void setup_kernel(curandStatePhilox4_32_10_t * state, unsigned long long* seeds, int offset, int n) | |
| { | |
| int i = blockDim.x * blockIdx.x + threadIdx.x; | |
| if( i < n ) | |
| { | |
| curand_init(seeds[i], offset+i, 0, &state[i]); | |
| } | |
| } | |
| __global__ void generate_cuda(float* __restrict__ buffer, curandStatePhilox4_32_10_t* __restrict__ state, int n) | |
| { | |
| int i = blockDim.x * blockIdx.x + threadIdx.x; | |
| if( i < n ) | |
| { | |
| int j = i*4; | |
| curandStatePhilox4_32_10_t localState = state[i]; | |
| float4 gaussdev = curand_normal4(&localState); | |
| buffer[j+0] = gaussdev.x; | |
| buffer[j+1] = gaussdev.y; | |
| buffer[j+2] = gaussdev.z; | |
| buffer[j+3] = gaussdev.w; | |
| } | |
| } | |
| void init_curand_test_data(curand_test_data* rsd, size_t n, size_t seed, int device_id ) | |
| { | |
| rsd->generator = new std::default_random_engine(seed); | |
| rsd->dist = new std::uniform_int_distribution<int>(0, std::numeric_limits<int>::max()); | |
| rsd->seed = seed; | |
| rsd->n = n; | |
| rsd->device_id = device_id; | |
| printf("Generating %zu values on device %d with seed %zu\n", n, rsd->device_id, seed); | |
| CUDA_CHECK(cudaSetDevice(rsd->device_id)) | |
| CUDA_CHECK(cudaMalloc((void **)&rsd->devPHILOXStates, sizeof(curandStatePhilox4_32_10_t)*rsd->n)) | |
| rsd->seed_bytes = sizeof(u64)*n; | |
| CUDA_CHECK(cudaMalloc((void **)&rsd->seeds_device, rsd->seed_bytes)) | |
| rsd->seeds_host = (u64*)malloc(rsd->seed_bytes); | |
| rsd->buffer_bytes = sizeof(float)*n*4; | |
| CUDA_CHECK(cudaMalloc((void **)&rsd->buffer_device, rsd->buffer_bytes)) | |
| rsd->buffer_host = (float*)malloc( rsd->buffer_bytes); | |
| } | |
| void curand_test_data_cleanup(curand_test_data* rsd) | |
| { | |
| if(rsd->seeds_host != NULL) free(rsd->seeds_host ); | |
| if(rsd->seeds_device != NULL) CUDA_CHECK(cudaFree(rsd->seeds_device)) | |
| if(rsd->buffer_host != NULL) free(rsd->buffer_host); | |
| if(rsd->buffer_device != NULL) CUDA_CHECK(cudaFree(rsd->buffer_device)) | |
| delete rsd->generator; | |
| delete rsd->dist; | |
| memset(rsd,0,sizeof(curand_test_data)); | |
| } | |
| void calculate_grid_size(int threadCount, int blockSize, int maxGridSize, int* gridSize ) | |
| { | |
| *gridSize = (threadCount + blockSize - 1) / blockSize; | |
| if( *gridSize > maxGridSize ) | |
| { | |
| printf("ERROR: Required grid size %d too large for hardware\n", *gridSize); | |
| return; | |
| } | |
| } | |
| void test_curand( curand_test_data* rsd, size_t n, size_t seed, int device_id ) | |
| { | |
| int device_count; | |
| CUDA_CHECK(cudaGetDeviceCount(&device_count)) | |
| if( device_id < 0 || device_id >= device_count ) | |
| { | |
| printf("Parameter 3, Device ID, Invalid. Value must be less than the device count (%d)\n", device_count); | |
| return; | |
| } | |
| init_curand_test_data(rsd, n, seed, device_id ); | |
| cudaGetDeviceProperties(&rsd->deviceProp, rsd->device_id); | |
| cuda_params setup_kernel_params, generate_kernel_params; | |
| int maxBlockSize = rsd->deviceProp.maxThreadsPerBlock; | |
| int maxWarps = rsd->deviceProp.maxThreadsPerMultiProcessor / rsd->deviceProp.warpSize; | |
| int numBlocks; | |
| CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&setup_kernel_params.minGridSize,&setup_kernel_params.blockSize,setup_kernel,0,maxBlockSize)) | |
| printf("setup_kernel - minGridSize: %d - blockSize: %d\n", setup_kernel_params.minGridSize, setup_kernel_params.blockSize); | |
| cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,setup_kernel,setup_kernel_params.blockSize,0); | |
| printf(" occupancy: %f\n", (double)(numBlocks * setup_kernel_params.blockSize / rsd->deviceProp.warpSize) / maxWarps * 100.0); | |
| calculate_grid_size(rsd->n, setup_kernel_params.blockSize, rsd->deviceProp.maxGridSize[0], &setup_kernel_params.gridSize ); | |
| CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&generate_kernel_params.minGridSize,&generate_kernel_params.blockSize,setup_kernel,0,maxBlockSize)) | |
| printf("generate_kernel - minGridSize: %d - blockSize: %d\n", generate_kernel_params.minGridSize, generate_kernel_params.blockSize); | |
| cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,setup_kernel,generate_kernel_params.blockSize,0); | |
| printf(" occupancy: %f\n", (double)(numBlocks * generate_kernel_params.blockSize / rsd->deviceProp.warpSize) / maxWarps * 100.0); | |
| calculate_grid_size(rsd->n, generate_kernel_params.blockSize, rsd->deviceProp.maxGridSize[0], &generate_kernel_params.gridSize ); | |
| cudaEvent_t start,stop; | |
| CUDA_CHECK(cudaEventCreate(&start)) | |
| CUDA_CHECK(cudaEventCreate(&stop)) | |
| set_cuda_seeds(rsd,rsd->seeds_host,0,rsd->n); | |
| CUDA_CHECK(cudaMemcpy(rsd->seeds_device, rsd->seeds_host, rsd->seed_bytes, cudaMemcpyHostToDevice)) | |
| { | |
| float msElapsedTime; | |
| CUDA_CHECK(cudaEventRecord(start, NULL)) | |
| setup_kernel<<<setup_kernel_params.gridSize, setup_kernel_params.blockSize>>>(rsd->devPHILOXStates, rsd->seeds_device, 0, rsd->n); | |
| CUDA_CHECK(cudaGetLastError()) | |
| CUDA_CHECK(cudaEventRecord(stop, NULL)) | |
| CUDA_CHECK(cudaEventSynchronize(stop)) | |
| CUDA_CHECK(cudaEventElapsedTime(&msElapsedTime, start, stop)) | |
| printf("Initialized cuRand state in %fs\n",msElapsedTime/1000.0f); | |
| } | |
| { | |
| float msElapsedTime; | |
| CUDA_CHECK(cudaEventRecord(start, NULL)) | |
| generate_cuda<<<generate_kernel_params.gridSize, generate_kernel_params.blockSize>>>(rsd->buffer_device, rsd->devPHILOXStates, rsd->n); | |
| CUDA_CHECK(cudaGetLastError()) | |
| CUDA_CHECK(cudaEventRecord(stop, NULL)) | |
| CUDA_CHECK(cudaEventSynchronize(stop)) | |
| CUDA_CHECK(cudaEventElapsedTime(&msElapsedTime, start, stop)) | |
| printf("Generated cuRand values in %fs\n",msElapsedTime/1000.0f); | |
| } | |
| CUDA_CHECK(cudaMemcpy(rsd->buffer_host, rsd->buffer_device, rsd->buffer_bytes, cudaMemcpyDeviceToHost)) | |
| char filename[256]; | |
| memset(filename,0,sizeof(char)*256); | |
| str_add_int_suffix(&filename[0], "curand_test_data.bin", rsd->device_id, "bin"); | |
| write_bytes(filename, rsd->buffer_host, sizeof(float), (size_t)rsd->n*4); | |
| printf("Wrote %s\n", filename); | |
| curand_test_data_cleanup(rsd); | |
| } | |
| int main(int argc, char** argv) | |
| { | |
| size_t n; | |
| size_t seed; | |
| int device_id; | |
| curand_test_data rsd; | |
| if( argc != 4 ) | |
| { | |
| printf(" usage %s <seed> <n> <device_id>\n", argv[0]); | |
| exit(-1); | |
| } | |
| seed = atol(argv[1]); | |
| n = atol(argv[2]); | |
| device_id = atoi(argv[3]); | |
| test_curand(&rsd,n,seed,device_id); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment