Skip to content

Instantly share code, notes, and snippets.

@Lokno
Created June 17, 2021 22:27
Show Gist options
  • Select an option

  • Save Lokno/8f3bcd3b53921585b43be964943057cb to your computer and use it in GitHub Desktop.

Select an option

Save Lokno/8f3bcd3b53921585b43be964943057cb to your computer and use it in GitHub Desktop.
Generates a buffer of Gaussian noise using cuRand and writes it to a binary file
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <random>
#include <limits>
#ifdef __unix__
#include <pthread.h>
#endif
#include <curand_kernel.h>
#define CUDA_CHECK(err) if( (err) != cudaSuccess ) { fprintf(stderr, "ERROR %d: %s in %s at line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); exit(EXIT_FAILURE); }
typedef struct
{
int minGridSize;
int gridSize;
int blockSize;
int numBlocks;
}cuda_params;
typedef unsigned long long u64;
typedef struct
{
size_t n;
size_t seed;
int device_id;
size_t seed_bytes;
size_t buffer_bytes;
u64* seeds_host;
u64* seeds_device;
float* buffer_host;
float* buffer_device;
curandStatePhilox4_32_10_t* devPHILOXStates;
cudaDeviceProp deviceProp;
std::default_random_engine* generator;
std::uniform_int_distribution<int>* dist;
}curand_test_data;
void str_add_int_suffix(char* dst, const char* src, int suffix, const char* extension )
{
int slen = strlen(src);
int elen = strlen(extension) + 1;
if( slen < elen || strcmp(src + (slen - elen + 1), extension) != 0 )
{
sprintf(dst,"%s_%d.%s",src, suffix, extension );
}
else
{
strncpy(dst, src, slen-elen);
sprintf(dst+slen-elen,"_%d.%s", suffix, extension );
}
}
void write_bytes(const char* filename, void* ptr, size_t size, size_t count)
{
FILE* pfile = fopen(filename,"wb");
if( pfile != NULL )
{
fwrite(ptr, size, count, pfile);
fclose(pfile);
}
}
void set_cuda_seeds(curand_test_data* rsd, unsigned long long* seeds, size_t offset, size_t n)
{
for( size_t i = 0; i < offset; i++ )
{
int x = (*rsd->dist)(*rsd->generator);
}
for( size_t i = 0; i < n; i++ )
{
seeds[i] = (unsigned long long)(*rsd->dist)(*rsd->generator);
}
}
__global__ void setup_kernel(curandStatePhilox4_32_10_t * state, unsigned long long* seeds, int offset, int n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if( i < n )
{
curand_init(seeds[i], offset+i, 0, &state[i]);
}
}
__global__ void generate_cuda(float* __restrict__ buffer, curandStatePhilox4_32_10_t* __restrict__ state, int n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if( i < n )
{
int j = i*4;
curandStatePhilox4_32_10_t localState = state[i];
float4 gaussdev = curand_normal4(&localState);
buffer[j+0] = gaussdev.x;
buffer[j+1] = gaussdev.y;
buffer[j+2] = gaussdev.z;
buffer[j+3] = gaussdev.w;
}
}
void init_curand_test_data(curand_test_data* rsd, size_t n, size_t seed, int device_id )
{
rsd->generator = new std::default_random_engine(seed);
rsd->dist = new std::uniform_int_distribution<int>(0, std::numeric_limits<int>::max());
rsd->seed = seed;
rsd->n = n;
rsd->device_id = device_id;
printf("Generating %zu values on device %d with seed %zu\n", n, rsd->device_id, seed);
CUDA_CHECK(cudaSetDevice(rsd->device_id))
CUDA_CHECK(cudaMalloc((void **)&rsd->devPHILOXStates, sizeof(curandStatePhilox4_32_10_t)*rsd->n))
rsd->seed_bytes = sizeof(u64)*n;
CUDA_CHECK(cudaMalloc((void **)&rsd->seeds_device, rsd->seed_bytes))
rsd->seeds_host = (u64*)malloc(rsd->seed_bytes);
rsd->buffer_bytes = sizeof(float)*n*4;
CUDA_CHECK(cudaMalloc((void **)&rsd->buffer_device, rsd->buffer_bytes))
rsd->buffer_host = (float*)malloc( rsd->buffer_bytes);
}
void curand_test_data_cleanup(curand_test_data* rsd)
{
if(rsd->seeds_host != NULL) free(rsd->seeds_host );
if(rsd->seeds_device != NULL) CUDA_CHECK(cudaFree(rsd->seeds_device))
if(rsd->buffer_host != NULL) free(rsd->buffer_host);
if(rsd->buffer_device != NULL) CUDA_CHECK(cudaFree(rsd->buffer_device))
delete rsd->generator;
delete rsd->dist;
memset(rsd,0,sizeof(curand_test_data));
}
void calculate_grid_size(int threadCount, int blockSize, int maxGridSize, int* gridSize )
{
*gridSize = (threadCount + blockSize - 1) / blockSize;
if( *gridSize > maxGridSize )
{
printf("ERROR: Required grid size %d too large for hardware\n", *gridSize);
return;
}
}
void test_curand( curand_test_data* rsd, size_t n, size_t seed, int device_id )
{
int device_count;
CUDA_CHECK(cudaGetDeviceCount(&device_count))
if( device_id < 0 || device_id >= device_count )
{
printf("Parameter 3, Device ID, Invalid. Value must be less than the device count (%d)\n", device_count);
return;
}
init_curand_test_data(rsd, n, seed, device_id );
cudaGetDeviceProperties(&rsd->deviceProp, rsd->device_id);
cuda_params setup_kernel_params, generate_kernel_params;
int maxBlockSize = rsd->deviceProp.maxThreadsPerBlock;
int maxWarps = rsd->deviceProp.maxThreadsPerMultiProcessor / rsd->deviceProp.warpSize;
int numBlocks;
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&setup_kernel_params.minGridSize,&setup_kernel_params.blockSize,setup_kernel,0,maxBlockSize))
printf("setup_kernel - minGridSize: %d - blockSize: %d\n", setup_kernel_params.minGridSize, setup_kernel_params.blockSize);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,setup_kernel,setup_kernel_params.blockSize,0);
printf(" occupancy: %f\n", (double)(numBlocks * setup_kernel_params.blockSize / rsd->deviceProp.warpSize) / maxWarps * 100.0);
calculate_grid_size(rsd->n, setup_kernel_params.blockSize, rsd->deviceProp.maxGridSize[0], &setup_kernel_params.gridSize );
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&generate_kernel_params.minGridSize,&generate_kernel_params.blockSize,setup_kernel,0,maxBlockSize))
printf("generate_kernel - minGridSize: %d - blockSize: %d\n", generate_kernel_params.minGridSize, generate_kernel_params.blockSize);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,setup_kernel,generate_kernel_params.blockSize,0);
printf(" occupancy: %f\n", (double)(numBlocks * generate_kernel_params.blockSize / rsd->deviceProp.warpSize) / maxWarps * 100.0);
calculate_grid_size(rsd->n, generate_kernel_params.blockSize, rsd->deviceProp.maxGridSize[0], &generate_kernel_params.gridSize );
cudaEvent_t start,stop;
CUDA_CHECK(cudaEventCreate(&start))
CUDA_CHECK(cudaEventCreate(&stop))
set_cuda_seeds(rsd,rsd->seeds_host,0,rsd->n);
CUDA_CHECK(cudaMemcpy(rsd->seeds_device, rsd->seeds_host, rsd->seed_bytes, cudaMemcpyHostToDevice))
{
float msElapsedTime;
CUDA_CHECK(cudaEventRecord(start, NULL))
setup_kernel<<<setup_kernel_params.gridSize, setup_kernel_params.blockSize>>>(rsd->devPHILOXStates, rsd->seeds_device, 0, rsd->n);
CUDA_CHECK(cudaGetLastError())
CUDA_CHECK(cudaEventRecord(stop, NULL))
CUDA_CHECK(cudaEventSynchronize(stop))
CUDA_CHECK(cudaEventElapsedTime(&msElapsedTime, start, stop))
printf("Initialized cuRand state in %fs\n",msElapsedTime/1000.0f);
}
{
float msElapsedTime;
CUDA_CHECK(cudaEventRecord(start, NULL))
generate_cuda<<<generate_kernel_params.gridSize, generate_kernel_params.blockSize>>>(rsd->buffer_device, rsd->devPHILOXStates, rsd->n);
CUDA_CHECK(cudaGetLastError())
CUDA_CHECK(cudaEventRecord(stop, NULL))
CUDA_CHECK(cudaEventSynchronize(stop))
CUDA_CHECK(cudaEventElapsedTime(&msElapsedTime, start, stop))
printf("Generated cuRand values in %fs\n",msElapsedTime/1000.0f);
}
CUDA_CHECK(cudaMemcpy(rsd->buffer_host, rsd->buffer_device, rsd->buffer_bytes, cudaMemcpyDeviceToHost))
char filename[256];
memset(filename,0,sizeof(char)*256);
str_add_int_suffix(&filename[0], "curand_test_data.bin", rsd->device_id, "bin");
write_bytes(filename, rsd->buffer_host, sizeof(float), (size_t)rsd->n*4);
printf("Wrote %s\n", filename);
curand_test_data_cleanup(rsd);
}
int main(int argc, char** argv)
{
size_t n;
size_t seed;
int device_id;
curand_test_data rsd;
if( argc != 4 )
{
printf(" usage %s <seed> <n> <device_id>\n", argv[0]);
exit(-1);
}
seed = atol(argv[1]);
n = atol(argv[2]);
device_id = atoi(argv[3]);
test_curand(&rsd,n,seed,device_id);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment