Skip to content

Instantly share code, notes, and snippets.

@ajtulloch
Last active July 1, 2021 00:12
Show Gist options
  • Save ajtulloch/c9ae27b8aed01262ff156b8f6b1e682a to your computer and use it in GitHub Desktop.
Save ajtulloch/c9ae27b8aed01262ff156b8f6b1e682a to your computer and use it in GitHub Desktop.
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a simple test program to measure the memcopy bandwidth of the GPU.
* It can measure device to device copy bandwidth, host to device copy bandwidth
* for pageable and pinned memory, and device to host copy bandwidth for
* pageable and pinned memory.
*
* Usage:
* ./bandwidthTest [option]...
*/
// CUDA runtime
#include <cuda_runtime.h>
// includes
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <memory>
static const char *sSDKsample = "CUDA Bandwidth Test";
// defines, project
#define MEMCOPY_ITERATIONS 100
#define DEFAULT_SIZE (32 * (1e6)) // 32 M
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M
// shmoo mode defines
#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M
#define SHMOO_MEMSIZE_START (1e3) // 1 KB
#define SHMOO_INCREMENT_1KB (1e3) // 1 KB
#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB
#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_INCREMENT_1MB (1e6) // 1 MB
#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB
#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB
#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB
#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB
#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_LIMIT_1MB (1e6) // 1 MB
#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB
#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB
// CPU cache flush
#define FLUSH_SIZE (256 * 1024 * 1024)
char *flush_buf;
// enums, project
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE, BATCHGATHER_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
"Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
// if true, use CPU based timing for everything
static bool bDontUseGPUTiming;
int *pArgc = NULL;
char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc);
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
float testBatchGatherTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc);
void printHelp(void);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
pArgc = &argc;
pArgv = argv;
flush_buf = (char *)malloc(FLUSH_SIZE);
// set logfile name and start logs
printf("[%s] - Starting...\n", sSDKsample);
int iRetVal = runTest(argc, (const char **)argv);
if (iRetVal < 0) {
checkCudaErrors(cudaSetDevice(0));
}
// finish
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
free(flush_buf);
exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
}
///////////////////////////////////////////////////////////////////////////////
// Parse args, run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv) {
int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int startDevice = 0;
int endDevice = 0;
int increment = DEFAULT_INCREMENT;
testMode mode = QUICK_MODE;
bool htod = false;
bool dtoh = false;
bool dtod = false;
bool wc = false;
char *modeStr;
char *device = NULL;
printMode printmode = USER_READABLE;
char *memModeStr = NULL;
memoryMode memMode = PINNED;
// process command line args
if (checkCmdLineFlag(argc, argv, "help")) {
printHelp();
return 0;
}
if (checkCmdLineFlag(argc, argv, "csv")) {
printmode = CSV;
}
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (strcmp(memModeStr, "pageable") == 0) {
memMode = PAGEABLE;
} else if (strcmp(memModeStr, "pinned") == 0) {
memMode = PINNED;
} else {
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n");
return -1000;
}
} else {
// default - pinned memory
memMode = PINNED;
}
if (getCmdLineArgumentString(argc, argv, "device", &device)) {
int deviceCount;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
exit(EXIT_FAILURE);
}
if (deviceCount == 0) {
printf("!!!!!No devices found!!!!!\n");
return -2000;
}
if (strcmp(device, "all") == 0) {
printf(
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"!!!!!!\n\n");
startDevice = 0;
endDevice = deviceCount - 1;
} else {
startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0) {
printf(
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"used !!!!!\n",
startDevice, 0);
startDevice = endDevice = 0;
}
}
}
printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
if (error_id == cudaSuccess) {
printf(" Device %d: %s\n", currentDevice, deviceProp.name);
if (deviceProp.computeMode == cudaComputeModeProhibited) {
fprintf(stderr,
"Error: device is running in <Compute Mode Prohibited>, no "
"threads can use ::cudaSetDevice().\n");
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
} else {
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
}
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
// figure out the mode
if (strcmp(modeStr, "quick") == 0) {
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
} else if (strcmp(modeStr, "shmoo") == 0) {
printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE;
} else if (strcmp(modeStr, "range") == 0) {
printf(" Range Mode\n\n");
mode = RANGE_MODE;
} else if (strcmp(modeStr, "batchgather") == 0) {
printf(" batchgather Mode\n\n");
mode = BATCHGATHER_MODE;
} else {
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n");
return -3000;
}
} else {
// default mode - quick
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
}
if (checkCmdLineFlag(argc, argv, "htod")) {
htod = true;
}
if (checkCmdLineFlag(argc, argv, "dtoh")) {
dtoh = true;
}
if (checkCmdLineFlag(argc, argv, "dtod")) {
dtod = true;
}
#if CUDART_VERSION >= 2020
if (checkCmdLineFlag(argc, argv, "wc")) {
wc = true;
}
#endif
if (checkCmdLineFlag(argc, argv, "cputiming")) {
bDontUseGPUTiming = true;
}
if (!htod && !dtoh && !dtod) {
// default: All
htod = true;
dtoh = true;
dtod = true;
}
if (RANGE_MODE == mode) {
if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
start = getCmdLineArgumentInt(argc, argv, "start");
if (start <= 0) {
printf("Illegal argument - start must be greater than zero\n");
return -4000;
}
} else {
printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n");
return -5000;
}
if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
end = getCmdLineArgumentInt(argc, argv, "end");
if (end <= 0) {
printf("Illegal argument - end must be greater than zero\n");
return -6000;
}
if (start > end) {
printf("Illegal argument - start is greater than end\n");
return -7000;
}
} else {
printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n");
return -8000;
}
if (checkCmdLineFlag(argc, argv, "increment")) {
increment = getCmdLineArgumentInt(argc, argv, "increment");
if (increment <= 0) {
printf("Illegal argument - increment must be greater than zero\n");
return -9000;
}
} else {
printf("Must specify an increment in user mode\n");
printf("See --help for more information\n");
return -10000;
}
}
if (htod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
}
if (dtoh) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
memMode, startDevice, endDevice, wc);
}
if (dtod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
}
// Ensure that we reset all CUDA Devices in question
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
cudaSetDevice(nDevice);
}
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
switch (mode) {
case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
endDevice, wc);
break;
case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode,
startDevice, endDevice, wc);
break;
case SHMOO_MODE:
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
break;
case BATCHGATHER_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode,
startDevice, endDevice, wc);
break;
default:
break;
}
}
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode,
startDevice, endDevice, wc);
}
///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc) {
// count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize bandwidths array to
// NULL
for (unsigned int i = 0; i < count; i++) {
bandwidths[i] = 0.0;
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaSetDevice(currentDevice);
// run each of the copies
for (unsigned int i = 0; i < count; i++) {
memSizes[i] = start + i * increment;
switch (kind) {
case DEVICE_TO_HOST:
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[i] += testBatchGatherTransfer(memSizes[i]);
break;
}
}
} // Complete the bandwidth computation on all the devices
// print results
if (printmode == CSV) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
}
// clean up
free(memSizes);
free(bandwidths);
}
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
// count the number of copies to make
unsigned int count =
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize bandwidths array to
// NULL
for (unsigned int i = 0; i < count; i++) {
bandwidths[i] = 0.0;
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaSetDevice(currentDevice);
// Run the shmoo
int iteration = 0;
unsigned int memSize = 0;
while (memSize <= SHMOO_MEMSIZE_MAX) {
if (memSize < SHMOO_LIMIT_20KB) {
memSize += SHMOO_INCREMENT_1KB;
} else if (memSize < SHMOO_LIMIT_50KB) {
memSize += SHMOO_INCREMENT_2KB;
} else if (memSize < SHMOO_LIMIT_100KB) {
memSize += SHMOO_INCREMENT_10KB;
} else if (memSize < SHMOO_LIMIT_1MB) {
memSize += SHMOO_INCREMENT_100KB;
} else if (memSize < SHMOO_LIMIT_16MB) {
memSize += SHMOO_INCREMENT_1MB;
} else if (memSize < SHMOO_LIMIT_32MB) {
memSize += SHMOO_INCREMENT_2MB;
} else {
memSize += SHMOO_INCREMENT_4MB;
}
memSizes[iteration] = memSize;
switch (kind) {
case DEVICE_TO_HOST:
bandwidths[iteration] +=
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[iteration] +=
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[iteration] +=
testBatchGatherTransfer(memSizes[iteration]);
break;
}
iteration++;
printf(".");
fflush(0);
}
} // Complete the bandwidth computation on all the devices
// print results
printf("\n");
if (CSV == printmode) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
}
// clean up
free(memSizes);
free(bandwidths);
}
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
if (PINNED == memMode) {
// pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
// pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
if (h_idata == 0 || h_odata == 0) {
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
}
// initialize the memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// initialize the device memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy data from GPU to Host
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
cudaMemcpyDeviceToHost, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
sdkStopTimer(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
memset(flush_buf, i, FLUSH_SIZE);
}
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata));
} else {
free(h_idata);
free(h_odata);
}
checkCudaErrors(cudaFree(d_idata));
return bandwidthInGBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
unsigned char *h_odata = NULL;
if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
// pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0) {
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
}
unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
// initialize the memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_odata[i] = (unsigned char)(i & 0xff);
}
for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
h_cacheClear1[i] = (unsigned char)(i & 0xff);
h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// copy host memory to device memory
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
cudaMemcpyHostToDevice, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
sdkStopTimer(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
memset(flush_buf, i, FLUSH_SIZE);
}
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_odata));
} else {
free(h_odata);
}
free(h_cacheClear1);
free(h_cacheClear2);
checkCudaErrors(cudaFree(d_idata));
return bandwidthInGBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
unsigned char *h_idata = (unsigned char *)malloc(memSize);
if (h_idata == 0) {
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
// initialize the host memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
unsigned char *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
// initialize memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// run the memcopy
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
}
checkCudaErrors(cudaEventRecord(stop, 0));
// Since device to device memory copies are non-blocking,
// cudaDeviceSynchronize() is required in order to get
// proper timing.
checkCudaErrors(cudaDeviceSynchronize());
// get the total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
sdkDeleteTimer(&timer);
free(h_idata);
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
return bandwidthInGBs;
}
/////////////////////////////////////////////////////////
// print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc) {
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
if (wc) {
printf(" Write-Combined Memory Writes are Enabled");
}
printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");
unsigned int i;
for (i = 0; i < (count - 1); i++) {
printf(" %u\t\t\t%s%.1f\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
///////////////////////////////////////////////////////////////////////////
// print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc) {
std::string sConfig;
// log config information
if (kind == DEVICE_TO_DEVICE) {
sConfig += "D2D";
} else {
if (kind == DEVICE_TO_HOST) {
sConfig += "D2H";
} else if (kind == HOST_TO_DEVICE) {
sConfig += "H2D";
}
if (memMode == PAGEABLE) {
sConfig += "-Paged";
} else if (memMode == PINNED) {
sConfig += "-Pinned";
if (wc) {
sConfig += "-WriteCombined";
}
}
}
unsigned int i;
double dSeconds = 0.0;
for (i = 0; i < count; i++) {
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
printf(
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
}
}
__global__ void batch_gather_kernel(__restrict__ uint8_t* data, __restrict__ uint32_t* indices, __restrict__ uint8_t* output, uint32_t D) {
// warp per row.
uint32_t row_idx = blockIdx.x * blockDim.y + threadIdx.y;
uint32_t idx = indices[row_idx];
auto* input_row_start = &data[D * idx];
auto* output_row_start = &output[D * row_idx];
for (auto i = 16 * threadIdx.x; i < D; i += 16 * 32) {
*(uint4*)(&output_row_start[i]) = *(uint4*)(&input_row_start[i]);
}
}
void run_batch_gather(__restrict__ uint8_t* data, __restrict__ uint32_t* indices, __restrict__ uint8_t* output, uint32_t D, uint32_t N) {
batch_gather_kernel<<<dim3(N / 8), dim3(32, 8), 0, nullptr>>>(
data, indices, output, D
);
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testBatchGatherTransfer(unsigned int memSize) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
uint64_t D = 2048;
uint64_t E = 10000000;
uint64_t R = 32 * 100000;
// initialize the host memory
uint32_t *h_indices = (uint32_t *)malloc(R * sizeof(uint32_t));
for (unsigned int i = 0; i < R; i++) {
h_indices[i] = rand() % E;
}
// allocate device memory
uint8_t*d_table;
checkCudaErrors(cudaMalloc((void **)&d_table, E * D));
uint32_t*d_indices;
checkCudaErrors(cudaMalloc((void **)&d_indices, R * sizeof(uint32_t)));
uint8_t*d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, R * D));
// initialize memory
checkCudaErrors(
cudaMemcpy(d_indices, h_indices, R * sizeof(uint32_t), cudaMemcpyHostToDevice));
// run the memcopy
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
run_batch_gather(d_table, d_indices, d_odata, D, R);
}
checkCudaErrors(cudaEventRecord(stop, 0));
// Since device to device memory copies are non-blocking,
// cudaDeviceSynchronize() is required in order to get
// proper timing.
checkCudaErrors(cudaDeviceSynchronize());
// get the total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (2.0f * R * D * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
printf("batchGather, Bandwidth = %.1f GB/s\n", bandwidthInGBs);
// clean up memory
sdkDeleteTimer(&timer);
free(h_indices);
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaFree(d_table));
checkCudaErrors(cudaFree(d_indices));
checkCudaErrors(cudaFree(d_odata));
return bandwidthInGBs;
}
///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
printf("Usage: bandwidthTest [OPTION]...\n");
printf(
"Test the bandwidth for device to host, host to device, and device to "
"device transfers\n");
printf("\n");
printf(
"Example: measure the bandwidth of device to host pinned memory copies "
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf(
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"--increment=1024 --dtoh\n");
printf("\n");
printf("Options:\n");
printf("--help\tDisplay this help menu\n");
printf("--csv\tPrint results as a CSV\n");
printf("--device=[deviceno]\tSpecify the device device to be used\n");
printf(" all - compute cumulative bandwidth on all the devices\n");
printf(" 0,1,2,...,n - Specify any particular device to be used\n");
printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
printf(" pageable - pageable memory\n");
printf(" pinned - non-pageable system memory\n");
printf("--mode=[MODE]\tSpecify the mode to use\n");
printf(" quick - performs a quick measurement\n");
printf(" range - measures a user-specified range of values\n");
printf(" shmoo - performs an intense shmoo of a large range of values\n");
printf(" batchgather - batchgather\n");
printf("--htod\tMeasure host to device transfers\n");
printf("--dtoh\tMeasure device to host transfers\n");
printf("--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
printf("--cputiming\tForce CPU-based timing always\n");
printf("Range mode options\n");
printf("--start=[SIZE]\tStarting transfer size in bytes\n");
printf("--end=[SIZE]\tEnding transfer size in bytes\n");
printf("--increment=[SIZE]\tIncrement size in bytes\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment