Last active
July 1, 2021 00:12
-
-
Save ajtulloch/c9ae27b8aed01262ff156b8f6b1e682a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions | |
* are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of NVIDIA CORPORATION nor the names of its | |
* contributors may be used to endorse or promote products derived | |
* from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | |
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
/* | |
* This is a simple test program to measure the memcopy bandwidth of the GPU. | |
* It can measure device to device copy bandwidth, host to device copy bandwidth | |
* for pageable and pinned memory, and device to host copy bandwidth for | |
* pageable and pinned memory. | |
* | |
* Usage: | |
* ./bandwidthTest [option]... | |
*/ | |
// CUDA runtime | |
#include <cuda_runtime.h> | |
// includes | |
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization | |
#include <helper_functions.h> // helper for shared functions common to CUDA Samples | |
#include <cuda.h> | |
#include <cassert> | |
#include <iostream> | |
#include <memory> | |
static const char *sSDKsample = "CUDA Bandwidth Test"; | |
// defines, project | |
#define MEMCOPY_ITERATIONS 100 | |
#define DEFAULT_SIZE (32 * (1e6)) // 32 M | |
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M | |
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M | |
// shmoo mode defines | |
#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M | |
#define SHMOO_MEMSIZE_START (1e3) // 1 KB | |
#define SHMOO_INCREMENT_1KB (1e3) // 1 KB | |
#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB | |
#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB | |
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB | |
#define SHMOO_INCREMENT_1MB (1e6) // 1 MB | |
#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB | |
#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB | |
#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB | |
#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB | |
#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB | |
#define SHMOO_LIMIT_1MB (1e6) // 1 MB | |
#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB | |
#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB | |
// CPU cache flush | |
#define FLUSH_SIZE (256 * 1024 * 1024) | |
char *flush_buf; | |
// enums, project | |
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE, BATCHGATHER_MODE }; | |
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; | |
enum printMode { USER_READABLE, CSV }; | |
enum memoryMode { PINNED, PAGEABLE }; | |
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", | |
"Device to Device", NULL}; | |
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; | |
// if true, use CPU based timing for everything | |
static bool bDontUseGPUTiming; | |
int *pArgc = NULL; | |
char **pArgv = NULL; | |
//////////////////////////////////////////////////////////////////////////////// | |
// declaration, forward | |
int runTest(const int argc, const char **argv); | |
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, | |
testMode mode, memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, bool wc); | |
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, | |
bool wc); | |
void testBandwidthRange(unsigned int start, unsigned int end, | |
unsigned int increment, memcpyKind kind, | |
printMode printmode, memoryMode memMode, | |
int startDevice, int endDevice, bool wc); | |
void testBandwidthShmoo(memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, | |
bool wc); | |
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, | |
bool wc); | |
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, | |
bool wc); | |
float testDeviceToDeviceTransfer(unsigned int memSize); | |
float testBatchGatherTransfer(unsigned int memSize); | |
void printResultsReadable(unsigned int *memSizes, double *bandwidths, | |
unsigned int count, memcpyKind kind, | |
memoryMode memMode, int iNumDevs, bool wc); | |
void printResultsCSV(unsigned int *memSizes, double *bandwidths, | |
unsigned int count, memcpyKind kind, memoryMode memMode, | |
int iNumDevs, bool wc); | |
void printHelp(void); | |
//////////////////////////////////////////////////////////////////////////////// | |
// Program main | |
//////////////////////////////////////////////////////////////////////////////// | |
int main(int argc, char **argv) { | |
pArgc = &argc; | |
pArgv = argv; | |
flush_buf = (char *)malloc(FLUSH_SIZE); | |
// set logfile name and start logs | |
printf("[%s] - Starting...\n", sSDKsample); | |
int iRetVal = runTest(argc, (const char **)argv); | |
if (iRetVal < 0) { | |
checkCudaErrors(cudaSetDevice(0)); | |
} | |
// finish | |
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); | |
printf( | |
"\nNOTE: The CUDA Samples are not meant for performance measurements. " | |
"Results may vary when GPU Boost is enabled.\n"); | |
free(flush_buf); | |
exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
// Parse args, run the appropriate tests | |
/////////////////////////////////////////////////////////////////////////////// | |
int runTest(const int argc, const char **argv) { | |
int start = DEFAULT_SIZE; | |
int end = DEFAULT_SIZE; | |
int startDevice = 0; | |
int endDevice = 0; | |
int increment = DEFAULT_INCREMENT; | |
testMode mode = QUICK_MODE; | |
bool htod = false; | |
bool dtoh = false; | |
bool dtod = false; | |
bool wc = false; | |
char *modeStr; | |
char *device = NULL; | |
printMode printmode = USER_READABLE; | |
char *memModeStr = NULL; | |
memoryMode memMode = PINNED; | |
// process command line args | |
if (checkCmdLineFlag(argc, argv, "help")) { | |
printHelp(); | |
return 0; | |
} | |
if (checkCmdLineFlag(argc, argv, "csv")) { | |
printmode = CSV; | |
} | |
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { | |
if (strcmp(memModeStr, "pageable") == 0) { | |
memMode = PAGEABLE; | |
} else if (strcmp(memModeStr, "pinned") == 0) { | |
memMode = PINNED; | |
} else { | |
printf("Invalid memory mode - valid modes are pageable or pinned\n"); | |
printf("See --help for more information\n"); | |
return -1000; | |
} | |
} else { | |
// default - pinned memory | |
memMode = PINNED; | |
} | |
if (getCmdLineArgumentString(argc, argv, "device", &device)) { | |
int deviceCount; | |
cudaError_t error_id = cudaGetDeviceCount(&deviceCount); | |
if (error_id != cudaSuccess) { | |
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, | |
cudaGetErrorString(error_id)); | |
exit(EXIT_FAILURE); | |
} | |
if (deviceCount == 0) { | |
printf("!!!!!No devices found!!!!!\n"); | |
return -2000; | |
} | |
if (strcmp(device, "all") == 0) { | |
printf( | |
"\n!!!!!Cumulative Bandwidth to be computed from all the devices " | |
"!!!!!!\n\n"); | |
startDevice = 0; | |
endDevice = deviceCount - 1; | |
} else { | |
startDevice = endDevice = atoi(device); | |
if (startDevice >= deviceCount || startDevice < 0) { | |
printf( | |
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be " | |
"used !!!!!\n", | |
startDevice, 0); | |
startDevice = endDevice = 0; | |
} | |
} | |
} | |
printf("Running on...\n\n"); | |
for (int currentDevice = startDevice; currentDevice <= endDevice; | |
currentDevice++) { | |
cudaDeviceProp deviceProp; | |
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); | |
if (error_id == cudaSuccess) { | |
printf(" Device %d: %s\n", currentDevice, deviceProp.name); | |
if (deviceProp.computeMode == cudaComputeModeProhibited) { | |
fprintf(stderr, | |
"Error: device is running in <Compute Mode Prohibited>, no " | |
"threads can use ::cudaSetDevice().\n"); | |
checkCudaErrors(cudaSetDevice(currentDevice)); | |
exit(EXIT_FAILURE); | |
} | |
} else { | |
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, | |
cudaGetErrorString(error_id)); | |
checkCudaErrors(cudaSetDevice(currentDevice)); | |
exit(EXIT_FAILURE); | |
} | |
} | |
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { | |
// figure out the mode | |
if (strcmp(modeStr, "quick") == 0) { | |
printf(" Quick Mode\n\n"); | |
mode = QUICK_MODE; | |
} else if (strcmp(modeStr, "shmoo") == 0) { | |
printf(" Shmoo Mode\n\n"); | |
mode = SHMOO_MODE; | |
} else if (strcmp(modeStr, "range") == 0) { | |
printf(" Range Mode\n\n"); | |
mode = RANGE_MODE; | |
} else if (strcmp(modeStr, "batchgather") == 0) { | |
printf(" batchgather Mode\n\n"); | |
mode = BATCHGATHER_MODE; | |
} else { | |
printf("Invalid mode - valid modes are quick, range, or shmoo\n"); | |
printf("See --help for more information\n"); | |
return -3000; | |
} | |
} else { | |
// default mode - quick | |
printf(" Quick Mode\n\n"); | |
mode = QUICK_MODE; | |
} | |
if (checkCmdLineFlag(argc, argv, "htod")) { | |
htod = true; | |
} | |
if (checkCmdLineFlag(argc, argv, "dtoh")) { | |
dtoh = true; | |
} | |
if (checkCmdLineFlag(argc, argv, "dtod")) { | |
dtod = true; | |
} | |
#if CUDART_VERSION >= 2020 | |
if (checkCmdLineFlag(argc, argv, "wc")) { | |
wc = true; | |
} | |
#endif | |
if (checkCmdLineFlag(argc, argv, "cputiming")) { | |
bDontUseGPUTiming = true; | |
} | |
if (!htod && !dtoh && !dtod) { | |
// default: All | |
htod = true; | |
dtoh = true; | |
dtod = true; | |
} | |
if (RANGE_MODE == mode) { | |
if (checkCmdLineFlag(argc, (const char **)argv, "start")) { | |
start = getCmdLineArgumentInt(argc, argv, "start"); | |
if (start <= 0) { | |
printf("Illegal argument - start must be greater than zero\n"); | |
return -4000; | |
} | |
} else { | |
printf("Must specify a starting size in range mode\n"); | |
printf("See --help for more information\n"); | |
return -5000; | |
} | |
if (checkCmdLineFlag(argc, (const char **)argv, "end")) { | |
end = getCmdLineArgumentInt(argc, argv, "end"); | |
if (end <= 0) { | |
printf("Illegal argument - end must be greater than zero\n"); | |
return -6000; | |
} | |
if (start > end) { | |
printf("Illegal argument - start is greater than end\n"); | |
return -7000; | |
} | |
} else { | |
printf("Must specify an end size in range mode.\n"); | |
printf("See --help for more information\n"); | |
return -8000; | |
} | |
if (checkCmdLineFlag(argc, argv, "increment")) { | |
increment = getCmdLineArgumentInt(argc, argv, "increment"); | |
if (increment <= 0) { | |
printf("Illegal argument - increment must be greater than zero\n"); | |
return -9000; | |
} | |
} else { | |
printf("Must specify an increment in user mode\n"); | |
printf("See --help for more information\n"); | |
return -10000; | |
} | |
} | |
if (htod) { | |
testBandwidth((unsigned int)start, (unsigned int)end, | |
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode, | |
memMode, startDevice, endDevice, wc); | |
} | |
if (dtoh) { | |
testBandwidth((unsigned int)start, (unsigned int)end, | |
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode, | |
memMode, startDevice, endDevice, wc); | |
} | |
if (dtod) { | |
testBandwidth((unsigned int)start, (unsigned int)end, | |
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, | |
memMode, startDevice, endDevice, wc); | |
} | |
// Ensure that we reset all CUDA Devices in question | |
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { | |
cudaSetDevice(nDevice); | |
} | |
return 0; | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
// Run a bandwidth test | |
/////////////////////////////////////////////////////////////////////////////// | |
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, | |
testMode mode, memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, | |
bool wc) { | |
switch (mode) { | |
case QUICK_MODE: | |
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, | |
endDevice, wc); | |
break; | |
case RANGE_MODE: | |
testBandwidthRange(start, end, increment, kind, printmode, memMode, | |
startDevice, endDevice, wc); | |
break; | |
case SHMOO_MODE: | |
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); | |
break; | |
case BATCHGATHER_MODE: | |
testBandwidthRange(start, end, increment, kind, printmode, memMode, | |
startDevice, endDevice, wc); | |
break; | |
default: | |
break; | |
} | |
} | |
////////////////////////////////////////////////////////////////////// | |
// Run a quick mode bandwidth test | |
////////////////////////////////////////////////////////////////////// | |
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, | |
bool wc) { | |
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, | |
startDevice, endDevice, wc); | |
} | |
/////////////////////////////////////////////////////////////////////// | |
// Run a range mode bandwidth test | |
////////////////////////////////////////////////////////////////////// | |
void testBandwidthRange(unsigned int start, unsigned int end, | |
unsigned int increment, memcpyKind kind, | |
printMode printmode, memoryMode memMode, | |
int startDevice, int endDevice, bool wc) { | |
// count the number of copies we're going to run | |
unsigned int count = 1 + ((end - start) / increment); | |
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); | |
double *bandwidths = (double *)malloc(count * sizeof(double)); | |
// Before calculating the cumulative bandwidth, initialize bandwidths array to | |
// NULL | |
for (unsigned int i = 0; i < count; i++) { | |
bandwidths[i] = 0.0; | |
} | |
// Use the device asked by the user | |
for (int currentDevice = startDevice; currentDevice <= endDevice; | |
currentDevice++) { | |
cudaSetDevice(currentDevice); | |
// run each of the copies | |
for (unsigned int i = 0; i < count; i++) { | |
memSizes[i] = start + i * increment; | |
switch (kind) { | |
case DEVICE_TO_HOST: | |
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); | |
break; | |
case HOST_TO_DEVICE: | |
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); | |
break; | |
case DEVICE_TO_DEVICE: | |
bandwidths[i] += testBatchGatherTransfer(memSizes[i]); | |
break; | |
} | |
} | |
} // Complete the bandwidth computation on all the devices | |
// print results | |
if (printmode == CSV) { | |
printResultsCSV(memSizes, bandwidths, count, kind, memMode, | |
(1 + endDevice - startDevice), wc); | |
} else { | |
printResultsReadable(memSizes, bandwidths, count, kind, memMode, | |
(1 + endDevice - startDevice), wc); | |
} | |
// clean up | |
free(memSizes); | |
free(bandwidths); | |
} | |
////////////////////////////////////////////////////////////////////////////// | |
// Intense shmoo mode - covers a large range of values with varying increments | |
////////////////////////////////////////////////////////////////////////////// | |
void testBandwidthShmoo(memcpyKind kind, printMode printmode, | |
memoryMode memMode, int startDevice, int endDevice, | |
bool wc) { | |
// count the number of copies to make | |
unsigned int count = | |
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + | |
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + | |
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + | |
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + | |
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + | |
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + | |
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); | |
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); | |
double *bandwidths = (double *)malloc(count * sizeof(double)); | |
// Before calculating the cumulative bandwidth, initialize bandwidths array to | |
// NULL | |
for (unsigned int i = 0; i < count; i++) { | |
bandwidths[i] = 0.0; | |
} | |
// Use the device asked by the user | |
for (int currentDevice = startDevice; currentDevice <= endDevice; | |
currentDevice++) { | |
cudaSetDevice(currentDevice); | |
// Run the shmoo | |
int iteration = 0; | |
unsigned int memSize = 0; | |
while (memSize <= SHMOO_MEMSIZE_MAX) { | |
if (memSize < SHMOO_LIMIT_20KB) { | |
memSize += SHMOO_INCREMENT_1KB; | |
} else if (memSize < SHMOO_LIMIT_50KB) { | |
memSize += SHMOO_INCREMENT_2KB; | |
} else if (memSize < SHMOO_LIMIT_100KB) { | |
memSize += SHMOO_INCREMENT_10KB; | |
} else if (memSize < SHMOO_LIMIT_1MB) { | |
memSize += SHMOO_INCREMENT_100KB; | |
} else if (memSize < SHMOO_LIMIT_16MB) { | |
memSize += SHMOO_INCREMENT_1MB; | |
} else if (memSize < SHMOO_LIMIT_32MB) { | |
memSize += SHMOO_INCREMENT_2MB; | |
} else { | |
memSize += SHMOO_INCREMENT_4MB; | |
} | |
memSizes[iteration] = memSize; | |
switch (kind) { | |
case DEVICE_TO_HOST: | |
bandwidths[iteration] += | |
testDeviceToHostTransfer(memSizes[iteration], memMode, wc); | |
break; | |
case HOST_TO_DEVICE: | |
bandwidths[iteration] += | |
testHostToDeviceTransfer(memSizes[iteration], memMode, wc); | |
break; | |
case DEVICE_TO_DEVICE: | |
bandwidths[iteration] += | |
testBatchGatherTransfer(memSizes[iteration]); | |
break; | |
} | |
iteration++; | |
printf("."); | |
fflush(0); | |
} | |
} // Complete the bandwidth computation on all the devices | |
// print results | |
printf("\n"); | |
if (CSV == printmode) { | |
printResultsCSV(memSizes, bandwidths, count, kind, memMode, | |
(1 + endDevice - startDevice), wc); | |
} else { | |
printResultsReadable(memSizes, bandwidths, count, kind, memMode, | |
(1 + endDevice - startDevice), wc); | |
} | |
// clean up | |
free(memSizes); | |
free(bandwidths); | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
// test the bandwidth of a device to host memcopy of a specific size | |
/////////////////////////////////////////////////////////////////////////////// | |
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, | |
bool wc) { | |
StopWatchInterface *timer = NULL; | |
float elapsedTimeInMs = 0.0f; | |
float bandwidthInGBs = 0.0f; | |
unsigned char *h_idata = NULL; | |
unsigned char *h_odata = NULL; | |
cudaEvent_t start, stop; | |
sdkCreateTimer(&timer); | |
checkCudaErrors(cudaEventCreate(&start)); | |
checkCudaErrors(cudaEventCreate(&stop)); | |
// allocate host memory | |
if (PINNED == memMode) { | |
// pinned memory mode - use special function to get OS-pinned memory | |
#if CUDART_VERSION >= 2020 | |
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, | |
(wc) ? cudaHostAllocWriteCombined : 0)); | |
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, | |
(wc) ? cudaHostAllocWriteCombined : 0)); | |
#else | |
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); | |
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); | |
#endif | |
} else { | |
// pageable memory mode - use malloc | |
h_idata = (unsigned char *)malloc(memSize); | |
h_odata = (unsigned char *)malloc(memSize); | |
if (h_idata == 0 || h_odata == 0) { | |
fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); | |
exit(EXIT_FAILURE); | |
} | |
} | |
// initialize the memory | |
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { | |
h_idata[i] = (unsigned char)(i & 0xff); | |
} | |
// allocate device memory | |
unsigned char *d_idata; | |
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); | |
// initialize the device memory | |
checkCudaErrors( | |
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); | |
// copy data from GPU to Host | |
if (PINNED == memMode) { | |
if (bDontUseGPUTiming) sdkStartTimer(&timer); | |
checkCudaErrors(cudaEventRecord(start, 0)); | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, | |
cudaMemcpyDeviceToHost, 0)); | |
} | |
checkCudaErrors(cudaEventRecord(stop, 0)); | |
checkCudaErrors(cudaDeviceSynchronize()); | |
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); | |
if (bDontUseGPUTiming) { | |
sdkStopTimer(&timer); | |
elapsedTimeInMs = sdkGetTimerValue(&timer); | |
sdkResetTimer(&timer); | |
} | |
} else { | |
elapsedTimeInMs = 0; | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
sdkStartTimer(&timer); | |
checkCudaErrors( | |
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); | |
sdkStopTimer(&timer); | |
elapsedTimeInMs += sdkGetTimerValue(&timer); | |
sdkResetTimer(&timer); | |
memset(flush_buf, i, FLUSH_SIZE); | |
} | |
} | |
// calculate bandwidth in GB/s | |
double time_s = elapsedTimeInMs / 1e3; | |
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; | |
bandwidthInGBs = bandwidthInGBs / time_s; | |
// clean up memory | |
checkCudaErrors(cudaEventDestroy(stop)); | |
checkCudaErrors(cudaEventDestroy(start)); | |
sdkDeleteTimer(&timer); | |
if (PINNED == memMode) { | |
checkCudaErrors(cudaFreeHost(h_idata)); | |
checkCudaErrors(cudaFreeHost(h_odata)); | |
} else { | |
free(h_idata); | |
free(h_odata); | |
} | |
checkCudaErrors(cudaFree(d_idata)); | |
return bandwidthInGBs; | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
//! test the bandwidth of a host to device memcopy of a specific size | |
/////////////////////////////////////////////////////////////////////////////// | |
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, | |
bool wc) { | |
StopWatchInterface *timer = NULL; | |
float elapsedTimeInMs = 0.0f; | |
float bandwidthInGBs = 0.0f; | |
cudaEvent_t start, stop; | |
sdkCreateTimer(&timer); | |
checkCudaErrors(cudaEventCreate(&start)); | |
checkCudaErrors(cudaEventCreate(&stop)); | |
// allocate host memory | |
unsigned char *h_odata = NULL; | |
if (PINNED == memMode) { | |
#if CUDART_VERSION >= 2020 | |
// pinned memory mode - use special function to get OS-pinned memory | |
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, | |
(wc) ? cudaHostAllocWriteCombined : 0)); | |
#else | |
// pinned memory mode - use special function to get OS-pinned memory | |
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); | |
#endif | |
} else { | |
// pageable memory mode - use malloc | |
h_odata = (unsigned char *)malloc(memSize); | |
if (h_odata == 0) { | |
fprintf(stderr, "Not enough memory available on host to run test!\n"); | |
exit(EXIT_FAILURE); | |
} | |
} | |
unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); | |
unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); | |
if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { | |
fprintf(stderr, "Not enough memory available on host to run test!\n"); | |
exit(EXIT_FAILURE); | |
} | |
// initialize the memory | |
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { | |
h_odata[i] = (unsigned char)(i & 0xff); | |
} | |
for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { | |
h_cacheClear1[i] = (unsigned char)(i & 0xff); | |
h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); | |
} | |
// allocate device memory | |
unsigned char *d_idata; | |
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); | |
// copy host memory to device memory | |
if (PINNED == memMode) { | |
if (bDontUseGPUTiming) sdkStartTimer(&timer); | |
checkCudaErrors(cudaEventRecord(start, 0)); | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, | |
cudaMemcpyHostToDevice, 0)); | |
} | |
checkCudaErrors(cudaEventRecord(stop, 0)); | |
checkCudaErrors(cudaDeviceSynchronize()); | |
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); | |
if (bDontUseGPUTiming) { | |
sdkStopTimer(&timer); | |
elapsedTimeInMs = sdkGetTimerValue(&timer); | |
sdkResetTimer(&timer); | |
} | |
} else { | |
elapsedTimeInMs = 0; | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
sdkStartTimer(&timer); | |
checkCudaErrors( | |
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); | |
sdkStopTimer(&timer); | |
elapsedTimeInMs += sdkGetTimerValue(&timer); | |
sdkResetTimer(&timer); | |
memset(flush_buf, i, FLUSH_SIZE); | |
} | |
} | |
// calculate bandwidth in GB/s | |
double time_s = elapsedTimeInMs / 1e3; | |
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; | |
bandwidthInGBs = bandwidthInGBs / time_s; | |
// clean up memory | |
checkCudaErrors(cudaEventDestroy(stop)); | |
checkCudaErrors(cudaEventDestroy(start)); | |
sdkDeleteTimer(&timer); | |
if (PINNED == memMode) { | |
checkCudaErrors(cudaFreeHost(h_odata)); | |
} else { | |
free(h_odata); | |
} | |
free(h_cacheClear1); | |
free(h_cacheClear2); | |
checkCudaErrors(cudaFree(d_idata)); | |
return bandwidthInGBs; | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
//! test the bandwidth of a device to device memcopy of a specific size | |
/////////////////////////////////////////////////////////////////////////////// | |
float testDeviceToDeviceTransfer(unsigned int memSize) { | |
StopWatchInterface *timer = NULL; | |
float elapsedTimeInMs = 0.0f; | |
float bandwidthInGBs = 0.0f; | |
cudaEvent_t start, stop; | |
sdkCreateTimer(&timer); | |
checkCudaErrors(cudaEventCreate(&start)); | |
checkCudaErrors(cudaEventCreate(&stop)); | |
// allocate host memory | |
unsigned char *h_idata = (unsigned char *)malloc(memSize); | |
if (h_idata == 0) { | |
fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); | |
exit(EXIT_FAILURE); | |
} | |
// initialize the host memory | |
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { | |
h_idata[i] = (unsigned char)(i & 0xff); | |
} | |
// allocate device memory | |
unsigned char *d_idata; | |
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); | |
unsigned char *d_odata; | |
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); | |
// initialize memory | |
checkCudaErrors( | |
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); | |
// run the memcopy | |
sdkStartTimer(&timer); | |
checkCudaErrors(cudaEventRecord(start, 0)); | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
checkCudaErrors( | |
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); | |
} | |
checkCudaErrors(cudaEventRecord(stop, 0)); | |
// Since device to device memory copies are non-blocking, | |
// cudaDeviceSynchronize() is required in order to get | |
// proper timing. | |
checkCudaErrors(cudaDeviceSynchronize()); | |
// get the total elapsed time in ms | |
sdkStopTimer(&timer); | |
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); | |
if (bDontUseGPUTiming) { | |
elapsedTimeInMs = sdkGetTimerValue(&timer); | |
} | |
// calculate bandwidth in GB/s | |
double time_s = elapsedTimeInMs / 1e3; | |
bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; | |
bandwidthInGBs = bandwidthInGBs / time_s; | |
// clean up memory | |
sdkDeleteTimer(&timer); | |
free(h_idata); | |
checkCudaErrors(cudaEventDestroy(stop)); | |
checkCudaErrors(cudaEventDestroy(start)); | |
checkCudaErrors(cudaFree(d_idata)); | |
checkCudaErrors(cudaFree(d_odata)); | |
return bandwidthInGBs; | |
} | |
///////////////////////////////////////////////////////// | |
// print results in an easily read format | |
//////////////////////////////////////////////////////// | |
void printResultsReadable(unsigned int *memSizes, double *bandwidths, | |
unsigned int count, memcpyKind kind, | |
memoryMode memMode, int iNumDevs, bool wc) { | |
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); | |
printf(" %s Memory Transfers\n", sMemoryMode[memMode]); | |
if (wc) { | |
printf(" Write-Combined Memory Writes are Enabled"); | |
} | |
printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); | |
unsigned int i; | |
for (i = 0; i < (count - 1); i++) { | |
printf(" %u\t\t\t%s%.1f\n", memSizes[i], | |
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); | |
} | |
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], | |
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); | |
} | |
/////////////////////////////////////////////////////////////////////////// | |
// print results in a database format | |
/////////////////////////////////////////////////////////////////////////// | |
void printResultsCSV(unsigned int *memSizes, double *bandwidths, | |
unsigned int count, memcpyKind kind, memoryMode memMode, | |
int iNumDevs, bool wc) { | |
std::string sConfig; | |
// log config information | |
if (kind == DEVICE_TO_DEVICE) { | |
sConfig += "D2D"; | |
} else { | |
if (kind == DEVICE_TO_HOST) { | |
sConfig += "D2H"; | |
} else if (kind == HOST_TO_DEVICE) { | |
sConfig += "H2D"; | |
} | |
if (memMode == PAGEABLE) { | |
sConfig += "-Paged"; | |
} else if (memMode == PINNED) { | |
sConfig += "-Pinned"; | |
if (wc) { | |
sConfig += "-WriteCombined"; | |
} | |
} | |
} | |
unsigned int i; | |
double dSeconds = 0.0; | |
for (i = 0; i < count; i++) { | |
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); | |
printf( | |
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " | |
"bytes, NumDevsUsed = %d\n", | |
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); | |
} | |
} | |
__global__ void batch_gather_kernel(__restrict__ uint8_t* data, __restrict__ uint32_t* indices, __restrict__ uint8_t* output, uint32_t D) { | |
// warp per row. | |
uint32_t row_idx = blockIdx.x * blockDim.y + threadIdx.y; | |
uint32_t idx = indices[row_idx]; | |
auto* input_row_start = &data[D * idx]; | |
auto* output_row_start = &output[D * row_idx]; | |
for (auto i = 16 * threadIdx.x; i < D; i += 16 * 32) { | |
*(uint4*)(&output_row_start[i]) = *(uint4*)(&input_row_start[i]); | |
} | |
} | |
void run_batch_gather(__restrict__ uint8_t* data, __restrict__ uint32_t* indices, __restrict__ uint8_t* output, uint32_t D, uint32_t N) { | |
batch_gather_kernel<<<dim3(N / 8), dim3(32, 8), 0, nullptr>>>( | |
data, indices, output, D | |
); | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
//! test the bandwidth of a device to device memcopy of a specific size | |
/////////////////////////////////////////////////////////////////////////////// | |
float testBatchGatherTransfer(unsigned int memSize) { | |
StopWatchInterface *timer = NULL; | |
float elapsedTimeInMs = 0.0f; | |
float bandwidthInGBs = 0.0f; | |
cudaEvent_t start, stop; | |
sdkCreateTimer(&timer); | |
checkCudaErrors(cudaEventCreate(&start)); | |
checkCudaErrors(cudaEventCreate(&stop)); | |
// allocate host memory | |
uint64_t D = 2048; | |
uint64_t E = 10000000; | |
uint64_t R = 32 * 100000; | |
// initialize the host memory | |
uint32_t *h_indices = (uint32_t *)malloc(R * sizeof(uint32_t)); | |
for (unsigned int i = 0; i < R; i++) { | |
h_indices[i] = rand() % E; | |
} | |
// allocate device memory | |
uint8_t*d_table; | |
checkCudaErrors(cudaMalloc((void **)&d_table, E * D)); | |
uint32_t*d_indices; | |
checkCudaErrors(cudaMalloc((void **)&d_indices, R * sizeof(uint32_t))); | |
uint8_t*d_odata; | |
checkCudaErrors(cudaMalloc((void **)&d_odata, R * D)); | |
// initialize memory | |
checkCudaErrors( | |
cudaMemcpy(d_indices, h_indices, R * sizeof(uint32_t), cudaMemcpyHostToDevice)); | |
// run the memcopy | |
sdkStartTimer(&timer); | |
checkCudaErrors(cudaEventRecord(start, 0)); | |
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { | |
run_batch_gather(d_table, d_indices, d_odata, D, R); | |
} | |
checkCudaErrors(cudaEventRecord(stop, 0)); | |
// Since device to device memory copies are non-blocking, | |
// cudaDeviceSynchronize() is required in order to get | |
// proper timing. | |
checkCudaErrors(cudaDeviceSynchronize()); | |
// get the total elapsed time in ms | |
sdkStopTimer(&timer); | |
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); | |
if (bDontUseGPUTiming) { | |
elapsedTimeInMs = sdkGetTimerValue(&timer); | |
} | |
// calculate bandwidth in GB/s | |
double time_s = elapsedTimeInMs / 1e3; | |
bandwidthInGBs = (2.0f * R * D * (float)MEMCOPY_ITERATIONS) / (double)1e9; | |
bandwidthInGBs = bandwidthInGBs / time_s; | |
printf("batchGather, Bandwidth = %.1f GB/s\n", bandwidthInGBs); | |
// clean up memory | |
sdkDeleteTimer(&timer); | |
free(h_indices); | |
checkCudaErrors(cudaEventDestroy(stop)); | |
checkCudaErrors(cudaEventDestroy(start)); | |
checkCudaErrors(cudaFree(d_table)); | |
checkCudaErrors(cudaFree(d_indices)); | |
checkCudaErrors(cudaFree(d_odata)); | |
return bandwidthInGBs; | |
} | |
/////////////////////////////////////////////////////////////////////////// | |
// Print help screen | |
/////////////////////////////////////////////////////////////////////////// | |
void printHelp(void) { | |
printf("Usage: bandwidthTest [OPTION]...\n"); | |
printf( | |
"Test the bandwidth for device to host, host to device, and device to " | |
"device transfers\n"); | |
printf("\n"); | |
printf( | |
"Example: measure the bandwidth of device to host pinned memory copies " | |
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); | |
printf( | |
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " | |
"--increment=1024 --dtoh\n"); | |
printf("\n"); | |
printf("Options:\n"); | |
printf("--help\tDisplay this help menu\n"); | |
printf("--csv\tPrint results as a CSV\n"); | |
printf("--device=[deviceno]\tSpecify the device device to be used\n"); | |
printf(" all - compute cumulative bandwidth on all the devices\n"); | |
printf(" 0,1,2,...,n - Specify any particular device to be used\n"); | |
printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); | |
printf(" pageable - pageable memory\n"); | |
printf(" pinned - non-pageable system memory\n"); | |
printf("--mode=[MODE]\tSpecify the mode to use\n"); | |
printf(" quick - performs a quick measurement\n"); | |
printf(" range - measures a user-specified range of values\n"); | |
printf(" shmoo - performs an intense shmoo of a large range of values\n"); | |
printf(" batchgather - batchgather\n"); | |
printf("--htod\tMeasure host to device transfers\n"); | |
printf("--dtoh\tMeasure device to host transfers\n"); | |
printf("--dtod\tMeasure device to device transfers\n"); | |
#if CUDART_VERSION >= 2020 | |
printf("--wc\tAllocate pinned memory as write-combined\n"); | |
#endif | |
printf("--cputiming\tForce CPU-based timing always\n"); | |
printf("Range mode options\n"); | |
printf("--start=[SIZE]\tStarting transfer size in bytes\n"); | |
printf("--end=[SIZE]\tEnding transfer size in bytes\n"); | |
printf("--increment=[SIZE]\tIncrement size in bytes\n"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment