Last active
September 17, 2023 11:16
-
-
Save so298/9b6575d5368e10cc18edab70ef9916cc to your computer and use it in GitHub Desktop.
cudaMemcpy bandwidth test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
// One megabyte in SI units (10^6 bytes); used to size the buffer and to scale
// the reported bandwidth.
#define MEGA_BYTE (1'000'000)

// Number of int elements in each buffer. The amount of data moved per copy is
// N * sizeof(int) bytes, not N bytes.
constexpr int N = 100 * MEGA_BYTE;

// Number of timed copies per direction; the reported figure is their average.
constexpr int numIterations = 100;
// CUDA error check macro.
//
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// error string, the numeric error code and the call site (__FILE__:__LINE__)
// to std::cerr, then terminates the process using the error code as the exit
// status.
//
// The argument is parenthesised so that any expression can be passed safely,
// and the local variable has a macro-specific name so it cannot capture an
// identifier that appears inside `call`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        const cudaError_t cudaCheckErr_ = (call);                             \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cudaCheckErr_)  \
                      << " (" << cudaCheckErr_ << ") at " << __FILE__ << ":"  \
                      << __LINE__ << std::endl;                               \
            std::exit(cudaCheckErr_);                                         \
        }                                                                     \
    } while (0)
int main() { | |
// allocate memory on host and device | |
int* h_data = new int[N]; | |
int* d_data; | |
CUDA_CHECK(cudaMalloc((void**)&d_data, N * sizeof(int))); | |
// data initialization | |
for (int i = 0; i < N; i++) { | |
h_data[i] = i; | |
} | |
// measure time for dev to host | |
clock_t start, end; | |
double devToHostTime = 0.0; | |
for (int iter = 0; iter < numIterations; iter++) { | |
start = clock(); | |
CUDA_CHECK(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost)); | |
end = clock(); | |
devToHostTime += (double)(end - start) / CLOCKS_PER_SEC; | |
} | |
// measure time for host to dev | |
double hostToDevTime = 0.0; | |
for (int iter = 0; iter < numIterations; iter++) { | |
start = clock(); | |
CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice)); | |
end = clock(); | |
hostToDevTime += (double)(end - start) / CLOCKS_PER_SEC; | |
} | |
// convert to average | |
devToHostTime /= numIterations; | |
hostToDevTime /= numIterations; | |
// show result | |
std::cout << "Average bandwidth for Device to Host memory copy: " << N * sizeof(int) / devToHostTime / MEGA_BYTE << " Mbytes / s" << std::endl; | |
std::cout << "Average bandwidth for Host to Device memory copy: " << N * sizeof(int) / hostToDevTime / MEGA_BYTE << " Mbytes / s" << std::endl; | |
// free memory | |
delete[] h_data; | |
CUDA_CHECK(cudaFree(d_data)); | |
return 0; | |
} |
Wisteria/BDEC-1 aquarius
$ nvidia-smi
Sun Sep 17 19:52:00 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13 Driver Version: 525.60.13 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100-SXM... On | 00000000:27:00.0 Off | 0 |
| N/A 26C P0 53W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM... On | 00000000:2A:00.0 Off | 0 |
| N/A 24C P0 54W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA A100-SXM... On | 00000000:51:00.0 Off | 0 |
| N/A 24C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA A100-SXM... On | 00000000:57:00.0 Off | 0 |
| N/A 24C P0 53W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 4 NVIDIA A100-SXM... On | 00000000:9E:00.0 Off | 0 |
| N/A 24C P0 54W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 5 NVIDIA A100-SXM... On | 00000000:A4:00.0 Off | 0 |
| N/A 24C P0 51W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 6 NVIDIA A100-SXM... On | 00000000:C7:00.0 Off | 0 |
| N/A 24C P0 54W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 7 NVIDIA A100-SXM... On | 00000000:CA:00.0 Off | 0 |
| N/A 25C P0 53W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Output
Average bandwidth for Device to Host memory copy: 10242.4 Mbytes / s
Average bandwidth for Host to Device memory copy: 11211.3 Mbytes / s
MDX (A100 with VMware virtualization)
$ nvidia-smi
Sun Sep 17 20:11:06 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-SXM4-40GB On | 00000000:0C:00.0 Off | 0 |
| N/A 26C P0 45W / 400W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
Output
Average bandwidth for Device to Host memory copy: 9817.57 Mbytes / s
Average bandwidth for Host to Device memory copy: 10713.9 Mbytes / s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output (RTX 3060 Ti, PCIe 4.0)
Average bandwidth for Device to Host memory copy: 8880.28 Mbytes / s
Average bandwidth for Host to Device memory copy: 7798.98 Mbytes / s