Last active
September 17, 2023 11:16
-
-
Save so298/9b6575d5368e10cc18edab70ef9916cc to your computer and use it in GitHub Desktop.
cudaMemcpy bandwidth test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
// One decimal megabyte (10^6). Used both to scale the element count below and
// to convert bytes/s into Mbytes/s when reporting.
#define MEGA_BYTE (1'000'000)
// Number of int elements in each buffer. The amount of data moved per copy is
// N * sizeof(int) bytes, not N bytes.
const int N = 100 * MEGA_BYTE;
// Number of timed copies per direction; the reported figure is their average.
const int numIterations = 100;
// CUDA error check macro.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// error string, the numeric error code and the file/line of the call site to
// std::cerr, then terminates the process with the error code as exit status.
// The argument is parenthesized and the temporary uses a reserved-looking name
// so that an expression mentioning a caller variable called `err` still works.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        const cudaError_t cudaCheckStatus_ = (call);                           \
        if (cudaCheckStatus_ != cudaSuccess) {                                 \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cudaCheckStatus_) \
                      << " (" << cudaCheckStatus_ << ") at " << __FILE__ << ":" \
                      << __LINE__ << std::endl;                                \
            exit(cudaCheckStatus_);                                            \
        }                                                                      \
    } while (0)
int main() { | |
// allocate memory on host and device | |
int* h_data = new int[N]; | |
int* d_data; | |
CUDA_CHECK(cudaMalloc((void**)&d_data, N * sizeof(int))); | |
// data initialization | |
for (int i = 0; i < N; i++) { | |
h_data[i] = i; | |
} | |
// measure time for dev to host | |
clock_t start, end; | |
double devToHostTime = 0.0; | |
for (int iter = 0; iter < numIterations; iter++) { | |
start = clock(); | |
CUDA_CHECK(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost)); | |
end = clock(); | |
devToHostTime += (double)(end - start) / CLOCKS_PER_SEC; | |
} | |
// measure time for host to dev | |
double hostToDevTime = 0.0; | |
for (int iter = 0; iter < numIterations; iter++) { | |
start = clock(); | |
CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice)); | |
end = clock(); | |
hostToDevTime += (double)(end - start) / CLOCKS_PER_SEC; | |
} | |
// convert to average | |
devToHostTime /= numIterations; | |
hostToDevTime /= numIterations; | |
// show result | |
std::cout << "Average bandwidth for Device to Host memory copy: " << N * sizeof(int) / devToHostTime / MEGA_BYTE << " Mbytes / s" << std::endl; | |
std::cout << "Average bandwidth for Host to Device memory copy: " << N * sizeof(int) / hostToDevTime / MEGA_BYTE << " Mbytes / s" << std::endl; | |
// free memory | |
delete[] h_data; | |
CUDA_CHECK(cudaFree(d_data)); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
MDX (A100 with vmware virtualization)
Output