Last active
January 4, 2023 09:28
-
-
Save sonots/c2f220e1980778b42f111307097f2c31 to your computer and use it in GitHub Desktop.
Benchmark of cudaMalloc: allocate 1MB of memory in total, using several different block sizes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <sys/time.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Return the current time of day in seconds as a double.
 *
 * The value is tv_sec + tv_usec * 1e-6 as reported by gettimeofday(), so it
 * carries microsecond granularity. Callers subtract two readings to obtain an
 * elapsed interval.
 */
inline double seconds()
{
    struct timeval tp;
    // The timezone result was never read and the argument is obsolete, so
    // pass NULL rather than filling a throwaway struct.
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
int total_size = 1024 * 1024; // 1MB

/*
 * Time how long it takes to cudaMalloc, and then cudaFree, total_size bytes
 * of device memory split into blocks of `size` bytes each.
 *
 * size: block size in bytes; must be in the range 1..total_size. The number
 *       of blocks is total_size / size (integer division).
 *
 * Prints one line with the elapsed time of the allocation loop and one line
 * with the elapsed time of the free loop. Invalid sizes and CUDA failures are
 * reported on stderr instead of being silently ignored.
 */
void test(int size)
{
    // Reject sizes that would divide by zero or yield no blocks at all.
    if (size <= 0 || size > total_size) {
        fprintf(stderr, "test: size must be between 1 and %d bytes (got %d)\n",
                total_size, size);
        return;
    }

    int num = total_size / size;

    // Keep the pointer table on the heap: a variable-length array is not
    // standard C++ and would put up to num * sizeof(float *) bytes on the stack.
    float **d = (float **)malloc((size_t)num * sizeof(float *));
    if (d == NULL) {
        fprintf(stderr, "test: out of host memory for %d pointers\n", num);
        return;
    }

    // The CUDA runtime initializes lazily, so without this warm-up call the
    // first timed cudaMalloc would also pay the one-time initialization cost.
    cudaError_t err = cudaFree(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA initialization failed: %s\n", cudaGetErrorString(err));
        free(d);
        return;
    }

    // Allocation phase: stop at the first failure so only valid pointers are
    // freed afterwards.
    int allocated = 0;
    double iStart = seconds();
    while (allocated < num) {
        err = cudaMalloc((void **)&d[allocated], (size_t)size);
        if (err != cudaSuccess) {
            break;
        }
        allocated++;
    }
    double iElaps = seconds() - iStart;

    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc(%d) failed after %d of %d allocations: %s\n",
                size, allocated, num, cudaGetErrorString(err));
    } else {
        printf("cudaMalloc(%d) x %d Time elapsed %f sec\n", size, num, iElaps);
    }

    // Release phase: free exactly the blocks that were successfully allocated.
    cudaError_t freeErr = cudaSuccess;
    iStart = seconds();
    for (int i = 0; i < allocated; i++) {
        cudaError_t e = cudaFree(d[i]);
        if (e != cudaSuccess && freeErr == cudaSuccess) {
            freeErr = e; // remember the first failure, keep freeing the rest
        }
    }
    iElaps = seconds() - iStart;

    if (freeErr != cudaSuccess) {
        fprintf(stderr, "cudaFree(%d) failed: %s\n", size, cudaGetErrorString(freeErr));
    } else {
        printf("cudaFree(%d) x %d Time elapsed %f sec\n", size, allocated, iElaps);
    }

    free(d);
}
/*
 * Entry point: ./a.out <block size in bytes>
 *
 * Parses the block size from argv[1], selects device 0, prints its name and
 * runs test() with that block size. Returns 0 on success and EXIT_FAILURE
 * when the argument is missing or invalid, or when the device cannot be
 * queried or selected.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // argv[1] is only valid when an argument was actually supplied.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <block size in bytes>\n", argv[0]);
        return EXIT_FAILURE;
    }

    // strtol lets us reject non-numeric or trailing garbage, which atoi
    // would silently turn into 0 or a truncated number.
    char *end = NULL;
    long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > total_size) {
        fprintf(stderr, "Invalid block size '%s': expected an integer between 1 and %d\n",
                argv[1], total_size);
        return EXIT_FAILURE;
    }
    int size = (int)parsed;

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaError_t err = cudaGetDeviceProperties(&deviceProp, dev);
    if (err != cudaSuccess) {
        // deviceProp is not filled in on failure, so do not print its name.
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                dev, cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    printf("Using Device %d: %s\n", dev, deviceProp.name);

    err = cudaSetDevice(dev);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", dev, cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    test(size);
    return(0);
}
NVIDIA Tesla K80 (AWS p2.xlarge)
$ ./a.out 32
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(32) x 32768 Time elapsed 0.368449 sec
cudaFree(32) x 32768 Time elapsed 0.233109 sec
$ ./a.out 64
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(64) x 16384 Time elapsed 0.222854 sec
cudaFree(64) x 16384 Time elapsed 0.113812 sec
$ ./a.out 128
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(128) x 8192 Time elapsed 0.153932 sec
cudaFree(128) x 8192 Time elapsed 0.053498 sec
$ ./a.out 256
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(256) x 4096 Time elapsed 0.122204 sec
cudaFree(256) x 4096 Time elapsed 0.034702 sec
$ ./a.out 512
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(512) x 2048 Time elapsed 0.100751 sec
cudaFree(512) x 2048 Time elapsed 0.009651 sec
$ ./a.out 1024
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(1024) x 1024 Time elapsed 0.096302 sec
cudaFree(1024) x 1024 Time elapsed 0.004838 sec
$ ./a.out 1048576
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(1048576) x 1 Time elapsed 0.093849 sec
cudaFree(1048576) x 1 Time elapsed 0.000106 sec
Average for 256: 0.11
Average for 512: 0.09
Average for 1024: 0.086
Average for 2048: 0.085
speed: 1048576 ≒ 1024 ≒ 512 > 256 > 128 > 64 > 32
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
GeForce GTX TITAN X
speed: 1048576 ≒ 1024 ≒ 512 ≒ 256 > 128 > 64 > 32