Skip to content

Instantly share code, notes, and snippets.

@sonots
Last active July 9, 2017 11:55
Show Gist options
  • Save sonots/7e11231af2f2cc6b1f519ad832632566 to your computer and use it in GitHub Desktop.
Save sonots/7e11231af2f2cc6b1f519ad832632566 to your computer and use it in GitHub Desktop.
nvcc cudaMallocVScuMemAllocBench.cu -L /usr/local/cuda/lib64 -l cuda
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
/*
 * CHECK(call): evaluate a CUDA runtime API call once and abort the program
 * if it did not return cudaSuccess, printing the source location, the
 * numeric error code and the runtime's textual description.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as
 * a single statement (safe inside un-braced if/else), and `call` is
 * parenthesized so that any expression can be passed.
 */
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t check_err_ = (call);                               \
        if (check_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code: %d, reason: %s\n", (int)check_err_,       \
                    cudaGetErrorString(check_err_));                         \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CU_CHECK(call): evaluate a CUDA driver API call once and abort the program
 * if it did not return CUDA_SUCCESS, printing the source location, the
 * numeric error code and the driver's textual description.
 *
 * Mirrors CHECK for the runtime API: do { ... } while (0) makes the macro a
 * single statement, and the reason string is obtained via cuGetErrorString
 * (falls back to "unknown error" when the driver does not know the code).
 */
#define CU_CHECK(call)                                                       \
    do {                                                                     \
        const CUresult cu_check_err_ = (call);                               \
        if (cu_check_err_ != CUDA_SUCCESS) {                                 \
            const char *cu_check_msg_ = NULL;                                \
            if (cuGetErrorString(cu_check_err_, &cu_check_msg_) !=           \
                    CUDA_SUCCESS ||                                          \
                cu_check_msg_ == NULL) {                                     \
                cu_check_msg_ = "unknown error";                             \
            }                                                                \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code: %d, reason: %s\n", (int)cu_check_err_,    \
                    cu_check_msg_);                                          \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * Returns the current wall-clock time in seconds as a double, with the
 * microsecond part of gettimeofday() folded into the fraction.
 *
 * Returns 0.0 if gettimeofday() reports failure, instead of reading an
 * uninitialized timeval.
 */
inline double seconds()
{
    struct timeval tp;
    // The timezone argument of gettimeofday() is obsolete; pass NULL.
    if (gettimeofday(&tp, NULL) != 0) {
        return 0.0;
    }
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}
// Total number of bytes allocated per benchmark phase; test() splits it into
// total_size / size allocations of `size` bytes each.
int total_size = 1024 * 1024;

/*
 * Benchmarks device memory allocation through the runtime API
 * (cudaMalloc/cudaFree) and the driver API (cuMemAlloc/cuMemFree).
 *
 * size: bytes per allocation; must be positive. total_size / size
 *       allocations are made in each phase (zero when size > total_size).
 *
 * Prints the wall-clock time of each of the four phases to stdout. Any CUDA
 * error, or a non-positive size, terminates the program with exit code 1.
 */
void test(int size)
{
    // A non-positive size would divide by zero or yield a negative count.
    if (size <= 0) {
        fprintf(stderr, "Error: size must be positive, got %d\n", size);
        exit(1);
    }

    const int num = total_size / size;
    const size_t bytes = (size_t)size;

    // Heap-backed storage: variable-length arrays are not standard C++ and
    // can overflow the stack when size is small (num is then very large).
    std::vector<float *> d(num);
    std::vector<CUdeviceptr> d2(num);

    // Force CUDA context creation before timing starts; otherwise the first
    // cudaMalloc pays the one-time initialization cost and the runtime API
    // looks slower than it is.
    CHECK(cudaFree(0));

    double iStart, iElaps;

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CHECK(cudaMalloc((void **)&d[i], bytes));
    }
    iElaps = seconds() - iStart;
    printf("cudaMalloc(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CHECK(cudaFree(d[i]));
    }
    iElaps = seconds() - iStart;
    printf("cudaFree(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CU_CHECK(cuMemAlloc(&d2[i], bytes));
    }
    iElaps = seconds() - iStart;
    printf("cuMemAlloc(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CU_CHECK(cuMemFree(d2[i]));
    }
    iElaps = seconds() - iStart;
    printf("cuMemFree(%d) x %d Time elapsed %f sec\n", size, num, iElaps);
}
/*
 * Entry point: `prog size`, where size is the number of bytes per
 * allocation. Selects device 0, prints its name and runs test(size).
 *
 * Exits with code 1 when the argument is missing or is not a positive
 * integer that fits in an int, or when a CUDA call fails.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "%s size\n", argv[0]);
        exit(1);
    }

    // Parse strictly: atoi() would silently turn garbage into 0 and accept
    // negative values, both of which are invalid allocation sizes.
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT_MAX) {
        fprintf(stderr, "Error: size must be a positive integer, got '%s'\n",
                argv[1]);
        exit(1);
    }
    const int size = (int)parsed;

    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    test(size);
    return 0;
}
@sonots
Copy link
Author

sonots commented Jun 20, 2017

AWS p2.xlarge

$ ./a.out 256
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(256) x 4096 Time elapsed 0.122005 sec
cudaFree(256) x 4096 Time elapsed 0.026064 sec
cuMemAlloc(256) x 4096 Time elapsed 0.025064 sec
cuMemFree(256) x 4096 Time elapsed 0.025215 sec
$ ./a.out 512
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(512) x 2048 Time elapsed 0.100835 sec
cudaFree(512) x 2048 Time elapsed 0.009801 sec
cuMemAlloc(512) x 2048 Time elapsed 0.005990 sec
cuMemFree(512) x 2048 Time elapsed 0.009296 sec
$ ./a.out 1024
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(1024) x 1024 Time elapsed 0.097044 sec
cudaFree(1024) x 1024 Time elapsed 0.004797 sec
cuMemAlloc(1024) x 1024 Time elapsed 0.003089 sec
cuMemFree(1024) x 1024 Time elapsed 0.004644 sec
$ ./a.out 2048
./a.out Starting...
Using Device 0: Tesla K80
cudaMalloc(2048) x 512 Time elapsed 0.095105 sec
cudaFree(2048) x 512 Time elapsed 0.002426 sec
cuMemAlloc(2048) x 512 Time elapsed 0.001700 sec
cuMemFree(2048) x 512 Time elapsed 0.002350 sec

@sonots
Copy link
Author

sonots commented Jun 20, 2017

The minimum alignment for cuMemAlloc was 512 bytes, the same as for cudaMalloc:

0x12052de200
0x12052de400
0x12052de600
0x12052de800

@sonots
Copy link
Author

sonots commented Jul 9, 2017

cuMemAlloc looked faster, but only because cudaMalloc creates the CUDA context on its first call, so that one-time cost was included in the cudaMalloc measurement.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment