Last active
July 9, 2017 11:55
-
-
Save sonots/7e11231af2f2cc6b1f519ad832632566 to your computer and use it in GitHub Desktop.
nvcc cudaMallocVScuMemAllocBench.cu -L /usr/local/cuda/lib64 -l cuda
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
/*
 * CHECK(call): evaluate a CUDA runtime API call once and abort on failure.
 *
 * If the call does not return cudaSuccess, the source location, the numeric
 * error code and the text from cudaGetErrorString() are written to stderr and
 * the process exits with status 1.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
 * single statement (safe inside un-braced if/else), the argument is
 * parenthesised, and the local uses a name unlikely to collide with anything
 * appearing in `call`.
 */
#define CHECK(call)                                                            \
    do {                                                                       \
        const cudaError_t check_err_ = (call);                                 \
        if (check_err_ != cudaSuccess) {                                       \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);             \
            fprintf(stderr, "code: %d, reason: %s\n", (int)check_err_,         \
                    cudaGetErrorString(check_err_));                           \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
/*
 * CU_CHECK(call): evaluate a CUDA driver API call once and abort on failure.
 *
 * If the call does not return CUDA_SUCCESS, the source location, the numeric
 * CUresult and a description obtained from cuGetErrorString() are written to
 * stderr and the process exits with status 1. When no description can be
 * obtained, "unknown error" is printed instead.
 *
 * Wrapped in do { ... } while (0) so that `CU_CHECK(x);` is a single statement;
 * the argument is parenthesised and the locals use names unlikely to collide
 * with anything appearing in `call`.
 */
#define CU_CHECK(call)                                                         \
    do {                                                                       \
        const CUresult cu_check_err_ = (call);                                 \
        if (cu_check_err_ != CUDA_SUCCESS) {                                   \
            const char *cu_check_msg_ = NULL;                                  \
            if (cuGetErrorString(cu_check_err_, &cu_check_msg_) !=             \
                    CUDA_SUCCESS ||                                            \
                cu_check_msg_ == NULL) {                                       \
                cu_check_msg_ = "unknown error";                               \
            }                                                                  \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);             \
            fprintf(stderr, "code: %d, reason: %s\n", (int)cu_check_err_,      \
                    cu_check_msg_);                                            \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
/*
 * Return the current wall-clock time in seconds, with microsecond resolution,
 * as reported by gettimeofday().
 *
 * The timezone argument is passed as NULL: it is obsolete and the behaviour
 * for a non-null pointer is unspecified by POSIX.
 */
inline double seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
/* Total number of bytes requested in each timed allocation pass. */
int total_size = 1024 * 1024;

/*
 * Time total_size / size allocations of `size` bytes each, first through the
 * runtime API (cudaMalloc / cudaFree) and then through the driver API
 * (cuMemAlloc / cuMemFree), printing the elapsed wall-clock time of each of
 * the four loops to stdout.
 *
 * size: bytes per allocation; must be positive. A non-positive value is
 *       reported on stderr and the function returns without benchmarking.
 *       A value larger than total_size yields zero allocations.
 *
 * Any CUDA failure terminates the process via CHECK / CU_CHECK.
 */
void test(int size)
{
    if (size <= 0) {
        fprintf(stderr, "size must be a positive number of bytes, got %d\n",
                size);
        return;
    }

    const int num = total_size / size;

    // Heap storage for the handles: for small sizes `num` is large enough
    // that stack arrays of this length can exceed the default stack limit.
    std::vector<float *> d(num);
    std::vector<CUdeviceptr> d2(num);

    double iStart, iElaps;

    // Warm-up: cudaFree(0) releases nothing but makes the runtime perform its
    // one-time initialisation now, so that cost is not attributed to the
    // first timed cudaMalloc below.
    CHECK(cudaFree(0));

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CHECK(cudaMalloc((void **)&d[i], size));
    }
    iElaps = seconds() - iStart;
    printf("cudaMalloc(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CHECK(cudaFree(d[i]));
    }
    iElaps = seconds() - iStart;
    printf("cudaFree(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CU_CHECK(cuMemAlloc(&d2[i], size));
    }
    iElaps = seconds() - iStart;
    printf("cuMemAlloc(%d) x %d Time elapsed %f sec\n", size, num, iElaps);

    iStart = seconds();
    for (int i = 0; i < num; i++) {
        CU_CHECK(cuMemFree(d2[i]));
    }
    iElaps = seconds() - iStart;
    printf("cuMemFree(%d) x %d Time elapsed %f sec\n", size, num, iElaps);
}
/*
 * Entry point: `prog size`.
 *
 * Parses `size` (bytes per allocation) from argv[1], selects device 0,
 * prints its name, and runs the allocation benchmark via test(size).
 * Exits with status 1 when the argument is missing or is not a positive
 * integer that fits in an int, or when a CUDA runtime call fails.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        printf("%s size\n", argv[0]);
        exit(1);
    }

    // Validate the argument before touching the GPU; atoi() would silently
    // turn garbage into 0 and let negative values through.
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT_MAX) {
        fprintf(stderr, "%s: size must be a positive integer, got '%s'\n",
                argv[0], argv[1]);
        exit(1);
    }
    const int size = (int)parsed;

    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    test(size);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cuMemAlloc looked faster, but only because the first cudaMalloc call also creates the CUDA context, so that one-time cost was counted in the cudaMalloc timing.