Last active
February 23, 2025 16:00
-
-
Save f0k/0d6431e3faa60bffc788f8b4daa029b1 to your computer and use it in GitHub Desktop.
Simple program to test whether nvcc/CUDA work
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <cuda.h> | |
#include <cuda_runtime_api.h> | |
/* Outputs some information on CUDA-enabled devices on your computer,
 * including compute capability and current memory usage.
 *
 * On Linux, compile with: nvcc -o cuda_check cuda_check.c -lcuda
 * On Windows, compile with: nvcc -o cuda_check.exe cuda_check.c -lcuda
 *
 * Authors: Thomas Unterthiner, Jan Schlüter
 */
/* Returns the number of CUDA cores per multiprocessor for a given
 * Compute Capability version, or 0 if the version is not in the table.
 *
 * There is no way to retrieve that via the API, so it needs to be
 * hard-coded. See _ConvertSMVer2Cores in helper_cuda.h in NVIDIA's
 * CUDA Samples.
 *
 * major, minor: the two components of the compute capability (e.g. 8, 6).
 */
int ConvertSMVer2Cores(int major, int minor)
{
    // The version is packed as one hex digit per component, so that
    // compute capability X.Y corresponds to the case label 0xXY.
    switch ((major << 4) + minor) {
        case 0x10: return 8;    // Tesla
        case 0x11: return 8;
        case 0x12: return 8;
        case 0x13: return 8;
        case 0x20: return 32;   // Fermi
        case 0x21: return 48;
        case 0x30: return 192;  // Kepler
        case 0x32: return 192;
        case 0x35: return 192;
        case 0x37: return 192;
        case 0x50: return 128;  // Maxwell
        case 0x52: return 128;
        case 0x53: return 128;
        case 0x60: return 64;   // Pascal
        case 0x61: return 128;
        case 0x62: return 128;
        case 0x70: return 64;   // Volta
        case 0x72: return 64;   // Xavier
        case 0x75: return 64;   // Turing
        case 0x80: return 64;   // Ampere
        case 0x86: return 128;
        case 0x87: return 128;
        case 0x89: return 128;  // Ada
        case 0x90: return 128;  // Hopper
        case 0xa0: return 128;  // Blackwell
        case 0xc0: return 128;
        default: return 0;      // unknown architecture
    }
}
int main() | |
{ | |
int nGpus; | |
int i; | |
char name[100]; | |
int cc_major, cc_minor, cores, cuda_cores, threads_per_core, clockrate; | |
size_t freeMem; | |
size_t totalMem; | |
CUresult result; | |
CUdevice device; | |
CUcontext context; | |
result = cuInit(0); | |
if (result != CUDA_SUCCESS) { | |
printf("cuInit failed with error code %d: %s\n", result, cudaGetErrorString(result)); | |
return 1; | |
} | |
result = cuDeviceGetCount(&nGpus); | |
if (result != CUDA_SUCCESS) { | |
printf("cuDeviceGetCount failed with error code %d: %s\n", result, cudaGetErrorString(result)); | |
return 1; | |
} | |
printf("Found %d device(s).\n", nGpus); | |
for (i = 0; i < nGpus; i++) { | |
cuDeviceGet(&device, i); | |
printf("Device: %d\n", i); | |
if (cuDeviceGetName(&name[0], sizeof(name), device) == CUDA_SUCCESS) { | |
printf(" Name: %s\n", &name[0]); | |
} | |
if ((cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device) == CUDA_SUCCESS) && | |
(cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device) == CUDA_SUCCESS)) { | |
printf(" Compute Capability: %d.%d\n", cc_major, cc_minor); | |
} | |
else { | |
cc_major = cc_minor = 0; | |
} | |
if (cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS) { | |
printf(" Multiprocessors: %d\n", cores); | |
if (cc_major && cc_minor) { | |
cuda_cores = cores * ConvertSMVer2Cores(cc_major, cc_minor); | |
if (cuda_cores > 0) { | |
printf(" CUDA Cores: %d\n", cuda_cores); | |
} | |
else { | |
printf(" CUDA Cores: unknown\n"); | |
} | |
} | |
if (cuDeviceGetAttribute(&threads_per_core, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS) { | |
printf(" Concurrent threads: %d\n", cores*threads_per_core); | |
} | |
} | |
if (cuDeviceGetAttribute(&clockrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS) { | |
printf(" GPU clock: %g MHz\n", clockrate/1000.); | |
} | |
if (cuDeviceGetAttribute(&clockrate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS) { | |
printf(" Memory clock: %g MHz\n", clockrate/1000.); | |
} | |
cuCtxCreate(&context, 0, device); | |
result = cuMemGetInfo(&freeMem, &totalMem); | |
if (result == CUDA_SUCCESS ) { | |
printf(" Total Memory: %ld MiB\n Free Memory: %ld MiB\n", totalMem / ( 1024 * 1024 ), freeMem / ( 1024 * 1024 )); | |
} else { | |
printf(" cMemGetInfo failed with error code %d: %s\n", result, cudaGetErrorString(result)); | |
} | |
cuCtxDestroy(context); | |
} | |
return 0; | |
} |
To compile use:
nvcc -o cuda_check cuda_check.c -lcuda
To fix the deprecation warning, just make the following changes:
Where you read
if (cuDeviceComputeCapability(&cc_major, &cc_minor, device) == CUDA_SUCCESS) {
Replace it with:
if ((cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device) == CUDA_SUCCESS) &&
(cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device) == CUDA_SUCCESS)) {
And where you read
cuCtxDetach(context);
Replace it with:
cuCtxDestroy(context);
Here
➤ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jun_13_19:16:58_PDT_2023
Cuda compilation tools, release 12.2, V12.2.91
Build cuda_12.2.r12.2/compiler.32965470_0
Thanks @igormorgado, I've updated the gist accordingly (and also added some missing architectures to ConvertSMVer2Cores).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@apivovarov: Thanks, nice catch! `cuInit` is from the driver API and returns a `CUresult`, while `cudaGetErrorString` is from the runtime API and expects a `cudaError`, so the code is mixing the two.
In any case, there are two ways to compile this code:
1. Save it as `cuda_check.c` and compile it with `nvcc -o cuda_check cuda_check.c -lcuda`. This gives some deprecation warning on `cuDeviceComputeCapability` (as also seen by @zhmlcg), but still works.
2. Save it as `cuda_check.cu` and compile it with `nvcc -o cuda_check cuda_check.cu`. This one does not work.
I guess the code should be fixed to consistently use only the driver API or only the runtime API, but it still works (and is backwards-compatible down to CUDA 3 or so).