#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Outputs some information on CUDA-enabled devices on your computer,
including current memory usage.

It's a port of https://gist.github.com/f0k/0d6431e3faa60bffc788f8b4daa029b1
from C to Python with ctypes, so it can run without compiling anything. Note
that this is a direct translation with no attempt to make the code Pythonic.
It's meant as a general demonstration on how to obtain CUDA device information
from Python without resorting to nvidia-smi or a compiled Python extension.

Author: Jan Schlüter
License: MIT (https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#gistcomment-3870498)
"""
import sys
import ctypes

# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36


def ConvertSMVer2Cores(major, minor):
    # Returns the number of CUDA cores per multiprocessor for a given
    # Compute Capability version. There is no way to retrieve that via
    # the API, so it needs to be hard-coded.
    # See _ConvertSMVer2Cores in helper_cuda.h in NVIDIA's CUDA Samples.
    return {(1, 0): 8,    # Tesla
            (1, 1): 8,
            (1, 2): 8,
            (1, 3): 8,
            (2, 0): 32,   # Fermi
            (2, 1): 48,
            (3, 0): 192,  # Kepler
            (3, 2): 192,
            (3, 5): 192,
            (3, 7): 192,
            (5, 0): 128,  # Maxwell
            (5, 2): 128,
            (5, 3): 128,
            (6, 0): 64,   # Pascal
            (6, 1): 128,
            (6, 2): 128,
            (7, 0): 64,   # Volta
            (7, 2): 64,
            (7, 5): 64,   # Turing
            (8, 0): 64,   # Ampere
            (8, 6): 128,
            (8, 7): 128,
            (8, 9): 128,  # Ada
            (9, 0): 128,  # Hopper
            }.get((major, minor), 0)


def main():
    libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll', 'cuda.dll')
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break
    else:
        raise OSError("could not load any of: " + ' '.join(libnames))

    nGpus = ctypes.c_int()
    name = b' ' * 100
    cc_major = ctypes.c_int()
    cc_minor = ctypes.c_int()
    cores = ctypes.c_int()
    threads_per_core = ctypes.c_int()
    clockrate = ctypes.c_int()
    freeMem = ctypes.c_size_t()
    totalMem = ctypes.c_size_t()

    result = ctypes.c_int()
    device = ctypes.c_int()
    context = ctypes.c_void_p()
    error_str = ctypes.c_char_p()

    result = cuda.cuInit(0)
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuInit failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    print("Found %d device(s)." % nGpus.value)
    for i in range(nGpus.value):
        result = cuda.cuDeviceGet(ctypes.byref(device), i)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode()))
            return 1
        print("Device: %d" % i)
        if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
            print("  Name: %s" % (name.split(b'\0', 1)[0].decode()))
        if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
            print("  Compute Capability: %d.%d" % (cc_major.value, cc_minor.value))
        if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS:
            print("  Multiprocessors: %d" % cores.value)
            print("  CUDA Cores: %s" % (cores.value * ConvertSMVer2Cores(cc_major.value, cc_minor.value) or "unknown"))
            if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS:
                print("  Concurrent threads: %d" % (cores.value * threads_per_core.value))
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS:
            print("  GPU clock: %g MHz" % (clockrate.value / 1000.))
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
            print("  Memory clock: %g MHz" % (clockrate.value / 1000.))
        try:
            result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
        except AttributeError:
            result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuCtxCreate failed with error code %d: %s" % (result, error_str.value.decode()))
        else:
            try:
                result = cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem))
            except AttributeError:
                result = cuda.cuMemGetInfo(ctypes.byref(freeMem), ctypes.byref(totalMem))
            if result == CUDA_SUCCESS:
                print("  Total Memory: %ld MiB" % (totalMem.value / 1024**2))
                print("  Free Memory: %ld MiB" % (freeMem.value / 1024**2))
            else:
                cuda.cuGetErrorString(result, ctypes.byref(error_str))
                print("cuMemGetInfo failed with error code %d: %s" % (result, error_str.value.decode()))
            cuda.cuCtxDetach(context)
    return 0


if __name__ == "__main__":
    sys.exit(main())
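As an aside, the same ctypes pattern extends to any other CUDA driver API entry point. A minimal standalone sketch that only queries the driver version (cuDriverGetVersion is part of the driver API and encodes the version as 1000 * major + 10 * minor):

import ctypes

# Load the CUDA driver library (adjust the name for your platform).
cuda = ctypes.CDLL('libcuda.so')
version = ctypes.c_int()
if cuda.cuInit(0) == 0 and cuda.cuDriverGetVersion(ctypes.byref(version)) == 0:
    print('CUDA driver version: %d.%d'
          % (version.value // 1000, (version.value % 1000) // 10))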
Here is CUDA programming code that compares the GPU vs. the CPU (parallel vector addition):
https://debuggingsolution.blogspot.com/2021/09/vector-addition-cuda-parallel.html
Thanks @f0k for the excellent snippet! Here is an importable version which can be run inside other scripts as get_cuda_device_specs(). It returns a list of specification dicts per CUDA device:
[
{
"name": "NVIDIA GeForce RTX 3080 Laptop GPU",
"compute_capability": [
8,
6
],
"architecture": "ampere",
"cores": 48,
"cuda_cores": 3072,
"concurrent_threads": 73728,
"gpu_clock_mhz": 1245.0,
"mem_clock_mhz": 6001.0,
"total_mem_mb": 16125.3125,
"free_mem_mb": 15733.25
}
]
I also made some minor cosmetic updates:

- Refactor str.format() to f-strings for readability
- Refactor camel case to snake case (for PEP linting)
- Move the semantic versioning map to a constant named dict
- Add another mapping to the architecture key name
- Switch sys.exit codes to RuntimeError and warnings.warn where appropriate
import ctypes
import json
from typing import Any, Dict, List
from warnings import warn
# TODO define decorator to share the RuntimeError/CUDA_SUCCESS logic among different library functions
# One of the following libraries must be available to load
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise ImportError(f'Could not load any of: {", ".join(libnames)}')
# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
(1, 0): 8, # Tesla
(1, 1): 8,
(1, 2): 8,
(1, 3): 8,
(2, 0): 32, # Fermi
(2, 1): 48,
(3, 0): 192, # Kepler
(3, 2): 192,
(3, 5): 192,
(3, 7): 192,
(5, 0): 128, # Maxwell
(5, 2): 128,
(5, 3): 128,
(6, 0): 64, # Pascal
(6, 1): 128,
(6, 2): 128,
(7, 0): 64, # Volta
(7, 2): 64,
(7, 5): 64, # Turing
(8, 0): 64, # Ampere
(8, 6): 64,
}
SEMVER_TO_ARCH = {
(1, 0): 'tesla',
(1, 1): 'tesla',
(1, 2): 'tesla',
(1, 3): 'tesla',
(2, 0): 'fermi',
(2, 1): 'fermi',
(3, 0): 'kepler',
(3, 2): 'kepler',
(3, 5): 'kepler',
(3, 7): 'kepler',
(5, 0): 'maxwell',
(5, 2): 'maxwell',
(5, 3): 'maxwell',
(6, 0): 'pascal',
(6, 1): 'pascal',
(6, 2): 'pascal',
(7, 0): 'volta',
(7, 2): 'volta',
(7, 5): 'turing',
(8, 0): 'ampere',
(8, 6): 'ampere',
}
def get_cuda_device_specs() -> List[Dict[str, Any]]:
"""Generate spec for each GPU device with format
{
'name': str,
'compute_capability': (major: int, minor: int),
'cores': int,
'cuda_cores': int,
'concurrent_threads': int,
'gpu_clock_mhz': float,
'mem_clock_mhz': float,
'total_mem_mb': float,
'free_mem_mb': float
}
"""
# Type-binding definitions for ctypes
num_gpus = ctypes.c_int()
name = b' ' * 100
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
cores = ctypes.c_int()
threads_per_core = ctypes.c_int()
clockrate = ctypes.c_int()
free_mem = ctypes.c_size_t()
total_mem = ctypes.c_size_t()
result = ctypes.c_int()
device = ctypes.c_int()
context = ctypes.c_void_p()
error_str = ctypes.c_char_p()
# Check expected initialization
result = cuda.cuInit(0)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuInit failed with error code {result}: {error_str.value.decode()}')
result = cuda.cuDeviceGetCount(ctypes.byref(num_gpus))
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuDeviceGetCount failed with error code {result}: {error_str.value.decode()}')
# Iterate through available devices
device_specs = []
for i in range(num_gpus.value):
spec = {}
result = cuda.cuDeviceGet(ctypes.byref(device), i)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuDeviceGet failed with error code {result}: {error_str.value.decode()}')
# Parse specs for each device
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
spec.update(name=name.split(b'\0', 1)[0].decode())
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
spec.update(compute_capability=(cc_major.value, cc_minor.value))
spec.update(architecture=SEMVER_TO_ARCH.get((cc_major.value, cc_minor.value), 'unknown'))
if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS:
spec.update(
cores=cores.value,
cuda_cores=cores.value * SEMVER_TO_CORES.get((cc_major.value, cc_minor.value), 'unknown'))
if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS:
spec.update(concurrent_threads=cores.value * threads_per_core.value)
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS:
spec.update(gpu_clock_mhz=clockrate.value / 1000.)
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
spec.update(mem_clock_mhz=clockrate.value / 1000.)
# Attempt to determine available vs. free memory
try:
result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
except AttributeError:
result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(f'cuCtxCreate failed with error code {result}: {error_str.value.decode()}')
else:
try:
result = cuda.cuMemGetInfo_v2(ctypes.byref(free_mem), ctypes.byref(total_mem))
except AttributeError:
result = cuda.cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
if result == CUDA_SUCCESS:
spec.update(
total_mem_mb=total_mem.value / 1024**2,
free_mem_mb=free_mem.value / 1024**2)
else:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(f'cuMemGetInfo failed with error code {result}: {error_str.value.decode()}')
cuda.cuCtxDetach(context)
device_specs.append(spec)
return device_specs
if __name__ == '__main__':
print(json.dumps(get_cuda_device_specs(), indent=2))
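For example (a minimal sketch, assuming you saved the module above under the hypothetical filename cuda_specs.py):

from cuda_specs import get_cuda_device_specs

# Each spec is a plain dict, so it is easy to filter, log, or serialize.
for spec in get_cuda_device_specs():
    print(f"{spec['name']}: {spec.get('free_mem_mb', 0):.0f} MiB free")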
Thank you for this script! It helped me debug an issue with getting CUDA working in Windows 10 with Ubuntu WSL.
See this: bitsandbytes-foundation/bitsandbytes#337. Thanks again @f0k!!!
Thanks! Further refactoring with decorators.
import ctypes
import json
from functools import wraps
from typing import Any, Dict, List
from warnings import warn
# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
(1, 0): 8, # Tesla
(1, 1): 8,
(1, 2): 8,
(1, 3): 8,
(2, 0): 32, # Fermi
(2, 1): 48,
(3, 0): 192, # Kepler
(3, 2): 192,
(3, 5): 192,
(3, 7): 192,
(5, 0): 128, # Maxwell
(5, 2): 128,
(5, 3): 128,
(6, 0): 64, # Pascal
(6, 1): 128,
(6, 2): 128,
(7, 0): 64, # Volta
(7, 2): 64,
(7, 5): 64, # Turing
(8, 0): 64, # Ampere
(8, 6): 64,
}
SEMVER_TO_ARCH = {
(1, 0): "tesla",
(1, 1): "tesla",
(1, 2): "tesla",
(1, 3): "tesla",
(2, 0): "fermi",
(2, 1): "fermi",
(3, 0): "kepler",
(3, 2): "kepler",
(3, 5): "kepler",
(3, 7): "kepler",
(5, 0): "maxwell",
(5, 2): "maxwell",
(5, 3): "maxwell",
(6, 0): "pascal",
(6, 1): "pascal",
(6, 2): "pascal",
(7, 0): "volta",
(7, 2): "volta",
(7, 5): "turing",
(8, 0): "ampere",
(8, 6): "ampere",
}
# Decorator for CUDA API calls
def cuda_api_call(func):
"""
Decorator to wrap CUDA API calls and check their results.
Raises RuntimeError if the CUDA call does not return CUDA_SUCCESS.
"""
@wraps(func)
def wrapper(*args, **kwargs):
result = func(*args, **kwargs)
if result != CUDA_SUCCESS:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(
f"{func.__name__} failed with error code {result}: {error_str.value.decode()}"
)
return result
return wrapper
def cuda_api_call_warn(func):
"""
Decorator to wrap CUDA API calls and check their results.
Prints a warning message if the CUDA call does not return CUDA_SUCCESS.
"""
@wraps(func)
def wrapper(*args, **kwargs):
result = func(*args, **kwargs)
if result != CUDA_SUCCESS:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(
f"Warning: {func.__name__} failed with error code {result}: {error_str.value.decode()}"
)
return result
return wrapper
# Attempt to load the CUDA library
libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise ImportError(f'Could not load any of: {", ".join(libnames)}')
# CUDA API calls wrapped with the decorator
@cuda_api_call
def cuInit(flags):
return cuda.cuInit(flags)
@cuda_api_call
def cuDeviceGetCount(count):
return cuda.cuDeviceGetCount(count)
@cuda_api_call
def cuDeviceGet(device, ordinal):
return cuda.cuDeviceGet(device, ordinal)
@cuda_api_call
def cuDeviceGetName(name, len, dev):
return cuda.cuDeviceGetName(name, len, dev)
@cuda_api_call
def cuDeviceComputeCapability(major, minor, dev):
return cuda.cuDeviceComputeCapability(major, minor, dev)
@cuda_api_call
def cuDeviceGetAttribute(pi, attrib, dev):
return cuda.cuDeviceGetAttribute(pi, attrib, dev)
@cuda_api_call_warn
def cuCtxCreate(pctx, flags, dev):
try:
result = cuda.cuCtxCreate_v2(pctx, flags, dev)
except AttributeError:
result = cuda.cuCtxCreate(pctx, flags, dev)
return result
@cuda_api_call_warn
def cuMemGetInfo(free, total):
try:
result = cuda.cuMemGetInfo_v2(free, total)
except AttributeError:
result = cuda.cuMemGetInfo(free, total)
return result
@cuda_api_call
def cuCtxDetach(ctx):
return cuda.cuCtxDetach(ctx)
# Main function to get CUDA device specs
def get_cuda_device_specs() -> List[Dict[str, Any]]:
"""Generate spec for each GPU device with format
{
'name': str,
'compute_capability': (major: int, minor: int),
'cores': int,
'cuda_cores': int,
'concurrent_threads': int,
'gpu_clock_mhz': float,
'mem_clock_mhz': float,
'total_mem_mb': float,
'free_mem_mb': float,
        'architecture': str
}
"""
# Initialize CUDA
cuInit(0)
num_gpus = ctypes.c_int()
cuDeviceGetCount(ctypes.byref(num_gpus))
device_specs = []
for i in range(num_gpus.value):
spec = {}
device = ctypes.c_int()
cuDeviceGet(ctypes.byref(device), i)
name = b" " * 100
cuDeviceGetName(ctypes.c_char_p(name), len(name), device)
spec["name"] = name.split(b"\0", 1)[0].decode()
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
cuDeviceComputeCapability(
ctypes.byref(cc_major), ctypes.byref(cc_minor), device
)
compute_capability = (cc_major.value, cc_minor.value)
spec["compute_capability"] = compute_capability
cores = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device
)
spec["cores"] = cores.value
threads_per_core = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(threads_per_core),
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
device,
)
spec["concurrent_threads"] = cores.value * threads_per_core.value
clockrate = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device
)
spec["gpu_clock_mhz"] = clockrate.value / 1000.0
cuDeviceGetAttribute(
ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device
)
spec["mem_clock_mhz"] = clockrate.value / 1000.0
context = ctypes.c_void_p()
if cuCtxCreate(ctypes.byref(context), 0, device) == CUDA_SUCCESS:
free_mem = ctypes.c_size_t()
total_mem = ctypes.c_size_t()
cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
spec["total_mem_mb"] = total_mem.value / 1024**2
spec["free_mem_mb"] = free_mem.value / 1024**2
spec["architecture"] = SEMVER_TO_ARCH.get(compute_capability, "unknown")
spec["cuda_cores"] = cores.value * SEMVER_TO_CORES.get(
compute_capability, "unknown"
)
cuCtxDetach(context)
device_specs.append(spec)
return device_specs
if __name__ == "__main__":
print(json.dumps(get_cuda_device_specs(), indent=2))
Thanks for sharing @addisonklinke and @IanBoyanZhang! Looks good except that it would probably be easier to maintain if the two SEMVER dictionaries were joined into one, and the SEMVER_TO_CORES.get()
should default to 0 instead of "unknown", otherwise you will get a very long string in spec["cuda_cores"] for new architectures :) I will not update the gist as the original is so much shorter, but yours will be handy for people who need to access the information from another script.
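For illustration, a minimal sketch of that suggestion (SEMVER_TO_SPECS is a hypothetical name for the merged dict):

# Hypothetical merged mapping: compute capability -> (architecture, cores per SM).
SEMVER_TO_SPECS = {
    (7, 5): ('turing', 64),
    (8, 0): ('ampere', 64),
    (8, 6): ('ampere', 64),
    # ... remaining entries from the two dicts above
}

# Defaulting to ('unknown', 0) keeps spec['cuda_cores'] an int for new architectures.
arch, cores_per_sm = SEMVER_TO_SPECS.get(compute_capability, ('unknown', 0))
spec['architecture'] = arch
spec['cuda_cores'] = cores.value * cores_per_sm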
https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#file-cuda_check-py-L60
I believe it should look for nvcuda.dll here on Windows. As far as I know, cuda.dll has been deprecated since CUDA 1.1 and was replaced by nvcuda.dll, which has been distributed with the NVIDIA driver since version 169.x.x. I tested it locally and it works. I am not an expert, though.
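A sketch of the corresponding change to the importable versions above (the original gist already uses this search order):

# Try nvcuda.dll (shipped with the NVIDIA driver) before the deprecated cuda.dll.
libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll', 'cuda.dll')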