Skip to content

Instantly share code, notes, and snippets.

@harisankarh
Created April 13, 2021 05:11
Show Gist options
  • Save harisankarh/81c6f16cfea6caacee7adc4ce66fbc0d to your computer and use it in GitHub Desktop.
Save harisankarh/81c6f16cfea6caacee7adc4ce66fbc0d to your computer and use it in GitHub Desktop.
Python program to check the status of the GPUs on a server
"""Check the status of the GPUs visible to this process.

For each CUDA device, prints total / reserved / allocated memory and
whether a small tensor can actually be moved onto it.
"""
import os

import torch


def check_gpus() -> int:
    """Report CUDA availability and per-device memory usage.

    Prints CUDA_VISIBLE_DEVICES (when set), then for every CUDA device:
    total, reserved, allocated and free-inside-reserved memory, and the
    result of a small trial allocation on that device.

    Returns:
        The number of CUDA devices found (0 when CUDA is unavailable).
    """
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
    if not torch.cuda.is_available():
        print('no gpu available')
        # Stop here: the original fell through, but no device queries make
        # sense without CUDA.
        return 0
    total_devices = torch.cuda.device_count()
    print(f'{total_devices} gpus available')
    for d in range(total_devices):
        print('=' * 10)
        print(f'gpu {d}')
        # Bug fix: query the current device `d`, not hard-coded device 0.
        t = torch.cuda.get_device_properties(d).total_memory
        r = torch.cuda.memory_reserved(d)
        a = torch.cuda.memory_allocated(d)
        f = r - a  # free inside reserved
        print(f'total: {t} reserved: {r} allocated: {a} free: {f}')
        # Bug fix: test the current device, not always "cuda:0".
        device = torch.device(f'cuda:{d}')
        probe = torch.zeros(4, 3)
        try:
            probe = probe.to(device)
            print(f'successfully allocated memory to {d}')
        except RuntimeError:
            # CUDA allocation failures surface as RuntimeError; the original
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            print(f'unable to allocate memory to gpu {d}')
    print('NOTE: the numbering of gpus within this program need not be consistent with global number of gpus used in CUDA_VISIBLE_DEVICES')
    return total_devices


if __name__ == '__main__':
    check_gpus()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment