Skip to content

Instantly share code, notes, and snippets.

@danielhavir
Last active February 23, 2022 11:08
Show Gist options
  • Save danielhavir/f4eca2c34c22a42d82f3020bae103c02 to your computer and use it in GitHub Desktop.
Save danielhavir/f4eca2c34c22a42d82f3020bae103c02 to your computer and use it in GitHub Desktop.
Monitor CPU/RAM usage and GPU usage, memory and temperature in Visdom during training
from visdom import Visdom
import numpy as np
from time import sleep
import logging
from threading import Event
# psutil supplies the CPU and RAM readings used below. If it is missing,
# log the install hint and let the ImportError propagate.
try:
    import psutil
except ImportError:
    logging.error('You must "pip install psutil"')
    raise

# Show INFO-level progress messages from the monitor.
logging.basicConfig(level=logging.INFO)

# Stop flag checked by the monitoring loop; the loop ends once it is set.
EVENT = Event()
def cpu_monitor(interval=2) -> None:
    """Plot CPU and RAM utilisation to a Visdom window until ``EVENT`` is set.

    Two traces, named "cpu" and "ram", are drawn in a single window titled
    "CPU/RAM Usage". The x axis is the sample counter, starting at 0, and
    the y values are the percentages reported by ``psutil.cpu_percent`` and
    ``psutil.virtual_memory().percent``.

    Args:
        interval: Seconds to wait between two consecutive samples.
    """
    logging.info("Starting monitoring CPU and RAM with {}s interval".format(interval))
    visdom = Visdom()
    logging.info("Initializing")

    window_opts = {"title": "CPU/RAM Usage", "width": 600, "height": 500}
    step = 0

    def plot(value, trace, win=None):
        """Create the window (``win`` is None) or append one point to it."""
        extra = {} if win is None else {"win": win, "update": "append"}
        # Each call gets its own copy of the options, just as the separate
        # dict literals in the original calls did.
        return visdom.line(np.array([value]), X=np.array([step]), name=trace,
                           opts=dict(window_opts), **extra)

    # A non-blocking cpu_percent() call has no previous reading to compare
    # against the first time and yields a meaningless 0.0, so the initial
    # sample is measured over a short blocking window instead.
    usage = plot(psutil.cpu_percent(interval=0.1), "cpu")
    # Give the RAM trace a point at step 0 too so both traces line up.
    usage = plot(psutil.virtual_memory().percent, "ram", win=usage)

    while not EVENT.is_set():
        step += 1
        usage = plot(psutil.cpu_percent(), "cpu", win=usage)
        usage = plot(psutil.virtual_memory().percent, "ram", win=usage)
        # Waiting on the event (rather than sleeping) lets the loop stop
        # as soon as EVENT is set instead of after a full interval.
        EVENT.wait(interval)
    logging.info("Shutting down")
if __name__ == '__main__':
    import sys

    # Optional first command-line argument: sampling interval as an integer
    # number of seconds; defaults to 2 when absent.
    if len(sys.argv) >= 2:
        interval = int(sys.argv[1])
    else:
        interval = 2

    try:
        cpu_monitor(interval)
    except KeyboardInterrupt:
        # Emit a newline before the final log line (presumably to step past
        # the terminal's "^C" echo).
        print()
        logging.info("Shutting down")
from visdom import Visdom
import numpy as np
from time import sleep
import logging
from threading import Event
# pynvml supplies the NVML bindings used for every GPU query below. If it
# is missing, log the install hint and let the ImportError propagate.
try:
    import pynvml
except ImportError:
    logging.error('You must "pip install nvidia-ml-py3" for GPU monitoring')
    raise

# Show INFO-level progress messages from the monitor.
logging.basicConfig(level=logging.INFO)

# Stop flag checked by the monitoring loop; the loop ends once it is set.
EVENT = Event()
def _device_label(index, handle):
    """Return the trace name ``"<index>: <device name>"`` for one GPU handle."""
    name = pynvml.nvmlDeviceGetName(handle)
    # NOTE(review): older bindings return bytes here, newer ones are reported
    # to return str; accept both rather than calling .decode() unconditionally.
    if isinstance(name, bytes):
        name = name.decode("utf-8")
    return "{}: {}".format(index, name)


def _plot_sample(visdom, windows, step, devices):
    """Plot one reading per GPU into the usage, memory and temperature windows.

    ``windows`` maps a window title to its Visdom window id. A title that is
    not in the mapping yet gets its window created by the first trace plotted
    into it; afterwards points are appended.
    """
    x = np.array([step])
    for label, handle in devices:
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        readings = (
            ("GPU Usage", pynvml.nvmlDeviceGetUtilizationRates(handle).gpu),
            ("GPU Memory", memory_info.used * 100.0 / memory_info.total),
            ("GPU Temperature (in °C)",
             pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)),
        )
        for title, value in readings:
            opts = {"title": title, "width": 600, "height": 500}
            if title in windows:
                windows[title] = visdom.line(np.array([value]), X=x, win=windows[title],
                                             update="append", name=label, opts=opts)
            else:
                windows[title] = visdom.line(np.array([value]), X=x, name=label, opts=opts)


def gpu_monitor(interval=2) -> None:
    """Plot utilisation, memory use and temperature of every GPU to Visdom.

    Three windows are maintained ("GPU Usage", "GPU Memory" and
    "GPU Temperature (in °C)"), each holding one trace per device named
    ``"<index>: <device name>"``. Memory is plotted as used/total in percent.
    Sampling repeats until ``EVENT`` is set, after which NVML is shut down.
    If NVML reports no device, an error is logged and the function returns.

    Args:
        interval: Seconds to wait between two consecutive samples.
    """
    logging.info("Starting monitoring GPU(s) with {}s interval".format(interval))
    visdom = Visdom()
    pynvml.nvmlInit()
    logging.info("Initialized NVML")
    count = pynvml.nvmlDeviceGetCount()
    if count < 1:
        logging.error("No GPU found")
        pynvml.nvmlShutdown()
        return

    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
    # Labels are computed once and reused for every sample. Using one format
    # everywhere keeps the first point of GPU 0 in the same trace as its
    # later points (the original used "0:name" first and "0: name" after).
    devices = [(_device_label(i, handle), handle) for i, handle in enumerate(handles)]

    windows = {}
    step = 0
    # Initial sample at x=0 for every GPU; this also creates the windows.
    _plot_sample(visdom, windows, step, devices)
    while not EVENT.is_set():
        step += 1
        _plot_sample(visdom, windows, step, devices)
        # Waiting on the event (rather than sleeping) lets the loop stop
        # as soon as EVENT is set instead of after a full interval.
        EVENT.wait(interval)
    logging.info("Shutting down NVML")
    pynvml.nvmlShutdown()
if __name__ == '__main__':
    import sys

    # Optional first command-line argument: sampling interval as an integer
    # number of seconds; defaults to 2 when absent.
    interval = 2 if len(sys.argv) < 2 else int(sys.argv[1])
    try:
        gpu_monitor(interval)
    except KeyboardInterrupt:
        print()
        logging.info("Shutting down NVML")
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError:
            # Ctrl-C can arrive before gpu_monitor has called nvmlInit()
            # (e.g. while Visdom is still connecting); there is then nothing
            # to shut down, and the resulting NVML error should not turn a
            # clean exit into a traceback.
            pass
# common
numpy
visdom
# cpu_monitor only
psutil
# gpu_monitor only
nvidia-ml-py3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment