Last active
February 23, 2022 11:08
-
-
Save danielhavir/f4eca2c34c22a42d82f3020bae103c02 to your computer and use it in GitHub Desktop.
Monitor CPU/RAM usage and GPU usage, memory and temperature in Visdom during training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import logging
from threading import Event
from time import sleep

# Third-party
import numpy as np
from visdom import Visdom

# psutil supplies the CPU and RAM readings used by cpu_monitor(); if it is
# missing, log an install hint and let the ImportError propagate.
try:
    import psutil
except ImportError:
    logging.error('You must "pip install psutil"')
    raise

logging.basicConfig(level=logging.INFO)

# Stop flag polled by cpu_monitor(): once it is set, the sampling loop exits.
EVENT = Event()
def cpu_monitor(interval=2) -> None:
    """Plot CPU and RAM utilisation to a Visdom window until ``EVENT`` is set.

    One window titled "CPU/RAM Usage" is created with a "cpu" trace. On every
    tick one sample is appended to the "cpu" trace and one to a "ram" trace.
    The x-axis is the tick counter, not wall-clock time.

    Args:
        interval: Seconds to wait between samples. Must be non-negative.

    Raises:
        ValueError: If ``interval`` is negative.
    """
    if interval < 0:
        # Event.wait() does not block at all for a negative timeout, which
        # would turn the loop below into a busy spin; reject it up front.
        raise ValueError("sleep length must be non-negative")

    logging.info("Starting monitoring CPU and RAM with %ss interval", interval)
    visdom = Visdom()
    logging.info("Initializing")

    # Every call gets its own copy of the options, in case visdom modifies
    # the dict it is handed.
    window_opts = {"title": "CPU/RAM Usage", "width": 600, "height": 500}

    tick = 0
    # NOTE(review): psutil's documentation says the first non-blocking
    # cpu_percent() call may return a meaningless 0.0, so the sample at x=0 is
    # probably a placeholder -- confirm before relying on it.
    usage = visdom.line(np.array([psutil.cpu_percent()]), X=np.array([tick]), name="cpu",
                        opts=dict(window_opts))
    while not EVENT.is_set():
        tick += 1
        usage = visdom.line(np.array([psutil.cpu_percent()]), X=np.array([tick]), name="cpu",
                            win=usage, update="append", opts=dict(window_opts))
        usage = visdom.line(np.array([psutil.virtual_memory().percent]), X=np.array([tick]),
                            name="ram", win=usage, update="append", opts=dict(window_opts))
        # Wait on the event instead of sleeping so that setting EVENT ends the
        # loop right away rather than after the rest of the interval.
        EVENT.wait(interval)
    logging.info("Shutting down")
if __name__ == '__main__':
    import sys

    # Optional first command-line argument: sampling interval in seconds.
    cli_args = sys.argv[1:]
    interval = int(cli_args[0]) if cli_args else 2

    try:
        cpu_monitor(interval)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the script: emit a newline, then
        # log the shutdown message.
        print()
        logging.info("Shutting down")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import logging
from threading import Event
from time import sleep

# Third-party
import numpy as np
from visdom import Visdom

# pynvml is the NVML binding every GPU reading in gpu_monitor() goes through;
# if it is missing, log an install hint and let the ImportError propagate.
try:
    import pynvml
except ImportError:
    logging.error('You must "pip install nvidia-ml-py3" for GPU monitoring')
    raise

logging.basicConfig(level=logging.INFO)

# Stop flag polled by gpu_monitor(): once it is set, the sampling loop exits.
EVENT = Event()
def _gpu_label(index, handle):
    """Return the trace label ``"<index>: <device name>"`` for one GPU."""
    raw_name = pynvml.nvmlDeviceGetName(handle)
    # NOTE(review): some NVML bindings appear to return bytes and others str;
    # accept both instead of decoding unconditionally.
    if isinstance(raw_name, bytes):
        raw_name = raw_name.decode("utf-8")
    return "{}: {}".format(index, raw_name)


def _gpu_opts(title):
    """Build a fresh Visdom opts dict for a 600x500 window titled ``title``."""
    return {"title": title, "width": 600, "height": 500}


def _read_gpu(handle):
    """Return (utilisation rate, memory used in percent, temperature) for a GPU."""
    utilisation = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    memory_percent = memory_info.used * 100.0 / memory_info.total
    temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    return utilisation, memory_percent, temperature


def _stream_gpu_stats(visdom, count, interval):
    """Create the three GPU windows and append samples until ``EVENT`` is set."""
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
    # Device names are looked up once here rather than on every tick.
    labels = [_gpu_label(i, handle) for i, handle in enumerate(handles)]

    tick = 0
    # The windows are created from GPU 0's first reading. It is plotted under
    # the same label the appends use, so GPU 0 has one trace per window.
    utilisation, memory_percent, temperature = _read_gpu(handles[0])
    usage = visdom.line(np.array([utilisation]), X=np.array([tick]), name=labels[0],
                        opts=_gpu_opts("GPU Usage"))
    memory = visdom.line(np.array([memory_percent]), X=np.array([tick]), name=labels[0],
                         opts=_gpu_opts("GPU Memory"))
    temp = visdom.line(np.array([temperature]), X=np.array([tick]), name=labels[0],
                       opts=_gpu_opts("GPU Temperature (in °C)"))

    while not EVENT.is_set():
        tick += 1
        for handle, label in zip(handles, labels):
            utilisation, memory_percent, temperature = _read_gpu(handle)
            usage = visdom.line(np.array([utilisation]), X=np.array([tick]), win=usage,
                                update="append", name=label, opts=_gpu_opts("GPU Usage"))
            memory = visdom.line(np.array([memory_percent]), X=np.array([tick]), win=memory,
                                 update="append", name=label, opts=_gpu_opts("GPU Memory"))
            temp = visdom.line(np.array([temperature]), X=np.array([tick]), win=temp,
                               update="append", name=label,
                               opts=_gpu_opts("GPU Temperature (in °C)"))
        # Wait on the event instead of sleeping so that setting EVENT ends the
        # loop right away rather than after the rest of the interval.
        EVENT.wait(interval)


def gpu_monitor(interval=2) -> None:
    """Plot usage, memory use and temperature of every GPU to Visdom.

    Three windows ("GPU Usage", "GPU Memory", "GPU Temperature (in °C)") are
    created from GPU 0's first reading. Afterwards one sample per GPU is
    appended to each window on every tick until ``EVENT`` is set. The x-axis
    is the tick counter, not wall-clock time. If NVML reports no devices, an
    error is logged and the function returns after shutting NVML down.

    Args:
        interval: Seconds to wait between samples. Must be non-negative.

    Raises:
        ValueError: If ``interval`` is negative.
    """
    if interval < 0:
        # Event.wait() does not block at all for a negative timeout, which
        # would turn the sampling loop into a busy spin; reject it up front.
        raise ValueError("sleep length must be non-negative")

    logging.info("Starting monitoring GPU(s) with %ss interval", interval)
    visdom = Visdom()
    pynvml.nvmlInit()
    logging.info("Initialized NVML")
    try:
        count = pynvml.nvmlDeviceGetCount()
        if count < 1:
            logging.error("No GPU found")
        else:
            _stream_gpu_stats(visdom, count, interval)
            logging.info("Shutting down NVML")
    except Exception:
        # Do not leave NVML initialised when sampling fails. KeyboardInterrupt
        # is a BaseException and deliberately passes through: the script's
        # __main__ handler performs the shutdown for Ctrl-C itself.
        pynvml.nvmlShutdown()
        raise
    pynvml.nvmlShutdown()
if __name__ == '__main__':
    import sys

    # Optional first command-line argument: sampling interval in seconds.
    interval = 2 if len(sys.argv) < 2 else int(sys.argv[1])
    try:
        gpu_monitor(interval)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the script: emit a newline, then
        # log the shutdown message.
        print()
        logging.info("Shutting down NVML")
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError as err:
            # The interrupt can arrive before nvmlInit() has run (for example
            # while Visdom is still connecting) or after the monitor already
            # shut NVML down; a failed shutdown must not become a traceback.
            logging.warning("NVML shutdown skipped: %s", err)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# common
numpy
visdom
# cpu_monitor only
psutil
# gpu_monitor only
nvidia-ml-py3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment