Skip to content

Instantly share code, notes, and snippets.

@SolomidHero
Last active May 9, 2022 19:38
Show Gist options
  • Save SolomidHero/5c31b076ad73dde20f342a8c7f952a10 to your computer and use it in GitHub Desktop.
Save SolomidHero/5c31b076ad73dde20f342a8c7f952a10 to your computer and use it in GitHub Desktop.
Logging to tensorboard
import subprocess
import logging
import datetime
from pathlib import PosixPath
import time
from torch.utils.tensorboard import SummaryWriter
# Interval added to the next-check timestamp after every scan.
check_frequency = datetime.timedelta(seconds=5 * 60)
# Root S3 location handed to capture_nodes (placeholder value).
aws_url = "s3://<bucket>/<path>"
# Prefix of the scalar tags written to TensorBoard.
log_key = "Automatic log"
def aws_ls(url):
    """List the entries directly under an S3 location using ``aws s3 ls``.

    Args:
        url: S3 location as a string or a path object. Path objects collapse
            ``s3://`` into ``s3:/``, so the scheme is restored before the
            command is run; an already well-formed ``s3://`` string is left
            untouched.

    Yields:
        ``(date, name)`` tuples. ``date`` is ``None`` for rows whose first
        field is ``PRE`` and a ``datetime.datetime`` built from the first two
        fields otherwise. ``name`` is the whole remainder of the row, so
        names containing spaces are returned intact.
    """
    target = str(url)
    # Only repair the collapsed form; a blind replace would turn a correct
    # 's3://bucket' into 's3:///bucket'.
    if target.startswith('s3:/') and not target.startswith('s3://'):
        target = 's3://' + target[len('s3:/'):]
    if not target.endswith('/'):
        target += '/'

    process = subprocess.Popen(
        ['aws', 's3', 'ls', target],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = process.communicate()
    if stderr:
        logging.warning(stderr)

    for raw_line in stdout.split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue

        head = entry.split(None, 1)
        if head[0] == 'PRE':
            # Everything after the marker is the name, spaces included.
            if len(head) == 2:
                yield (None, head[1])
            continue

        # date, time, size, then the name as a single field (maxsplit keeps
        # embedded spaces in the name).
        fields = entry.split(None, 3)
        if len(fields) < 4:
            logging.warning('Skipping unrecognised listing row: %r', entry)
            continue
        date = datetime.datetime.strptime(
            f'{fields[0]} {fields[1]}', "%Y-%m-%d %H:%M:%S"
        )
        yield (date, fields[3])
def _list_prefixes(url):
    """Yield the names of the rows under ``url`` that aws_ls reports without a date."""
    return (name for date, name in aws_ls(url) if date is None)


def capture_nodes(root_url, tree=None):
    """Walk ``root / os_type / user_id / run_{} / hist_{}.zip`` and count new entries.

    Args:
        root_url: Root S3 location to scan (string or path object).
        tree: Mapping returned by a previous call. It is updated in place so
            that only entries not seen before are counted. A fresh dict is
            created when ``None``.

    Returns:
        ``(change_num, tree)`` where ``change_num`` is the number of entries
        added to ``tree`` by this call and ``tree`` maps
        ``"os_type/user_id/run"`` to the set of ``(date, name)`` rows listed
        under that run.
    """
    root_url = PosixPath(root_url)
    if tree is None:
        tree = {}

    change_num = 0
    # Only dateless (prefix) rows are descended into: listing "<object>/"
    # never returns anything, so skipping objects saves a CLI call each.
    for os_type in _list_prefixes(root_url):
        for user_id in _list_prefixes(root_url / os_type):
            for run in _list_prefixes(root_url / os_type / user_id):
                # The run is part of the key so that equally named files
                # with equal timestamps in different runs stay distinct.
                key = f"{os_type}/{user_id}/{run}"
                seen = tree.setdefault(key, set())
                size_before = len(seen)
                seen.update(aws_ls(root_url / os_type / user_id / run))
                # Count by set growth for new and known keys alike.
                change_num += len(seen) - size_before
    return change_num, tree
if __name__ == "__main__":
    # Baseline scan: everything already in the bucket counts as the
    # starting total, with a delta of zero at step 0.
    total_nodes, tree = capture_nodes(aws_url)
    next_check = datetime.datetime.now() + check_frequency

    scalar_logger = SummaryWriter('autologs_monitoring/')
    try:
        i = 0
        scalar_logger.add_scalar(f'{log_key} pkgs', total_nodes, i)
        scalar_logger.add_scalar(f'{log_key} delta', 0, i)
        print(f'Starting: {total_nodes} nodes. Next check {next_check}')

        while True:
            # Sleep until the scheduled time; if the previous scan overran
            # the interval, time_left is not positive and we scan at once.
            time_left = (next_check - datetime.datetime.now()).total_seconds()
            if time_left > 0:
                time.sleep(time_left)

            i += 1
            nodes_delta, tree = capture_nodes(aws_url, tree=tree)
            total_nodes += nodes_delta
            # Advance from the schedule, not from "now", so the cadence
            # does not drift by the duration of each scan.
            next_check += check_frequency

            scalar_logger.add_scalar(f'{log_key} pkgs', total_nodes, i)
            scalar_logger.add_scalar(f'{log_key} delta', nodes_delta, i)
            print(f'{datetime.datetime.now()} Nodes: {total_nodes} (+{nodes_delta}). Next check {next_check}')
    finally:
        # The loop only ends via an exception (e.g. Ctrl-C); close the
        # writer so pending events are written out before exiting.
        scalar_logger.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment