Skip to content

Instantly share code, notes, and snippets.

@avivajpeyi
Created September 25, 2022 10:48
Show Gist options
  • Save avivajpeyi/d389bbcab05714089ed1dbe78d0b6cdb to your computer and use it in GitHub Desktop.
Save avivajpeyi/d389bbcab05714089ed1dbe78d0b6cdb to your computer and use it in GitHub Desktop.
""" Plots total number of CPU hours used
To create a "jobstats.txt" run something like the following:
> sacct -S 2020-01-01 -E 2021-10-06 -u avajpeyi -X -o "jobname%-40,cputimeraw,start" --parsable2 > jobstats.txt
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from humanize import intword
from matplotlib import ticker
from datetime import timedelta, datetime, date
# Apply the `publication.mplstyle` matplotlib style sheet, loaded from a gist URL.
plt.style.use(
"https://gist.githubusercontent.com/avivajpeyi/4d9839b1ceb7d3651cbb469bc6b0d69b/raw/4ee4a870126653d542572372ff3eee4e89abcab0/publication.mplstyle")
# Set after the style sheet is applied, so the grid is off regardless of what it specifies.
plt.rcParams['axes.grid']= False
# Pipe-delimited job dump read by load_ozstar_cpu_data (see the module docstring for the sacct command).
FNAME = "jobstats.txt"
# Get HTML page from:
# https://accounting.ligo.org/htmls_by_users/avi.vajpeyi.html --> avi.vajpeyi.html
CIT_HTML = "avi.vajpeyi.html"
# Divisor used to turn the CPUTimeRAW column into the 'CPU Hrs' column.
SEC_IN_HR = 60.0 * 60.0
def load_ozstar_cpu_data(fname=FNAME):
    """Load a pipe-delimited job dump into a DataFrame.

    The first line of the file is taken as the header and every following
    line as one record, with fields separated by ``|`` (the layout produced
    by the ``sacct ... --parsable2`` command in the module docstring). The
    file must provide ``CPUTimeRAW`` and ``Start`` columns.

    Args:
        fname: Path of the file to read. Defaults to ``FNAME``.

    Returns:
        pandas.DataFrame: one row per record. All columns from the file are
        kept as strings except ``CPUTimeRAW``, which is cast to float64.
        Two columns are added: ``CPU Hrs`` (``CPUTimeRAW / SEC_IN_HR``) and
        ``date`` (``Start`` parsed with ``%Y-%m-%dT%H:%M:%S``).

    Raises:
        KeyError: if the header lacks ``CPUTimeRAW`` or ``Start``.
        ValueError: if a record has a different number of fields than the
            header, or a ``Start`` value does not match the expected format.
    """
    with open(fname, 'r') as f:
        lines = f.read().split("\n")

    header = lines[0].split("|")
    # Skip blank (or single-character) lines, e.g. the one left by the
    # trailing newline at the end of the file.
    rows = [line.split("|") for line in lines[1:] if len(line) > 1]

    # Build the frame straight from the rows. Going through a transposed
    # numpy array instead fails with IndexError when there are no records.
    data = pd.DataFrame(rows, columns=header)
    data['CPUTimeRAW'] = data['CPUTimeRAW'].astype('float64')
    data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR
    data['date'] = pd.to_datetime(data['Start'], format='%Y-%m-%dT%H:%M:%S')
    return data
def get_total_cpu_hrs(data):
    """Return the sum of the ``CPUTimeRAW`` column of ``data`` divided by ``SEC_IN_HR``."""
    raw_cpu_time = data['CPUTimeRAW'].values
    total_raw = np.sum(raw_cpu_time)
    return total_raw / SEC_IN_HR
def plot_time(data):
    """Save a log-log histogram of per-job CPU hours to ``cpuhrs_hist.png``.

    Only jobs with more than 0.01 CPU hours are drawn, in 100 geometrically
    spaced bins between the smallest and largest remaining value. The title
    shows the total over *all* jobs, as returned by ``get_total_cpu_hrs``.

    Side effect: the ``CPU Hrs`` column of ``data`` is (re)computed in place
    from ``CPUTimeRAW``.
    """
    plt.figure(figsize=(4, 3))

    data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR
    all_hrs = data['CPU Hrs']
    shown_hrs = all_hrs[all_hrs > 0.01]
    lowest, highest = min(shown_hrs), max(shown_hrs)
    log_bins = np.geomspace(lowest, highest, 100)

    plt.hist(shown_hrs, density=False, bins=log_bins)
    plt.xlim(left=lowest)
    plt.yscale('log')
    plt.xscale('log')

    plt.xlabel("CPU Hrs")
    plt.ylabel("Jobs")
    total_label = intword(get_total_cpu_hrs(data), '%.1f')
    plt.title(f"Total: {total_label} Hr")

    plt.tight_layout()
    plt.savefig('cpuhrs_hist.png')
def bin_dates_data(dates, data, delta=5, binstart=None):
    """Sum ``data`` into consecutive fixed-width date bins.

    Bin ``k`` covers the half-open interval
    ``[binstart + k*delta, binstart + (k+1)*delta)``. Every bin between the
    first and the last populated one is present in the output, with a sum of
    0 when nothing falls inside it.

    Args:
        dates: Iterable of datetime-like values (anything that supports
            comparison and ``+ timedelta``), parallel to ``data``.
        data: Iterable of numbers to accumulate, one per entry in ``dates``.
        delta: Bin width in days.
        binstart: Left edge of the first bin. Defaults to the earliest date.
            Entries dated before ``binstart`` are counted in the first bin.

    Returns:
        tuple[list[str], list]: ``str()`` of each bin's left edge, and the
        summed data of each bin, both in chronological order. Two empty
        lists are returned when there is no input and no ``binstart``.
    """
    delta = timedelta(days=delta)

    # Process entries chronologically: the single forward pass below can only
    # move to later bins, so unsorted input would otherwise be miscounted.
    pairs = sorted(zip(dates, data), key=lambda pair: pair[0])

    if binstart is None:
        if not pairs:
            return [], []
        binstart = pairs[0][0]

    bin_key = str(binstart)
    res = {bin_key: 0}

    for cur_date, cur_data in pairs:
        # Open new (empty) bins until the current one contains cur_date.
        # ``>=`` keeps bins half-open: a date exactly on an edge belongs to
        # the bin that starts there, not to the one that ends there.
        while cur_date >= binstart + delta:
            binstart = binstart + delta
            bin_key = str(binstart)
            res[bin_key] = 0
        res[bin_key] += cur_data

    date_bins, cpu_hrs = list(res.keys()), list(res.values())
    return date_bins, cpu_hrs
def format_date_ticklabel(d):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string into a short label such as ``Mar, '21``."""
    input_format = "%Y-%m-%d %H:%M:%S"
    label_format = "%b, '%y"
    parsed = datetime.strptime(d, input_format)
    return parsed.strftime(label_format)
def plot_cpu_timseries(dates, data, delta=20, fname='cpuhrs_timeseries.png'):
    """Plot ``data`` summed into ``delta``-day bins as a log-scale bar chart.

    Binning is delegated to ``bin_dates_data``; the resulting bin labels are
    used as categorical x positions, with roughly five of them shown as tick
    labels formatted by ``format_date_ticklabel``.

    Args:
        dates: Date of each entry, parallel to ``data``.
        data: Values to accumulate per bin.
        delta: Bin width in days.
        fname: Path the figure is saved to.

    The figure is left open as pyplot's current figure after saving.
    """
    date_bins, cpu_hrs = bin_dates_data(dates, data, delta=delta)
    fig, ax = plt.subplots(1, 1, figsize=(4, 2.5))
    ax.bar(date_bins, cpu_hrs, width=1)
    plt.xticks(rotation=-45)

    num_bins = len(date_bins)
    # Label about five ticks. The step is clamped to 1 because with fewer
    # than five bins ``num_bins // 5`` is 0, which range() rejects.
    tick_step = max(1, num_bins // 5)
    ticks = list(range(0, num_bins, tick_step))
    labels = [format_date_ticklabel(date_bins[i]) for i in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)

    ax.set_yscale('log')
    plt.minorticks_off()
    ax.set_ylim(bottom=0.3)
    ax.set_ylabel("Hrs")
    plt.grid(visible=False)
    plt.savefig(fname)
def load_cit_data(fname):
    """Extract the ``data.addRow([new Date(...), ...])`` rows of an HTML page.

    Only the text before the first ``dashboard1.bind(`` is scanned. Each
    ``addRow`` call is reduced to ``"<date args>; <numbers>"`` and handed to
    ``process_cit_data_chunk``.

    Args:
        fname: Path of the saved HTML page.

    Returns:
        pandas.DataFrame: columns ``date`` and ``cpu_hrs``, holding the two
        values returned by ``process_cit_data_chunk`` for every row.
    """
    with open(fname, "r") as fp:
        html = fp.read()

    flattened = html.replace("\n", "")
    rows_section = flattened.split("dashboard1.bind(")[0]
    # The text before the first addRow call is not a row, hence the [1:].
    raw_chunks = rows_section.split("data.addRow([new Date(")[1:]

    # "),": end of the Date(...) arguments; ",]);": end of the addRow call.
    chunks = [
        chunk.replace("),", "; ").replace(",]);", "")
        for chunk in raw_chunks
    ]

    table = np.array([process_cit_data_chunk(chunk) for chunk in chunks]).T
    return pd.DataFrame({"date": table[0], "cpu_hrs": table[1]})
def process_cit_data_chunk(d):
    """Parse one ``"<year>, <month>, <day>; <n1>,<n2>,..."`` chunk.

    Args:
        d: String with the date arguments and the comma-separated integers
            of one row, separated by ``;``. The month is zero-based (it is
            incremented by one before building the date). Whitespace around
            the individual numbers is ignored.

    Returns:
        tuple[datetime, int]: the row's date and the sum of its integers.

    Raises:
        ValueError: if a field is not an integer or the date is invalid.
        IndexError: if the chunk has no ``;`` or fewer than three date parts.
    """
    fields = d.split(";")
    # int() tolerates surrounding whitespace, so splitting on "," accepts
    # both "2021, 0, 15" and "2021,0,15".
    ymd = [int(part) for part in fields[0].split(",")]
    # Build the date directly; +1 converts the zero-based month.
    date = datetime(ymd[0], ymd[1] + 1, ymd[2])
    num = sum(int(part) for part in fields[1].split(","))
    return (date, num)
def main():
    """Print the total CPU hours and produce all plots.

    Reads the job dump via ``load_ozstar_cpu_data`` and the HTML page named
    by ``CIT_HTML`` via ``load_cit_data``. Writes the histogram from
    ``plot_time`` plus one time-series image per data source.
    """
    data = load_ozstar_cpu_data()
    total_hrs = get_total_cpu_hrs(data)
    print(f"Total CPU hrs: {total_hrs:.2f}")
    plot_time(data)

    plot_cpu_timseries(data["date"], data["CPU Hrs"])
    # Both plot_cpu_timseries calls save to the same default file, so the
    # second would overwrite the first. The figure just drawn is still
    # pyplot's current figure, so save it again under its own name.
    plt.savefig('cpuhrs_timeseries_ozstar.png')

    # CIT
    data = load_cit_data(CIT_HTML)
    plot_cpu_timseries(data.date, data.cpu_hrs)
    plt.savefig('cpuhrs_timeseries_cit.png')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment