Created
September 25, 2022 10:48
-
-
Save avivajpeyi/d389bbcab05714089ed1dbe78d0b6cdb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Plots total number of CPU hours used | |
To create a "jobstats.txt" run something like the following: | |
> sacct -S 2020-01-01 -E 2021-10-06 -u avajpeyi -X -o "jobname%-40,cputimeraw,start" --parsable2 > jobstats.txt | |
""" | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
from humanize import intword | |
from matplotlib import ticker | |
from datetime import timedelta, datetime, date | |
# Load the shared publication style sheet, then turn the axes grid off.
_STYLE_URL = "https://gist.githubusercontent.com/avivajpeyi/4d9839b1ceb7d3651cbb469bc6b0d69b/raw/4ee4a870126653d542572372ff3eee4e89abcab0/publication.mplstyle"
plt.style.use(_STYLE_URL)
plt.rcParams.update({'axes.grid': False})

# Pipe-separated `sacct` dump (see the module docstring for the command).
FNAME = "jobstats.txt"

# Get HTML page from:
# https://accounting.ligo.org/htmls_by_users/avi.vajpeyi.html --> avi.vajpeyi.html
CIT_HTML = "avi.vajpeyi.html"

# Number of seconds in one hour; used to convert raw CPU seconds to hours.
SEC_IN_HR = 3600.0
def load_ozstar_cpu_data(fname=None):
    """Load the pipe-separated job statistics file into a DataFrame.

    The file is expected to have a header row naming at least the
    ``CPUTimeRAW`` and ``Start`` columns (see the module docstring for the
    ``sacct`` command that produces it).

    Args:
        fname: Path of the file to read. Defaults to the module-level
            ``FNAME`` when omitted.

    Returns:
        A DataFrame with the original columns (``CPUTimeRAW`` converted to
        float64) plus ``CPU Hrs`` (``CPUTimeRAW / SEC_IN_HR``) and ``date``
        (``Start`` parsed as a timestamp), sorted chronologically by
        ``date`` with a fresh 0..n-1 index.
    """
    if fname is None:
        fname = FNAME

    # Read every field as text first, exactly like the previous manual
    # ``split("|")`` parsing did, and convert the columns we need below.
    data = pd.read_csv(fname, sep="|", dtype=str, keep_default_na=False)

    data['CPUTimeRAW'] = data['CPUTimeRAW'].astype('float64')
    data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR

    # Rows whose Start field is not a timestamp become NaT instead of
    # aborting the whole load.
    data['date'] = pd.to_datetime(
        data['Start'], format='%Y-%m-%dT%H:%M:%S', errors='coerce'
    )
    # NOTE(review): rows without a parseable start time are dropped on the
    # assumption that such jobs never ran and so used no CPU time -- confirm.
    data = data.dropna(subset=['date'])

    # The time-series binning walks the rows in order, so hand them over
    # chronologically and with a positional (0-based) index.
    return data.sort_values('date').reset_index(drop=True)
def get_total_cpu_hrs(data):
    """Return the sum of the ``CPUTimeRAW`` column divided by ``SEC_IN_HR``."""
    raw_cpu_time = data['CPUTimeRAW'].values
    return np.sum(raw_cpu_time) / SEC_IN_HR
def plot_time(data):
    """Save a log-log histogram of per-job CPU hours to ``cpuhrs_hist.png``.

    Only jobs that used more than 0.01 CPU hours are histogrammed; the
    title reports the total over *all* jobs in ``data``.

    Args:
        data: DataFrame with a numeric ``CPUTimeRAW`` column. It is not
            modified.

    Raises:
        ValueError: If no job exceeds the 0.01 CPU-hour threshold.
    """
    # Work on a local Series rather than writing a column back into the
    # caller's DataFrame.
    hrs = data['CPUTimeRAW'] / SEC_IN_HR
    hrs = hrs[hrs > 0.01]
    if hrs.empty:
        # Fail before a figure is created, with a message that names the cause.
        raise ValueError("No jobs with more than 0.01 CPU hours to plot")

    min_h, max_h = hrs.min(), hrs.max()

    plt.figure(figsize=(4, 3))
    # Geometrically spaced bins look evenly sized on the log x-axis.
    plt.hist(hrs, density=False, bins=np.geomspace(min_h, max_h, 100))
    plt.xlabel("CPU Hrs")
    plt.xlim(left=min_h)
    plt.yscale('log')
    plt.xscale('log')
    plt.ylabel("Jobs")
    plt.title(f"Total: {intword(get_total_cpu_hrs(data), '%.1f')} Hr")
    plt.tight_layout()
    plt.savefig('cpuhrs_hist.png')
def bin_dates_data(dates, data, delta=5, binstart=None):
    """Sum ``data`` into consecutive, fixed-width date bins.

    Bin ``k`` covers the half-open interval
    ``[binstart + k * delta, binstart + (k + 1) * delta)``. Bins with no
    entries between the first and last occupied bin are kept with a value
    of 0, so the result is a gap-free series.

    Args:
        dates: Iterable of datetime-like values (any order).
        data: Iterable of numbers, paired element-wise with ``dates``.
        delta: Bin width in days.
        binstart: Start of the first bin. Defaults to the earliest date.
            Dates earlier than an explicit ``binstart`` are counted in the
            first bin.

    Returns:
        A ``(date_bins, values)`` tuple of two equally long lists:
        ``str()`` of each bin's start, and the summed data of each bin.
        Both are empty when there is no input and no ``binstart``.
    """
    delta = timedelta(days=delta)

    # Sort by date only (never compare the data values) so unsorted input
    # cannot be added to a bin that has already been passed.
    pairs = sorted(zip(dates, data), key=lambda pair: pair[0])

    if binstart is None:
        if not pairs:
            return [], []
        binstart = pairs[0][0]

    bin_key = str(binstart)
    res = {bin_key: 0}

    for cur_date, cur_data in pairs:
        # Open new (initially empty) bins until the date fits. ``>=`` keeps
        # the bins half-open: a date exactly on a bin edge starts that bin.
        while cur_date >= binstart + delta:
            binstart += delta
            bin_key = str(binstart)
            res[bin_key] = 0
        res[bin_key] += cur_data

    return list(res.keys()), list(res.values())
def format_date_ticklabel(d):
    """Turn a ``"YYYY-MM-DD HH:MM:SS"`` string into a label like ``Mar, '21``."""
    parsed = datetime.strptime(d, "%Y-%m-%d %H:%M:%S")
    # format() on a datetime applies the same directives as strftime().
    return format(parsed, "%b, '%y")
def plot_cpu_timseries(dates, data, delta=20, fname='cpuhrs_timeseries.png'):
    """Save a log-scale bar chart of ``data`` summed into date bins.

    Args:
        dates: Datetime-like values, one per entry of ``data``.
        data: Numbers to accumulate (labelled "Hrs" on the y-axis).
        delta: Bin width in days, forwarded to ``bin_dates_data``.
        fname: Path the figure is written to.
    """
    date_bins, cpu_hrs = bin_dates_data(dates, data, delta=delta)

    fig, ax = plt.subplots(1, 1, figsize=(4, 2.5))
    ax.bar(date_bins, cpu_hrs, width=1)
    plt.xticks(rotation=-45)

    # Label roughly five bins. The step must be at least 1, otherwise
    # range() raises ValueError whenever there are fewer than five bins.
    num_bins = len(date_bins)
    tick_step = max(1, num_bins // 5)
    ticks = list(range(0, num_bins, tick_step))
    labels = [format_date_ticklabel(date_bins[i]) for i in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)

    ax.set_yscale('log')
    plt.minorticks_off()
    ax.set_ylim(bottom=0.3)
    ax.set_ylabel("Hrs")
    plt.grid(visible=False)
    plt.savefig(fname)
def load_cit_data(fname):
    """Extract per-date totals from the ``data.addRow(...)`` calls of an HTML page.

    The page text is expected to contain rows of the form
    ``data.addRow([new Date(Y, M, D), n1,n2,...,]);``; each one is handed
    to ``process_cit_data_chunk`` to obtain a ``(date, total)`` pair.

    Args:
        fname: Path of the saved HTML page.

    Returns:
        A DataFrame with the columns ``date`` and ``cpu_hrs``, one row per
        ``addRow`` call; empty (but with both columns) when none are found.
    """
    with open(fname, "r") as fp:
        html = fp.read()

    html = html.replace("\n", "")
    # Keep only the text before the first `dashboard1.bind(` call.
    html = html.split("dashboard1.bind(")[0]
    # The text before the first addRow call is not a row, hence [1:].
    chunks = html.split("data.addRow([new Date(")[1:]
    # Rewrite "Y, M, D), n1,n2,...,]);" as "Y, M, D;  n1,n2,...".
    chunks = [c.replace("),", "; ").replace(",]);", "") for c in chunks]

    rows = [process_cit_data_chunk(c) for c in chunks]
    # Building the frame straight from the (date, total) pairs keeps proper
    # datetime/integer dtypes and also works when there are no rows.
    return pd.DataFrame(rows, columns=["date", "cpu_hrs"])
def process_cit_data_chunk(d):
    """Parse one ``"Y, M, D; n1,n2,..."`` chunk into ``(date, total)``.

    The month in the chunk is shifted up by one before the date is built;
    the numbers after the ``;`` are summed.
    """
    parts = d.split(";")
    date_fields = [int(field) for field in parts[0].split(", ")]
    when = datetime(date_fields[0], date_fields[1] + 1, date_fields[2])
    total = sum(int(value) for value in parts[1].split(","))
    return (when, total)
def main():
    """Print the total OzSTAR CPU hours and write the OzSTAR and CIT plots."""
    import os

    data = load_ozstar_cpu_data()
    total_hrs = get_total_cpu_hrs(data)
    print(f"Total CPU hrs: {total_hrs:.2f}")
    plot_time(data)
    plot_cpu_timseries(data["date"], data["CPU Hrs"])
    # Both time-series plots are written to 'cpuhrs_timeseries.png'. Move the
    # OzSTAR one aside so the CIT plot below does not overwrite it.
    os.replace('cpuhrs_timeseries.png', 'cpuhrs_timeseries_ozstar.png')

    # CIT
    data = load_cit_data(CIT_HTML)
    plot_cpu_timseries(data.date, data.cpu_hrs)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment