avivajpeyi · September 25, 2022 10:48
diff --git a/cit_ozstar_cpu_hr_plotter.py b/cit_ozstar_cpu_hr_plotter.py
 """ Plots total number of CPU hours used	
 To create a "jobstats.txt" run something like the following:
 > sacct -S 2020-01-01 -E 2021-10-06 -u avajpeyi -X -o "jobname%-40,cputimeraw,start" --parsable2 > jobstats.txt	
 """
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from humanize import intword
 from matplotlib import ticker
 from datetime import timedelta, datetime, date




 # Apply a matplotlib style sheet fetched from a gist URL pinned to one revision.
 plt.style.use(
    "https://gist.githubusercontent.com/avivajpeyi/4d9839b1ceb7d3651cbb469bc6b0d69b/raw/4ee4a870126653d542572372ff3eee4e89abcab0/publication.mplstyle")
 # Axis grids are switched off after the style sheet has been applied.
 plt.rcParams['axes.grid']= False

 # File read by load_ozstar_cpu_data (the sacct dump described in the module docstring).
 FNAME = "jobstats.txt"
 # Get HTML page from:
 # https://accounting.ligo.org/htmls_by_users/avi.vajpeyi.html --> avi.vajpeyi.html
 CIT_HTML = "avi.vajpeyi.html"




 # Divisor used to turn the CPUTimeRAW values into hours (60 * 60 = 3600).
 SEC_IN_HR = 60.0 * 60.0


 def load_ozstar_cpu_data():
    """Load the ``|``-separated job table stored in ``FNAME``.

    The first line of the file supplies the column names; every later line
    longer than one character is one record. Parsed columns are kept as
    strings, then ``CPUTimeRAW`` is cast to float64 and two columns are
    added: ``CPU Hrs`` (``CPUTimeRAW`` divided by ``SEC_IN_HR``) and
    ``date`` (``Start`` parsed with ``%Y-%m-%dT%H:%M:%S``).

    Returns:
        pandas.DataFrame with one row per record.
    """
    with open(FNAME, 'r') as handle:
        lines = handle.read().split("\n")
        column_names = lines[0].split("|")
        # Blank lines (e.g. the one after the final newline) are skipped.
        records = [np.array(line.split("|")) for line in lines[1:] if len(line) > 1]
        # Transpose so that each entry holds one column of the table.
        columns = np.array(records).T
        frame = pd.DataFrame(
            {name: columns[idx] for idx, name in enumerate(column_names)}
        )
        cpu_seconds = frame['CPUTimeRAW'].astype('float64')
        frame['CPUTimeRAW'] = cpu_seconds
        frame['CPU Hrs'] = cpu_seconds / SEC_IN_HR
        frame['date'] = pd.to_datetime(frame['Start'], format='%Y-%m-%dT%H:%M:%S')
        return frame
    

 def get_total_cpu_hrs(data):
    """Return the sum of ``data['CPUTimeRAW']`` divided by ``SEC_IN_HR``."""
    raw_values = data['CPUTimeRAW'].values
    total_raw = np.sum(raw_values)
    return total_raw / SEC_IN_HR


 def plot_time(data):
    """Save a log-log histogram of per-job CPU hours to ``cpuhrs_hist.png``.

    Side effect: the ``CPU Hrs`` column of ``data`` is (re)written from
    ``CPUTimeRAW``. Only jobs above 0.01 CPU hours are histogrammed, using
    100 geometrically spaced bin edges between the smallest and largest
    remaining value. The title shows the total over all jobs in ``data``.

    Args:
        data: DataFrame with a numeric ``CPUTimeRAW`` column.
    """
    plt.figure(figsize=(4, 3))
    data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR
    all_hrs = data['CPU Hrs']
    shown = all_hrs[all_hrs > 0.01]
    lowest, highest = min(shown), max(shown)
    edges = np.geomspace(lowest, highest, 100)
    plt.hist(shown, density=False, bins=edges)
    plt.xlabel("CPU Hrs")
    plt.ylabel("Jobs")
    plt.xlim(left=lowest)
    plt.yscale('log')
    plt.xscale('log')
    plt.title(f"Total: {intword(get_total_cpu_hrs(data), '%.1f')} Hr")
    plt.tight_layout()
    plt.savefig('cpuhrs_hist.png')


 def bin_dates_data(dates, data, delta=5, binstart=None):
    """Sum ``data`` into consecutive fixed-width date bins.

    Bin ``k`` covers the half-open interval
    ``[binstart + k*delta, binstart + (k+1)*delta)``. Bins lying between
    two populated bins are emitted with a total of 0.

    Args:
        dates: Date-like values that can be compared with, and have a
            ``timedelta`` added to, ``binstart``. They are expected in
            ascending order: a value earlier than the current bin is
            added to the current bin.
        data: Values to add up, paired with ``dates`` by position.
        delta: Bin width in days.
        binstart: Start of the first bin. Defaults to the first entry of
            ``dates``.

    Returns:
        ``(date_bins, cpu_hrs)``: the ``str()`` of every bin start and the
        matching totals. Both lists are empty when ``dates`` is empty and
        no ``binstart`` is given.
    """
    width = timedelta(days=delta)

    if binstart is None:
        binstart = next(iter(dates), None)
        if binstart is None:
            # Nothing to bin and no anchor for a first bin.
            return [], []

    totals = {str(binstart): 0}
    for cur_date, cur_data in zip(dates, data):
        # Advance bin by bin (creating empty bins on the way) until the
        # item fits. ">=" keeps the bins half-open, so a date exactly on
        # a bin edge opens the next bin instead of joining the previous.
        while cur_date >= binstart + width:
            binstart = binstart + width
            totals[str(binstart)] = 0
        totals[str(binstart)] += cur_data

    return list(totals.keys()), list(totals.values())

    
 def format_date_ticklabel(d):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string into a label like ``Oct, '21``."""
    return datetime.strptime(d, "%Y-%m-%d %H:%M:%S").strftime("%b, '%y")


 def plot_cpu_timseries(dates, data, delta=20, fname='cpuhrs_timeseries.png'):
    """Save a bar chart of ``data`` summed into ``delta``-day bins.

    The values are binned with ``bin_dates_data`` and drawn as bars on a
    log-scaled y axis whose lower limit is fixed at 0.3. About five
    evenly spaced bins get a date tick label.

    Args:
        dates: Date of each value, passed to ``bin_dates_data``.
        data: Value to accumulate for each date.
        delta: Bin width in days.
        fname: Path of the image to write. The default keeps the
            original file name, so callers that plot several series
            should pass distinct names to avoid overwriting.
    """
    date_bins, cpu_hrs = bin_dates_data(dates, data, delta=delta)
    _, ax = plt.subplots(1, 1, figsize=(4, 2.5))
    ax.bar(date_bins, cpu_hrs, width=1)
    plt.xticks(rotation=-45)
    num_bins = len(date_bins)
    # Clamp the stride to 1: with fewer than five bins the integer
    # division yields 0 and range() rejects a zero step.
    tick_step = max(1, num_bins // 5)
    ticks = list(range(0, num_bins, tick_step))
    labels = [format_date_ticklabel(date_bins[i]) for i in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yscale('log')
    plt.minorticks_off()
    ax.set_ylim(bottom=0.3)
    ax.set_ylabel("Hrs")
    plt.grid(visible=False)
    plt.savefig(fname)




 def load_cit_data(fname):
    """Extract the dated rows embedded in a saved CIT accounting HTML page.

    Everything before the first ``dashboard1.bind(`` is split on
    ``data.addRow([new Date(``. Each resulting chunk is normalised and
    handed to ``process_cit_data_chunk``, which returns a
    ``(date, total)`` pair.

    Args:
        fname: Path of the saved HTML file.

    Returns:
        pandas.DataFrame with columns ``date`` and ``cpu_hrs``.
    """
    with open(fname, "r") as fp:
        page = fp.read().replace("\n", "")
        # Only the text ahead of the first "dashboard1.bind(" is scanned.
        table_js = page.split("dashboard1.bind(")[0]
        # Piece 0 is whatever precedes the first row, so it is dropped.
        chunks = table_js.split("data.addRow([new Date(")[1:]
        rows = []
        for chunk in chunks:
            # Close of Date(...) becomes the ';' separator the chunk parser expects.
            chunk = chunk.replace("),", "; ")
            chunk = chunk.replace(",]);", "")
            rows.append(process_cit_data_chunk(chunk))
        columns = np.array(rows).T
        df = pd.DataFrame(dict(date=columns[0], cpu_hrs=columns[1]))
        return df
    
 def process_cit_data_chunk(d):
    """Parse one ``"Y, M, D; n1,n2,..."`` chunk into ``(date, total)``.

    Args:
        d: Text whose part before the first ``;`` holds ``", "``-separated
            year, month and day, and whose second part holds
            ``,``-separated integers.

    Returns:
        Tuple of a ``datetime`` and the sum of the integers.
    """
    pieces = d.split(";")
    ymd = list(map(int, pieces[0].split(", ")))
    # The month is shifted up by one before parsing (presumably because the
    # page uses zero-based JavaScript Date months - confirm against the page).
    stamp = datetime.strptime(f"{ymd[0]}, {ymd[1]+1}, {ymd[2]}", "%Y, %m, %d")
    total = sum(int(token) for token in pieces[1].split(","))
    return (stamp, total)

       

    

 def main():
    """Report and plot the OzStar usage, then plot the CIT usage."""
    ozstar_jobs = load_ozstar_cpu_data()
    total_hrs = get_total_cpu_hrs(ozstar_jobs)
    print(f"Total CPU hrs: {total_hrs:.2f}")
    plot_time(ozstar_jobs)
    plot_cpu_timseries(ozstar_jobs["date"], ozstar_jobs["CPU Hrs"])
    # CIT
    # NOTE: both plot_cpu_timseries calls save to 'cpuhrs_timeseries.png',
    # so this second plot replaces the OzStar one on disk.
    cit_usage = load_cit_data(CIT_HTML)
    plot_cpu_timseries(cit_usage.date, cit_usage.cpu_hrs)


 if __name__ == '__main__':
    main()
	""" Plots total number of CPU hours used
	To create a "jobstats.txt" run something like the following:
	> sacct -S 2020-01-01 -E 2021-10-06 -u avajpeyi -X -o "jobname%-40,cputimeraw,start" --parsable2 > jobstats.txt
	"""
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	from humanize import intword
	from matplotlib import ticker
	from datetime import timedelta, datetime, date




	# Apply a matplotlib style sheet fetched from a gist URL pinned to one revision.
	plt.style.use(
	"https://gist.githubusercontent.com/avivajpeyi/4d9839b1ceb7d3651cbb469bc6b0d69b/raw/4ee4a870126653d542572372ff3eee4e89abcab0/publication.mplstyle")
	# Axis grids are switched off after the style sheet has been applied.
	plt.rcParams['axes.grid']= False

	# File read by load_ozstar_cpu_data (the sacct dump described in the module docstring).
	FNAME = "jobstats.txt"
	# Get HTML page from:
	# https://accounting.ligo.org/htmls_by_users/avi.vajpeyi.html --> avi.vajpeyi.html
	CIT_HTML = "avi.vajpeyi.html"




	# Divisor used to turn the CPUTimeRAW values into hours (60 * 60 = 3600).
	SEC_IN_HR = 60.0 * 60.0


	def load_ozstar_cpu_data():
		"""Load the ``|``-separated job table stored in ``FNAME``.

		The first line of the file supplies the column names; every later
		line longer than one character is one record. Parsed columns are
		kept as strings, then ``CPUTimeRAW`` is cast to float64 and two
		columns are added: ``CPU Hrs`` (``CPUTimeRAW`` divided by
		``SEC_IN_HR``) and ``date`` (``Start`` parsed with
		``%Y-%m-%dT%H:%M:%S``).

		Returns:
			pandas.DataFrame with one row per record.
		"""
		with open(FNAME, 'r') as f:
			filecontents = f.read().split("\n")
			# Split on a plain "|": the escaped "\|" form is a two-character
			# backslash-pipe string that never occurs in the file.
			header = filecontents[0].split("|")
			data = filecontents[1:]
			# Blank lines (e.g. the one after the final newline) are skipped.
			data = [d for d in data if len(d) > 1]
			data = np.array([np.array(row.split("|")) for row in data])
			# Transpose so that each entry holds one column of the table.
			data = data.T
			data_dict = {header[i]: data[i] for i in range(len(header))}
			data = pd.DataFrame(data_dict)
			data['CPUTimeRAW'] = data['CPUTimeRAW'].astype('float64')
			data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR
			data['date'] = pd.to_datetime(data['Start'], format='%Y-%m-%dT%H:%M:%S')
			return data


	def get_total_cpu_hrs(data):
		"""Return the sum of ``data['CPUTimeRAW']`` divided by ``SEC_IN_HR``."""
		return np.sum(data['CPUTimeRAW'].values) / SEC_IN_HR


	def plot_time(data):
		"""Save a log-log histogram of per-job CPU hours to ``cpuhrs_hist.png``.

		Side effect: the ``CPU Hrs`` column of ``data`` is (re)written from
		``CPUTimeRAW``. Only jobs above 0.01 CPU hours are histogrammed,
		using 100 geometrically spaced bin edges between the smallest and
		largest remaining value. The title shows the total over all jobs.

		Args:
			data: DataFrame with a numeric ``CPUTimeRAW`` column.
		"""
		plt.figure(figsize=(4, 3))
		data['CPU Hrs'] = data['CPUTimeRAW'] / SEC_IN_HR
		hrs = data['CPU Hrs']
		hrs = hrs[hrs > 0.01]
		min_h, max_h = min(hrs), max(hrs)
		plt.hist(hrs, density=False, bins=np.geomspace(min_h, max_h, 100))
		plt.xlabel("CPU Hrs")
		plt.xlim(left=min_h)
		plt.yscale('log')
		plt.xscale('log')
		plt.ylabel("Jobs")
		plt.title(f"Total: {intword(get_total_cpu_hrs(data), '%.1f')} Hr")
		plt.tight_layout()
		plt.savefig('cpuhrs_hist.png')


	def bin_dates_data(dates, data, delta=5, binstart=None):
		"""Sum ``data`` into consecutive fixed-width date bins.

		Bin ``k`` covers the half-open interval
		``[binstart + k*delta, binstart + (k+1)*delta)``. Bins lying between
		two populated bins are emitted with a total of 0.

		Args:
			dates: Date-like values that can be compared with, and have a
				``timedelta`` added to, ``binstart``. They are expected in
				ascending order: a value earlier than the current bin is
				added to the current bin.
			data: Values to add up, paired with ``dates`` by position.
			delta: Bin width in days.
			binstart: Start of the first bin. Defaults to the first entry
				of ``dates``.

		Returns:
			``(date_bins, cpu_hrs)``: the ``str()`` of every bin start and
			the matching totals. Both lists are empty when ``dates`` is
			empty and no ``binstart`` is given.
		"""
		width = timedelta(days=delta)

		if binstart is None:
			binstart = next(iter(dates), None)
			if binstart is None:
				# Nothing to bin and no anchor for a first bin.
				return [], []

		totals = {str(binstart): 0}
		for cur_date, cur_data in zip(dates, data):
			# Advance bin by bin (creating empty bins on the way) until the
			# item fits. ">=" keeps the bins half-open, so a date exactly on
			# a bin edge opens the next bin instead of joining the previous.
			while cur_date >= binstart + width:
				binstart = binstart + width
				totals[str(binstart)] = 0
			totals[str(binstart)] += cur_data

		return list(totals.keys()), list(totals.values())


	def format_date_ticklabel(d):
		"""Convert a ``YYYY-MM-DD HH:MM:SS`` string into a label like ``Oct, '21``."""
		d = datetime.strptime(d, "%Y-%m-%d %H:%M:%S")
		return d.strftime("%b, '%y")


	def plot_cpu_timseries(dates, data, delta=20, fname='cpuhrs_timeseries.png'):
		"""Save a bar chart of ``data`` summed into ``delta``-day bins.

		The values are binned with ``bin_dates_data`` and drawn as bars on a
		log-scaled y axis whose lower limit is fixed at 0.3. About five
		evenly spaced bins get a date tick label.

		Args:
			dates: Date of each value, passed to ``bin_dates_data``.
			data: Value to accumulate for each date.
			delta: Bin width in days.
			fname: Path of the image to write. The default keeps the
				original file name, so callers that plot several series
				should pass distinct names to avoid overwriting.
		"""
		date_bins, cpu_hrs = bin_dates_data(dates, data, delta=delta)
		_, ax = plt.subplots(1, 1, figsize=(4, 2.5))
		ax.bar(date_bins, cpu_hrs, width=1)
		plt.xticks(rotation=-45)
		num_bins = len(date_bins)
		# Clamp the stride to 1: with fewer than five bins the integer
		# division yields 0 and range() rejects a zero step.
		tick_step = max(1, num_bins // 5)
		ticks = list(range(0, num_bins, tick_step))
		labels = [format_date_ticklabel(date_bins[i]) for i in ticks]
		ax.set_xticks(ticks)
		ax.set_xticklabels(labels)
		ax.set_yscale('log')
		plt.minorticks_off()
		ax.set_ylim(bottom=0.3)
		ax.set_ylabel("Hrs")
		plt.grid(visible=False)
		plt.savefig(fname)




	def load_cit_data(fname):
		"""Extract the dated rows embedded in a saved CIT accounting HTML page.

		Everything before the first ``dashboard1.bind(`` is split on
		``data.addRow([new Date(``. Each resulting chunk is normalised and
		handed to ``process_cit_data_chunk``, which returns a
		``(date, total)`` pair.

		Args:
			fname: Path of the saved HTML file.

		Returns:
			pandas.DataFrame with columns ``date`` and ``cpu_hrs``.
		"""
		with open(fname, "r") as fp:
			data = fp.read()
			data = data.replace("\n", "")
			# Only the text ahead of the first "dashboard1.bind(" is scanned.
			data = data.split("dashboard1.bind(")[0]
			# Piece 0 is whatever precedes the first row, so it is dropped.
			data = data.split("data.addRow([new Date(")[1:]
			# Close of Date(...) becomes the ';' separator the chunk parser expects.
			data = [d.replace("),", "; ") for d in data]
			data = [d.replace(",]);", "") for d in data]
			data = np.array([process_cit_data_chunk(d) for d in data]).T
			df = pd.DataFrame(dict(
				date=data[0],
				cpu_hrs=data[1]
			))
			return df

	def process_cit_data_chunk(d):
		"""Parse one ``"Y, M, D; n1,n2,..."`` chunk into ``(date, total)``.

		Args:
			d: Text whose part before the first ``;`` holds ``", "``-separated
				year, month and day, and whose second part holds
				``,``-separated integers.

		Returns:
			Tuple of a ``datetime`` and the sum of the integers.
		"""
		d = d.split(";")
		ymd = [int(i) for i in d[0].split(", ")]
		# The month is shifted up by one before parsing (presumably because the
		# page uses zero-based JavaScript Date months - confirm against the page).
		date = datetime.strptime(f"{ymd[0]}, {ymd[1]+1}, {ymd[2]}", "%Y, %m, %d")
		nums = [int(i) for i in d[1].split(",")]
		num = sum(nums)
		return (date, num)





	def main():
		"""Report and plot the OzStar usage, then plot the CIT usage."""
		data = load_ozstar_cpu_data()
		total_hrs = get_total_cpu_hrs(data)
		print(f"Total CPU hrs: {total_hrs:.2f}")
		plot_time(data)
		plot_cpu_timseries(data["date"], data["CPU Hrs"])
		# CIT
		# NOTE: both plot_cpu_timseries calls save to 'cpuhrs_timeseries.png',
		# so this second plot replaces the OzStar one on disk.
		data = load_cit_data(CIT_HTML)
		plot_cpu_timseries(data.date, data.cpu_hrs)


	if __name__ == '__main__':
		main()