Skip to content

Instantly share code, notes, and snippets.

@ChristinaLK
Created November 18, 2024 17:53
Show Gist options
  • Save ChristinaLK/92de808485db8d4c01bd9205229dfe02 to your computer and use it in GitHub Desktop.
Save ChristinaLK/92de808485db8d4c01bd9205229dfe02 to your computer and use it in GitHub Desktop.
import csv

import pandas as pd
## data was downloaded from XDMod, the Jobs by User dashboard
datafile = "Jobs__by_User_2024-10-01_to_2024-10-31_aggregate.csv"
outfile = "Jobs__by_User_2024-10-01_to_2024-10-31_subset.csv"

# Marker line the parser keys on: one occurrence announces that the next
# line is a section's column header, the following occurrence closes that
# section's data rows.
SECTION_DELIMITER = "---------"

## subset with the pieces we want
keys_we_want = ["Wait Hours: Per Job",
                "CPU Hours: Total",
                "Number of Jobs Running",
                "Job Size: Per Job (Core Count)"]


def parse_sections(lines):
    """Parse a multi-section aggregate export into a dict of column dicts.

    The layout this parser expects is a sequence of sections, each shaped
    like::

        ---------
        User,"CPU Hours: Total"
        someuser,12.5
        ---------

    Everything outside a delimiter pair (titles, parameters, date ranges)
    is ignored.

    Args:
        lines: Iterable of text lines (an open file works), with or without
            trailing newlines.

    Returns:
        ``{metric: {key_column: [...], metric: [...]}}`` where ``metric`` is
        the second header field of a section and ``key_column`` the first.
        All values are kept as strings exactly as they appear in the file
        (minus CSV quoting). A metric that appears in several sections keeps
        only the last section's rows.

    Raises:
        ValueError: If a header or data row has fewer than two fields.
    """
    sections = {}
    expecting_header = False
    in_data = False
    key_col = ""
    metric = ""

    for lineno, raw in enumerate(lines, start=1):
        line = raw.strip()
        if not line:
            # Blank lines carry no data; indexing their fields would fail.
            continue

        if line == SECTION_DELIMITER:
            if in_data:
                # Closing delimiter: end of this section's rows.
                in_data = False
            else:
                # Opening delimiter: the next non-blank line is the header.
                expecting_header = True
            continue

        if not (in_data or expecting_header):
            # Preamble between sections (title, parameters, ...).
            continue

        # Use the csv module so quoted fields containing commas
        # (e.g. "Last, First") stay in one piece and quotes are removed.
        row = next(csv.reader([line]))
        if len(row) < 2:
            raise ValueError(
                f"line {lineno}: expected at least 2 comma-separated "
                f"fields, got {line!r}"
            )

        if in_data:
            sections[metric][key_col].append(row[0])
            sections[metric][metric].append(row[1])
        else:
            # Header row: start a fresh section named after the metric.
            key_col, metric = row[0], row[1]
            sections[metric] = {key_col: [], metric: []}
            expecting_header = False
            in_data = True

    return sections


def build_subset(sections, keys=None, on="User"):
    """Outer-join the selected metrics into one DataFrame.

    Args:
        sections: Mapping as returned by :func:`parse_sections`.
        keys: Metric names to keep, in output column order. Defaults to the
            module-level ``keys_we_want``.
        on: Name of the column shared by every section and used as the
            join key.

    Returns:
        A DataFrame with the ``on`` column followed by one column per
        requested metric. Rows missing from a given metric hold NaN there.

    Raises:
        ValueError: If ``keys`` is empty.
        KeyError: If a requested metric is not present in ``sections``.
    """
    if keys is None:
        keys = keys_we_want
    if not keys:
        raise ValueError("no metrics requested")

    frames = []
    for key in keys:
        if key not in sections:
            raise KeyError(
                f"metric {key!r} not found; available: {sorted(sections)}"
            )
        frames.append(pd.DataFrame(sections[key]))

    ## create a dataframe
    data = frames[0]
    for frame in frames[1:]:
        data = data.join(frame.set_index(on), on=on, how="outer")
    return data


def main():
    """Read ``datafile``, keep the metrics in ``keys_we_want``, write ``outfile``."""
    with open(datafile, encoding="utf-8") as f:
        d = parse_sections(f)
    data = build_subset(d, keys_we_want)
    # The index column is written as well, matching the original output layout.
    data.to_csv(outfile)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment