Skip to content

Instantly share code, notes, and snippets.

@lgray
Created September 27, 2021 19:30
Show Gist options
  • Select an option

  • Save lgray/d30826822354a6876260cfc214bcd0d7 to your computer and use it in GitHub Desktop.

Select an option

Save lgray/d30826822354a6876260cfc214bcd0d7 to your computer and use it in GitHub Desktop.
def loadData(fileName):
    """Load fat jets from a ROOT file and wrap them in a graph ``DataLoader``.

    Reads the ``Events`` tree of *fileName*, keeps only the particle-flow
    candidates whose ``pfcand_pt`` is positive, and builds one ``Data`` object
    per fat jet:

    * ``x``: one row per kept candidate with the columns, in order,
      ``etarel, phirel, pt_log_nopuppi, e_log_nopuppi, ptrel_log_nopuppi,
      erel_log_nopuppi, pt, e, jet index``.
    * ``y``: a single row ``[isQCD + isTop, isHiggs]`` for the jet.
    * ``edge_index``: an empty ``(2, 0)`` int64 placeholder.

    Side effects: registers ``__typestr__`` entries for ``"PFCand"`` and
    ``"FatJet"`` in ``ak.behavior`` and logs through ``printer.info``.

    Args:
        fileName: Location of the ROOT file, handed to ``uproot.open``.

    Returns:
        A ``DataLoader`` over the per-jet ``Data`` list, batched with
        ``int(config.batch_size)``.
    """
    # 1. Load Data #
    # Read the tree eagerly inside the context manager so the file handle is
    # released as soon as the arrays are in memory.
    with uproot.open(fileName) as file:
        printer.info(f"file: {fileName}")
        events = file["Events"].arrays(how="zip")

    # Keep only candidates with positive pt and remember how many survive in
    # each jet.
    mask = events.pfcand_pt > 0
    counts = ak.sum(mask, axis=1)
    pfcands = ak.zip(
        {
            f.split("pfcand_")[1]: events[f][mask]
            for f in ak.fields(events)
            if "pfcand_" in f
        }
    )
    # with_name only tags the records with a name; the data is unchanged.
    pfcands = ak.with_name(pfcands, "PFCand")
    # NOTE(review): presumably the masking above yields a flat list of
    # candidates (padded rectangular input), which is why it is regrouped per
    # jet here -- confirm against the input layout.
    pfcands = ak.unflatten(pfcands, counts)
    ak.behavior[("__typestr__", "PFCand")] = "pfcand"

    fj_properties = {
        f.split("fj_")[1]: events[f] for f in ak.fields(events) if "fj_" in f
    }
    fj_properties.update({"pfcands": pfcands})
    # depth_limit=1 zips at jet level only, so the nested candidate list stays
    # a single field of each jet record.
    fjs = ak.zip(fj_properties, depth_limit=1)
    fjs = ak.with_name(fjs, "FatJet")
    ak.behavior[("__typestr__", "FatJet")] = "fatjet"

    # To get it into torch or tensorflow we need to flatten into rectangular
    # arrays again. y is typically the truth.
    y_np = ak.to_numpy(
        fjs[
            [
                "isQCD",
                "isTop",
                "isHiggs",
                "hadronFlavour",
                "partonFlavour",
                "H_bb",
                "H_WW_4q",
                "H_WW_elenuqq",
                "H_WW_munuqq",
                "H_WW_taunuqq",
            ]
        ]
    )

    jet_number = np.arange(len(fjs))
    printer.info("total jet number: %.f" % len(fjs))
    # Broadcast each jet's index onto its candidates, then flatten so that
    # every candidate row carries the index of the jet it belongs to.
    parents = ak.values_astype(ak.ones_like(fjs.pfcands.pt), np.int64)
    jet_number = ak.to_numpy(ak.flatten(parents * jet_number[:, None]))
    x_np = ak.to_numpy(ak.flatten(fjs.pfcands))
    batch = torch.from_numpy(jet_number)  # just jet index
    counts_np = ak.to_numpy(counts)

    # Transform to torch tensors
    pt_log_nopuppi = torch.from_numpy(x_np["pt_log_nopuppi"])
    e_log_nopuppi = torch.from_numpy(x_np["e_log_nopuppi"])
    ptrel_log_nopuppi = torch.from_numpy(x_np["ptrel_log_nopuppi"])
    erel_log_nopuppi = torch.from_numpy(x_np["erel_log_nopuppi"])
    etarel = torch.from_numpy(x_np["etarel"])
    phirel = torch.from_numpy(x_np["phirel"])
    pt = torch.from_numpy(x_np["pt"])
    e = torch.from_numpy(x_np["e"])
    x = torch.stack(
        [
            etarel, phirel,  # position in d(eta), d(phi) coordinate
            pt_log_nopuppi, e_log_nopuppi,  # features
            ptrel_log_nopuppi, erel_log_nopuppi,
            pt, e,  # same info.. probably won't be used...
            batch,  # jet index
        ],
        1,
    )

    isQCD = torch.tensor(y_np["isQCD"], dtype=int)
    isTop = torch.tensor(y_np["isTop"], dtype=int)
    isHiggs = torch.tensor(y_np["isHiggs"], dtype=int)
    # Column 0: background (QCD or top), column 1: signal (Higgs). The other
    # truth fields read into y_np above are currently not used as labels.
    y = torch.stack([isQCD + isTop, isHiggs], 1)

    # Loop over each jet to build data_list:
    # https://pytorch-geometric.readthedocs.io/en/latest/notes/create_dataset.html
    data_list = []
    row_i = 0
    for idata, num_pfcands in enumerate(counts_np):
        # Rows [row_i, row_f) of x belong to jet number idata.
        row_f = row_i + num_pfcands
        # Placeholder: no edges are defined at load time. A fresh tensor is
        # created per jet so the Data objects do not share storage.
        edge_index = torch.empty((2, 0), dtype=torch.int64)
        data_list.append(
            Data(x=x[row_i:row_f], edge_index=edge_index, y=y[idata:idata + 1])
        )
        row_i = row_f

    loader = DataLoader(data_list, batch_size=int(config.batch_size))
    return loader
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment