Created
September 27, 2021 19:30
-
-
Save lgray/d30826822354a6876260cfc214bcd0d7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def loadData(fileName):
    """Read fat jets from a ROOT file and return a per-jet graph ``DataLoader``.

    The ``Events`` tree is loaded in one go.  Branches whose name contains
    ``pfcand_`` become the per-candidate record, branches whose name contains
    ``fj_`` become the per-jet record, and every jet is turned into one
    ``Data`` object (presumably ``torch_geometric.data.Data`` -- confirm
    against the file's imports).

    Args:
        fileName: Path handed to ``uproot.open``.  It is also concatenated
            into a log message, so it must be a ``str``.

    Returns:
        A ``DataLoader`` over the per-jet ``Data`` objects, batched with
        ``config.batch_size``.  For each ``Data``:

        * ``x`` has one row per candidate and nine columns, in this order:
          ``etarel, phirel, pt_log_nopuppi, e_log_nopuppi,
          ptrel_log_nopuppi, erel_log_nopuppi, pt, e, jet index``.
        * ``edge_index`` is an empty ``(2, 0)`` int64 placeholder.
        * ``y`` has shape ``(1, 2)``: ``[isQCD + isTop, isHiggs]``.
    """
    # Read everything eagerly inside the context manager so the ROOT file
    # handle is released as soon as the arrays are in memory.
    with uproot.open(fileName) as file:
        printer.info("file: " + fileName)
        events = file["Events"].arrays(how="zip")

    # Keep only candidates with pt > 0 and remember how many survive per row.
    mask = events.pfcand_pt > 0
    counts = ak.sum(mask, axis=1)
    pfcands = ak.zip({
        f.split("pfcand_")[1]: events[f][mask]
        for f in ak.fields(events)
        if "pfcand_" in f
    })
    pfcands = ak.with_name(pfcands, "PFCand")
    # NOTE(review): the unflatten implies that the boolean selection above
    # yields a flat candidate list (i.e. the pfcand_* branches are
    # fixed-width per jet) -- confirm against the input ntuple.
    pfcands = ak.unflatten(pfcands, counts)
    ak.behavior[("__typestr__", "PFCand")] = "pfcand"

    # Per-jet record: every fj_* branch plus the nested candidate list.
    fj_properties = {
        f.split("fj_")[1]: events[f]
        for f in ak.fields(events)
        if "fj_" in f
    }
    fj_properties.update({"pfcands": pfcands})
    # depth_limit=1 zips at the jet level only, so "pfcands" stays a nested
    # list inside each jet instead of being broadcast against the jet fields.
    fjs = ak.zip(fj_properties, depth_limit=1)
    fjs = ak.with_name(fjs, "FatJet")
    ak.behavior[("__typestr__", "FatJet")] = "fatjet"

    # To get the data into torch we need rectangular arrays again.
    # y is the truth information; only isQCD / isTop / isHiggs are used
    # below, the remaining labels are converted but currently unused.
    y_np = ak.to_numpy(fjs[[
        'isQCD',
        'isTop',
        'isHiggs',
        'hadronFlavour',
        'partonFlavour',
        'H_bb',
        'H_WW_4q',
        'H_WW_elenuqq',
        'H_WW_munuqq',
        'H_WW_taunuqq',
    ]])

    # Build, for every candidate, the index of the jet it belongs to:
    # a jagged array of ones times the jet index, then flattened.
    jet_number = np.arange(len(fjs))
    printer.info("total jet number: %.f" % len(fjs))
    parents = ak.values_astype(ak.ones_like(fjs.pfcands.pt), np.int64)
    jet_number = ak.to_numpy(ak.flatten(parents * jet_number[:, None]))

    x_np = ak.to_numpy(ak.flatten(fjs.pfcands))
    batch = torch.from_numpy(jet_number)  # just the jet index
    counts_np = ak.to_numpy(counts)

    # Transform to torch tensors; the column order of x is fixed by this
    # tuple, followed by the jet index as the last column.
    feature_names = (
        'etarel', 'phirel',                       # position in d(eta), d(phi)
        'pt_log_nopuppi', 'e_log_nopuppi',        # features
        'ptrel_log_nopuppi', 'erel_log_nopuppi',
        'pt', 'e',                                # same info, probably unused
    )
    columns = [torch.from_numpy(x_np[name]) for name in feature_names]
    columns.append(batch)
    x = torch.stack(columns, 1)

    isQCD = torch.tensor(y_np['isQCD'], dtype=int)
    isTop = torch.tensor(y_np['isTop'], dtype=int)
    isHiggs = torch.tensor(y_np['isHiggs'], dtype=int)
    # Column 0: background (QCD or top), column 1: signal (Higgs).
    y = torch.stack([isQCD + isTop, isHiggs], 1)

    # Loop over the jets to build data_list, slicing the flat candidate
    # table into consecutive per-jet row ranges.
    # https://pytorch-geometric.readthedocs.io/en/latest/notes/create_dataset.html
    data_list = []
    row_i = 0
    for idata, num_pfcands in enumerate(counts_np):
        row_f = row_i + num_pfcands
        edge_index = torch.empty((2, 0), dtype=torch.int64)  # placeholder
        data = Data(
            x=x[row_i:row_f, 0:],
            edge_index=edge_index,
            y=y[idata:idata + 1, 0:],
        )
        data_list.append(data)
        row_i = row_f

    loader = DataLoader(data_list, batch_size=int(config.batch_size))
    return loader
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment