Created
April 1, 2019 14:26
-
-
Save mattleblanc/fb5453de9f3f5d086e057a9c1c4f72f4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import uproot | |
| import h5py | |
| import numpy as np | |
| from tempfile import mkdtemp | |
| import os | |
| import argparse | |
# Single extension point for help formatting: any further custom formatter
# should be added by inheriting here.
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that shows each argument's default in its help text."""
def _make_formatter(prog):
    """Return the help formatter for `prog` with max_help_position=30."""
    return CustomFormatter(prog, max_help_position=30)


# Command-line interface of the conversion script.
parser = argparse.ArgumentParser(
    formatter_class=_make_formatter,
    description='Convert SM4t offline NTuples to HDF5 files',
)

# Required positional argument.
parser.add_argument('fname', help='File to process', type=str)
#parser.add_argument('--num-jets', type=int, help='Number of jets per event', default=2)
#parser.add_argument(
#    '--num-emissions', type=int, help='Number of emissions per jet', default=20
#)

# Optional arguments; their defaults are shown in --help by the formatter.
parser.add_argument(
    '--num-entries',
    default=-1,
    help='Number of entries to process in file. -1 to process all.',
    type=int,
)
parser.add_argument(
    '--treename',
    default='lundjets_InDetTrackParticles',
    help='Name of tree to use in file',
    type=str,
)

# Parsed once at module level; the rest of the script reads `args`.
args = parser.parse_args()
# Names of the tree branches to read. Each one is accumulated in
# `output_dict` and written out as its own HDF5 dataset.
branches = [
    'lpt', 'jpt', 'mtw', 'met_met', 'mJSum',
    'nMuons', 'nElectrons', 'nJets',
    'nBTags_DL1_60', 'nBTags_DL1_85',
]
| # open file and get the tree | |
| #f = uproot.open(args.fname) | |
| #tree = f[args.treename] | |
| # make memory-mapped file to hold large arrays | |
| #num_entries = args.num_entries if args.num_entries > 0 else tree.numentries | |
| #maxshape = (num_entries, len(branches)) | |
| #fname_mmap = os.path.join(mkdtemp(), 'mmap.dat') | |
| #fp_mmap = np.memmap(fname_mmap, dtype=float, mode='w+', shape=maxshape) | |
| #fp_mmap[:] = 1.0 | |
# Read every chunk of every input file and build `output_dict`, which maps
# each branch name (plus the label column 'dsid') to one flat array.
#
# Chunks are collected in lists and joined once after the loop: calling
# np.append per chunk would re-copy all previously accumulated data each time.
branch_chunks = {branch: [] for branch in branches}
label_chunks = []

# With reportpath=True and reportentries=True each yielded item is
# (path, entry start, entry stop, {branch name: array}).
chunk_iter = uproot.iterate(
    "in/*.root*",
    args.treename,
    branches,
    reportpath=True,
    reportentries=True,
)
for idx, item in enumerate(chunk_iter):
    print(item)
    path, start, stop, chunk = item
    num_chunk_entries = stop - start

    # The signal/background label comes from the file path. Exactly one label
    # array is appended per chunk so 'dsid' stays aligned with the branches.
    if '4tops' in path:
        print('signal\tidx= {}'.format(idx))
        labels = np.ones(num_chunk_entries, dtype=int)
    elif 'ttbar' in path:
        print('background,idx= {}'.format(idx))
        labels = np.zeros(num_chunk_entries, dtype=int)
    else:
        # No label can be assigned; appending the branch data anyway would
        # leave 'dsid' shorter than the other arrays.
        print('skipping {}: neither 4tops nor ttbar in path'.format(path))
        continue
    label_chunks.append(labels)

    for name, values in chunk.items():
        # Branch names may arrive as bytes; normalise so they match the
        # str keys built from `branches`.
        if isinstance(name, bytes):
            name = name.decode('utf-8')
        #print name, values
        branch_chunks[name].append(np.ravel(values))

# The leading empty array fixes the minimum result dtype (float for branches,
# int for labels) and keeps the result well-defined when no chunk was read.
output_dict = {
    name: np.concatenate([np.empty(0, dtype=float)] + chunks)
    for name, chunks in branch_chunks.items()
}
output_dict['dsid'] = np.concatenate([np.empty(0, dtype=int)] + label_chunks)

print(output_dict)
# Note: the first count includes the 'dsid' label column.
print(
    'There are  {}  branches and  {}  entries!\t{}'.format(
        len(output_dict),
        len(output_dict[branches[0]]),
        len(output_dict['dsid']),
    )
)
| #itotal = 0 | |
| #for start, stop, events in tree.iterate( | |
| # branches, entrystop=num_entries, reportentries=True | |
| #): | |
| ''' | |
| for branch in branches: | |
| for ievent, event in enumerate(events[branch]): | |
| for ijet, jet_emissions in enumerate(event): | |
| try: | |
| num_emissions = min(len(jet_emissions), args.num_emissions) | |
| fp_mmap[itotal + ievent][ijet][ | |
| :num_emissions, branches.index(branch) | |
| ] = jet_emissions[:num_emissions] | |
| except TypeError: | |
| # this occurs when jet_emissions is a single float, so we broadcast it | |
| fp_mmap[itotal + ievent][ijet][ | |
| :, branches.index(branch) | |
| ] = jet_emissions | |
| ''' | |
| # itotal += stop - start | |
| #outfilename = args.fname.replace('.root', '.h5') | |
| #print('Creating {}'.format(outfilename)) | |
| #with h5py.File(outfilename, 'w') as h5f: h5f.create_dataset('mlb_nn',data=output_dict) | |
# Collect the accumulated arrays into a plain list, echoing each one.
# .items() is used instead of .iteritems() so this runs on Python 2 and 3.
to_write = []
#for key,value in tree.arrays(branches).iteritems():
for key, value in output_dict.items():
    print('{} {}'.format(key, value))
    to_write.append(value)
| # we also want to have the signal / background labels | |
| #if("4tops" in args.fname): | |
| # did = np.ones(len(tree.arrays(branches)['jpt'])) | |
| # to_write.append(did) | |
| #if("ttbar" in args.fname): | |
| # did = np.zeros(len(tree.arrays(branches)['jpt'])) | |
| # to_write.append(did) | |
# Write every entry of `output_dict` as its own dataset named "mlb/<key>".
# .items() is used instead of .iteritems() so this runs on Python 2 and 3.
with h5py.File('4tops_LO_FS_ttbar_nonallhad_incl_PhPy8_FS_mc16d.h5', 'w') as h5f:
    #h5f.create_dataset(args.treename, data=fp_mmap)
    #h5f.create_dataset('branches', tree.arrays(branches))
    #h5f.create_dataset('branches',data=to_write)
    for key, value in output_dict.items():
        h5f.create_dataset("mlb/" + key, data=value)
| #del fp_mmap |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment