# Source: GitHub gist mattleblanc/fb5453de9f3f5d086e057a9c1c4f72f4
# Author: @mattleblanc, created April 1, 2019 14:26
import uproot
import h5py
import numpy as np
from tempfile import mkdtemp
import os
import argparse
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that appends each argument's default to its help text.

    Kept as a dedicated subclass so that several formatter behaviours can be
    combined later by adding further argparse formatter base classes.
    """
# Command-line interface: one positional input file plus optional settings.
parser = argparse.ArgumentParser(
    formatter_class=lambda prog: CustomFormatter(prog, max_help_position=30),
    description='Convert SM4t offline NTuples to HDF5 files',
)
parser.add_argument('fname', help='File to process', type=str)

# Options that are currently switched off:
# parser.add_argument('--num-jets', type=int, help='Number of jets per event', default=2)
# parser.add_argument(
#     '--num-emissions', type=int, help='Number of emissions per jet', default=20
# )

# Optional arguments as (flag, type, default, help) rows, registered in order.
for opt_flag, opt_type, opt_default, opt_help in (
    (
        '--num-entries',
        int,
        -1,
        'Number of entries to process in file. -1 to process all.',
    ),
    (
        '--treename',
        str,
        'lundjets_InDetTrackParticles',
        'Name of tree to use in file',
    ),
):
    parser.add_argument(opt_flag, type=opt_type, help=opt_help, default=opt_default)

args = parser.parse_args()
# Names of the tree branches that are read from the input and copied to the
# output, in the order they are requested.
branches = [
    'lpt',
    'jpt',
    'mtw',
    'met_met',
    'mJSum',
    'nMuons',
    'nElectrons',
    'nJets',
    'nBTags_DL1_60',
    'nBTags_DL1_85',
]
# Read every ROOT file matching the glob chunk by chunk, collecting the
# requested branches plus one signal/background label per entry ('dsid').
# NOTE(review): the input glob is hard-coded; args.fname and args.num_entries
# are parsed but not consulted here -- confirm whether that is intended.
branch_chunks = dict((branch, []) for branch in branches)
label_chunks = []
idx = 0
for path, start, stop, arrays in uproot.iterate(
    "in/*.root*", args.treename, branches, reportpath=True, reportentries=True
):
    print((path, start, stop, arrays))

    # The sample type is taken from the file path: '4tops' is labelled 1
    # (signal) and 'ttbar' is labelled 0 (background).
    is_signal = '4tops' in path
    is_background = 'ttbar' in path
    if is_signal == is_background:
        # A path matching neither tag (or both) would give 'dsid' a different
        # length than the branch arrays and silently misalign every label
        # that follows, so stop instead of writing inconsistent data.
        raise ValueError(
            "Cannot label {!r}: expected exactly one of '4tops' or 'ttbar' "
            "in the path".format(path)
        )
    if is_signal:
        print('signal\tidx= {}'.format(idx))
    else:
        print('background,idx= {}'.format(idx))
    label_chunks.append(np.full(stop - start, 1 if is_signal else 0, dtype=int))

    for name, values in arrays.items():
        if isinstance(name, bytes):
            # Branch names may come back as byte strings (seen with uproot on
            # Python 3 -- TODO confirm for the installed version); normalise
            # them so they match the keys built from `branches`.
            name = name.decode('utf-8')
        # Flatten each chunk, as np.append(..., axis=None) would.
        branch_chunks[name].append(np.ravel(values))
    idx += 1

# Join the chunks once at the end rather than growing arrays with np.append
# on every iteration. The zero-length seed promotes branches to at least
# float (labels stay int) and keeps the result defined if no file matched.
output_dict = dict(
    (branch, np.concatenate([np.empty(0, dtype=float)] + parts))
    for branch, parts in branch_chunks.items()
)
output_dict['dsid'] = np.concatenate([np.empty(0, dtype=int)] + label_chunks)

print(output_dict)
print(
    'There are  {}  branches and  {}  entries!\t{}'.format(
        len(output_dict), len(output_dict[branches[0]]), len(output_dict['dsid'])
    )
)
# Write every accumulated array (the branches and 'dsid') as its own dataset
# below the 'mlb' path of the output file.
with h5py.File('4tops_LO_FS_ttbar_nonallhad_incl_PhPy8_FS_mc16d.h5', 'w') as h5f:
    # .items() works on both Python 2 and 3, unlike .iteritems().
    for key, value in output_dict.items():
        print('{} {}'.format(key, value))
        h5f.create_dataset("mlb/" + key, data=value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment