Created
July 5, 2023 09:18
-
-
Save riga/157625f7323e529a60d83ef5bec68c1d to your computer and use it in GitHub Desktop.
Merge two NanoAOD files while removing duplicate events
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
""" | |
Script that merges the events tree of two NanoAOD files, removing duplicates identified | |
by event number, run number and luminosity block. | |
> nano_unique.py in1.root in2.root out.root | |
NOTE: This is just a first draft whose performance could surely be improved | |
in case there is an option to skip deserializing all branches with uproot | |
but still being able to save them in a second file. | |
""" | |
from __future__ import annotations | |
import os | |
import math | |
from functools import partial | |
from typing import Any | |
import numpy as np | |
import awkward as ak | |
import uproot | |
try: | |
import tqdm | |
HAS_TQDM = True | |
except ImportError: | |
HAS_TQDM = False | |
def nano_unique( | |
input_path1: str, | |
input_path2: str, | |
output_path: str, | |
tree_name: str = "Events", | |
keep_branches: Any | None = None, | |
step_size: int = 100000, | |
verbose: bool = False, | |
) -> tuple[int, int]: | |
""" | |
Joins two NanoAOD files located at *input_path1* and *input_path2*, removes duplicates | |
identified by the (event, run, luminosityBlock) triplet, and saves the joined file at | |
*output_path*. The output file will only contain a tree named *tree_name*, i.e., any other | |
objects contained in one of the input files are dropped. In case a file already exists at | |
*output_path*, it is removed first. Please note that events contained in *input_path1* are | |
priotized in case a duplicate is detected. | |
The output file is filled in chunks with a certain *step_size*, each one resulting in a new | |
basket in the output file. It is recommended to choose this value as large as possible ( | |
depending on the available memory), to speed up the merging process but also to create files | |
that are faster to read. *keep_branches* is forwarded as *filter_name* to | |
:py:meth:`uproot.TTree.iterate` to select which branches to keep. If set, the three index | |
branches (event, run, luminosityBlock) should be accepted. For more info, see this | |
`link <https://uproot.readthedocs.io/en/latest/uproot.behaviors.TTree.TTree.html#arrays>`__. | |
The number of written and overlapping events is returned in a 2-tuple. | |
""" | |
# expand variables | |
expand = lambda path: os.path.abspath(os.path.expandvars(os.path.expanduser(path))) | |
input_path1 = expand(input_path1) | |
input_path2 = expand(input_path2) | |
output_path = expand(output_path) | |
# prepare the output | |
output_dir = os.path.dirname(output_path) | |
if not os.path.exists(output_dir): | |
os.makedirs(output_dir) | |
elif os.path.exists(output_path): | |
os.remove(output_path) | |
output_file = uproot.create(output_path) | |
# get input trees | |
tree1 = uproot.open(input_path1)[tree_name] | |
tree2 = uproot.open(input_path2)[tree_name] | |
# read index columns over the full reference file | |
index_columns = ["event", "run", "luminosityBlock"] | |
index1 = tree1.arrays(index_columns) | |
# prepare counts | |
n_written = 0 | |
n_overlap = 0 | |
# iteration helper | |
def iterate(tree, name): | |
if verbose: | |
print(f"iterating through {name} tree with {tree.num_entries} events") | |
progress = ( | |
partial(tqdm.tqdm, total=int(math.ceil(tree.num_entries / step_size))) | |
if verbose and HAS_TQDM else | |
(lambda gen: gen) | |
) | |
return progress(tree.iterate(step_size=step_size, filter_name=keep_branches)) | |
# fill chunks of the first tree | |
# note: if there was a way to properly "update" and extend existing trees with uproot, one | |
# could just copy input_path1 to output_path first and skip this first loop | |
for chunk1 in iterate(tree1, "first"): | |
# update counts | |
n_written += len(chunk1) | |
# save or extend the tree | |
# workaround: according to the uproot docs, it should be possible to just assign a flat | |
# awkward array to an output file to create a tree; however, it seems like | |
# variable length arrays, although having a standard type (e.g. "var * float32") | |
# are not properly accepted; an issue will be opened for this | |
chunk1 = dict(zip(chunk1.fields, ak.unzip(chunk1))) | |
# end of workaround | |
if tree_name in output_file: | |
output_file[tree_name].extend(chunk1) | |
else: | |
output_file[tree_name] = chunk1 | |
# fill chunks of the second tree | |
for chunk2 in iterate(tree2, "second"): | |
# determine a mask of events in tree2 that are also in tree1 | |
mask2 = np.isin(chunk2[index_columns], index1, assume_unique=True) | |
chunk2 = chunk2[~mask2] | |
# update counts | |
n_written += len(chunk2) | |
n_overlap += ak.sum(mask2) | |
# skip the chunk if all events are overlapping | |
if ak.all(mask2): | |
continue | |
# save or extend the tree | |
# workaround: same as above | |
chunk2 = dict(zip(chunk2.fields, ak.unzip(chunk2))) | |
# end of workaround | |
output_file[tree_name].extend(chunk2) | |
if verbose: | |
print(f"written {n_written} and found {n_overlap} overlapping event(s)") | |
return n_written, n_overlap | |
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser( | |
description="joins two NanoAOD files and removes duplicate events", | |
) | |
parser.add_argument( | |
"file1", | |
help="path to the first file", | |
) | |
parser.add_argument( | |
"file2", | |
help="path to the second file", | |
) | |
parser.add_argument( | |
"output", | |
help="path to the output file to be created", | |
) | |
parser.add_argument( | |
"--tree", | |
"-t", | |
default="Events", | |
help="name of the trees to merge and create; default: Events", | |
) | |
parser.add_argument( | |
"--step-size", | |
"-s", | |
type=int, | |
default=100000, | |
help="step size for iterations; default: 100000", | |
) | |
parser.add_argument( | |
"--verbose", | |
"-v", | |
action="store_true", | |
help="verbose output, potentially with tqdm if installed", | |
) | |
args = parser.parse_args() | |
nano_unique( | |
input_path1=args.file1, | |
input_path2=args.file2, | |
output_path=args.output, | |
tree_name=args.tree, | |
step_size=args.step_size, | |
verbose=args.verbose, | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment