Last active
November 22, 2020 05:37
-
-
Save aminnj/7d27376886a2a5414b61841aa7457e8a to your computer and use it in GitHub Desktop.
utilities for pandas + ROOT (via monkeypatching pandas)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import numpy as np | |
import pandas as pd | |
import uproot | |
from yahist import Hist1D, Hist2D | |
from pandas.core.base import PandasObject | |
from cachetools import cached, LRUCache | |
from cachetools.keys import hashkey | |
def myhasher(*args, **kwargs): | |
# pd.DataFrame disabled __hash__ because they are mutable | |
hbins = kwargs.pop("bins", 0) | |
if "ndarray" in str(type(hbins)): | |
hbins = hash(tuple(hbins.tolist())) | |
if type(hbins) in [list, tuple] and len(hbins) == 2: | |
hbins = hash(map(tuple,hbins)) | |
return hashkey(args[0]._data, args[1:], hbins, **kwargs) | |
@cached(cache=LRUCache(maxsize=256), key=myhasher) | |
def tree_draw(df, varexp, sel="", **kwargs): | |
from tokenize import tokenize, NAME, OP | |
from io import BytesIO | |
tokens = tokenize(BytesIO(f"{varexp}${sel}".encode("utf-8")).readline) | |
nops = 0 | |
for x in tokens: | |
toknum, tokval = x[:2] | |
nops += (toknum == NAME) and (tokval in ["and", "or"]) | |
nops += toknum == OP | |
fast = nops < 5 | |
twodim = ":" in varexp | |
if fast and not twodim: | |
raw = hacky_query_eval(df, varexp, sel) | |
# performance hit for float16's. copy to float32 first. | |
if raw.dtype in ["float16"]: | |
vals = raw.astype("float32") | |
else: | |
vals = raw | |
if "weights" in kwargs: | |
raise Exception("`weights` kwarg not supported for `fast=True` (yet?)") | |
return Hist1D(vals, **kwargs) | |
if not sel: | |
mask = slice(None) | |
else: | |
mask = df.eval(sel) | |
if twodim: | |
assert(varexp.count(":") == 1) | |
varexp1, varexp2 = varexp.split(":") | |
vals = np.c_[df[mask].eval(varexp1), df[mask].eval(varexp2)] | |
return Hist2D(vals, **kwargs) | |
else: | |
vals = df[mask].eval(varexp) | |
return Hist1D(vals, **kwargs) | |
PandasObject.draw = tree_draw | |
def read_root(filename, treename="t", columns=None, progress=False, **kwargs): | |
""" | |
Read ROOT file containing one TTree into pandas DataFrame. | |
Thin wrapper around `uproot.iterate`. | |
filename: filename(s)/file pattern(s) | |
treename: name of input TTree | |
progress: show tqdm progress bar? | |
columns: list of columns ("branches") to read (default of `None` reads all) | |
**kwargs: extra kwargs to pass to `uproot.iterate` | |
Passing `entrysteps=[(0, 10)]` will read the first 10 rows, for example. | |
""" | |
# entrysteps of None iterates by basket to match `dataframe_to_ttree` | |
iterable = uproot.iterate( | |
filename, | |
treename, | |
columns, | |
entrysteps=kwargs.pop("entrysteps", None), | |
outputtype=dict, | |
namedecode="ascii", | |
**kwargs, | |
) | |
if progress: | |
from tqdm.auto import tqdm | |
iterable = tqdm(iterable) | |
f = uproot.open(filename) | |
categorical_columns = [k.decode().split("_",1)[1].rsplit(";",1)[0] for k in f.keys() if k.startswith(b"categories_")] | |
def to_df(chunk): | |
for column in list(chunk.keys()): | |
vals = chunk[column] | |
if (vals.dtype == "object") and (column+"_strn" in chunk.keys()): | |
del chunk[column+"_strn"] | |
import awkward | |
chunk[column] = awkward.StringArray.fromjagged(vals.astype("uint8")) | |
elif column in categorical_columns: | |
sep = "<!SEP!>" | |
categories = np.array(f[f"categories_{column}"].decode().split(sep)) | |
chunk[column] = categories[vals] | |
return pd.DataFrame(chunk) | |
df = pd.concat(map(to_df, iterable), ignore_index=True, sort=True) | |
return df | |
setattr(pd, "read_root", read_root) | |
def to_root( | |
df, filename, treename="t", chunksize=1e6, compression=uproot.LZ4(1), progress=False | |
): | |
""" | |
Writes ROOT file containing one TTree with the input pandas DataFrame. | |
filename: name of output file | |
treename: name of output TTree | |
chunksize: number of rows per basket | |
compression: uproot compression object (LZ4, ZLIB, LZMA, or None) | |
progress: show tqdm progress bar? | |
""" | |
tree_dtypes = dict() | |
string_branches = [] | |
category_branches = [] | |
for bname, dtype in df.dtypes.items(): | |
if (dtype == "object"): | |
if type(df.iloc[0][bname]) in [str]: | |
tree_dtypes[bname] = uproot.newbranch(np.dtype(">i2"), size=bname+"_strn", compression=uproot.ZLIB(6)) | |
string_branches.append(bname) | |
else: | |
raise Exception(f"Don't know what kind of branch {bname} is.") | |
elif (str(dtype) == "category"): | |
tree_dtypes[bname] = np.int8 | |
category_branches.append(bname) | |
else: | |
tree_dtypes[bname] = dtype | |
with uproot.recreate(filename, compression=compression) as f: | |
t = uproot.newtree(tree_dtypes) | |
f[treename] = t | |
for bname in category_branches: | |
sep = "<!SEP!>" | |
f[f"categories_{bname}"] = sep.join(df[bname].cat.categories) | |
chunksize = int(chunksize) | |
iterable = range(0, len(df), chunksize) | |
if progress: | |
from tqdm.auto import tqdm | |
iterable = tqdm(iterable) | |
for i in iterable: | |
chunk = df.iloc[i : i + chunksize] | |
basket = dict() | |
for column in chunk.columns: | |
if column in string_branches: | |
arr = chunk[column].values.astype("str") | |
import awkward | |
jagged = awkward.StringArray.fromnumpy(arr)._content | |
jagged = jagged[jagged != 0] | |
basket[column] = jagged | |
basket[column+"_strn"] = jagged.counts | |
elif column in category_branches: | |
basket[column] = chunk[column].cat.codes.values.astype(np.int8) | |
else: | |
basket[column] = chunk[column].values | |
f[treename].extend(basket) | |
PandasObject.to_root = to_root | |
def iter_draw(path, varexp, sel="", treepath="t", progress=False, entrysteps="50MB", **kwargs): | |
""" | |
Same as `tree_draw`, except this requires an additional first argument for the path/pattern | |
of input root files. Tree name is specified via `treepath`. | |
Iterates over the files in chunks of `entrysteps` (as per `uproot.iterate`), reading | |
only the branches deemed necessary according to `find_variables`. | |
""" | |
hists = [] | |
t0 = time.time() | |
nevents = 0 | |
edges = None | |
branches = find_variables(varexp + ":" + sel) | |
iterable = enumerate(uproot.iterate(path, | |
treepath="t", | |
entrysteps=entrysteps, | |
branches=branches, | |
namedecode="ascii", | |
outputtype=pd.DataFrame, | |
)) | |
if progress: | |
from tqdm.auto import tqdm | |
iterable = tqdm(iterable) | |
for i,df in iterable: | |
nevents += len(df) | |
if i >= 1 and "bins" not in kwargs: | |
kwargs["bins"] = edges | |
h = df.draw(varexp, sel, **kwargs) | |
edges = h.edges | |
hists.append(h) | |
h = sum(hists) | |
t1 = time.time() | |
if progress: | |
print(f"Processed {nevents} in {t1-t0:.2f}s ({1e-6*nevents/(t1-t0):.2f}MHz)") | |
return h | |
def find_variables(expr): | |
""" | |
Given a string like "DV_x:DV_y:(lxy < DV_x+1) and (lxy>1)", returns a list of | |
["DV_x", "DV_y", "lxy"] | |
(i.e., extracts what seem to be column names) | |
""" | |
from io import BytesIO | |
from tokenize import tokenize, NAME | |
varnames = [] | |
g = list(tokenize(BytesIO(expr.encode("utf-8")).readline)) | |
for ix, x in enumerate(g): | |
toknum = x[0] | |
tokval = x[1] | |
if toknum != NAME: | |
continue | |
if ix > 0 and g[ix - 1][1] in ["."]: | |
continue | |
if ix < len(g) - 1 and g[ix + 1][1] in [".", "("]: | |
continue | |
if tokval in ["and", "or"]: | |
continue | |
varnames.append(tokval) | |
varnames = list(set(varnames)) | |
return varnames | |
def hacky_query_eval(df, varstr, selstr="", verbose=False): | |
""" | |
Please don't read/use. This is dangerous and stupid, kind of like | |
integrating a function by printing out a plot, coloring the area under it in red, | |
faxing it to yourself, then counting red pixels to get the area. | |
Basically I wanted some way to convert | |
df.query("dimuon_mass > 5 and pass_baseline_iso").eval("dimuon_mass").mean() | |
into | |
df["dimuon_mass"][ (df["dimuon_mass"] > 5) & (df["pass_baseline_iso"]) ].mean() | |
because the latter doesn't make an intermediate copy of all the columns with query(), | |
and it also doesn't do jitting with numexpr. In principle, this is much faster to execute. | |
Usage: | |
arr = hacky_query_eval( | |
df_data, | |
varstr = "dimuon_mass", | |
selstr = "pass_baseline_iso and 0<logabsetaphi<1.25", | |
) | |
print(arr.mean()) | |
""" | |
from pandas.core.computation.expr import Expr | |
from pandas.core.computation.scope import Scope | |
env = Scope( | |
1, global_dict=globals(), local_dict=locals(), resolvers=[df], target=None, | |
) | |
def inject_df(s): | |
""" | |
convert expression string like (a > 1) to (df["a"] > 1) | |
so that it can be eval'd later | |
""" | |
expr = Expr(s, env=env, parser="pandas") | |
self = expr._visitor | |
def visit_Name_hack(node, **kwargs): | |
result = self.term_type(node.id, self.env, **kwargs) | |
result._name = f'df["{result._name}"]' | |
return result | |
def _maybe_downcast_constants_hack(left, right): | |
return left, right | |
expr._visitor.visit_Name = visit_Name_hack | |
expr._visitor._maybe_downcast_constants = _maybe_downcast_constants_hack | |
expr.terms = expr.parse() | |
return str(expr) | |
varexpr = inject_df(varstr) | |
toeval = f"({varexpr})" | |
if selstr: | |
selexpr = str(inject_df(selstr)) | |
toeval += f"[{selexpr}].values" | |
if verbose: | |
print(f"Evaluating string: {toeval}") | |
result = eval(toeval) | |
return result | |
if __name__ == "__main__": | |
import numpy as np | |
N = int(1e5) | |
# make dataframe | |
df = pd.DataFrame( | |
dict( | |
mass=np.random.normal(3.0, 0.1, N), | |
foo=np.random.random(N), | |
bar=np.random.random(N), | |
) | |
) | |
# write out 3 columns to a root file | |
df.to_root("test.root") | |
# read back two columns | |
df = pd.read_root("test.root", columns=["mass", "foo"]) | |
# make 1D histogram of `mass+0.1` for rows where `0.1<foo<0.2` | |
# kwargs after first two required args are passed to yahist's Hist1D() | |
df.draw("mass + 0.1", "0.1<foo<0.2", bins="200,0,10").plot(histtype="step") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
then
...will add some functions to pandas namespace and for dataframes:
Write out dataframes to a ROOT file (including strings, though these are slow and you should
.astype("category")
if possible):Read ROOT files and specify certain columns and/or a range of rows. Let's just say that ROOT is the original
parquet
(just with different words: rows -> entries/events, rowgroups -> baskets, columns -> branches).And, for those familiar with
TTree::Draw()
, you can draw directly from a dataframe. This will make a 1D histogram ofmass+0.1
for rows where0.1<foo<0.2
. All kwargs after first two required args are passed to yahist's Hist1D(). Seepip install yahist
for details on the (nice and simple) histogram object.or 2D with "x:y"
Lastly, this will read root files and make a histogram in one pass (chunked reading and only branches that are needed)