Last active
April 19, 2022 14:45
-
-
Save braaannigan/2955d81cb27fb0f1a5ff4f990ef6d551 to your computer and use it in GitHub Desktop.
Query optimisation with Polars in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import polars as pl | |
import dask.dataframe as dd | |
# You can get the data from here: | |
# s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv | |
fl = "data/G1_1e8_1e2_0_0.csv" | |
# Pandas and Dask examples based on this blog post | |
# https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/ | |
def pandasGroupby(engine="c"): | |
# Group by with pandas using pyarrow and specified dtypes | |
better_dtypes = { | |
"id1": "string[pyarrow]", | |
"id2": "string[pyarrow]", | |
"id3": "string[pyarrow]", | |
"id4": "int64", | |
"id5": "int64", | |
"id6": "int64", | |
"v1": "int64", | |
"v2": "int64", | |
"v3": "float64", | |
} | |
df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1']) | |
return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}) | |
def polarsGroupby(lazy=False): | |
# Groupby with polars | |
# Supports both lazy evaluation (with query optimisation) and eager evaluation | |
if lazy: | |
df = pl.scan_csv(fl) | |
else: | |
df = pl.read_csv(fl) | |
gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1") | |
# If lazy then evaluate the query | |
if isinstance(gb, pl.LazyFrame): | |
gb = gb.collect() | |
return gb | |
def daskGroupby(): | |
# Groupby with Dask | |
better_dtypes = { | |
"id1": "string[pyarrow]", | |
"id2": "string[pyarrow]", | |
"id3": "string[pyarrow]", | |
"id4": "int64", | |
"id5": "int64", | |
"id6": "int64", | |
"v1": "int64", | |
"v2": "int64", | |
"v3": "float64", | |
} | |
ddf = dd.read_csv(fl, dtype=better_dtypes) | |
return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute() | |
# Check the outputs are the same | |
pan = pandasGroupby() | |
# Convert polars output to pandas and so some minor transformations before comparing | |
pol = polarsGroupby().to_pandas().set_index("id1") | |
pol.index = pol.index.astype("string[pyarrow]") | |
das = daskGroupby() | |
# Test that they provide the same output | |
pd.testing.assert_frame_equal(pan, das) | |
pd.testing.assert_frame_equal(pan, pol) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment