braaannigan · April 19, 2022 14:45
diff --git a/queries.py b/queries.py
 import pandas as pd
 import polars as pl
 import dask.dataframe as dd

 # You can get the data from here:
 # s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv
 fl = "data/G1_1e8_1e2_0_0.csv"
 # Pandas and Dask examples based on this blog post
 # https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/


 def pandasGroupby(engine="c"):
    # Group by with pandas using pyarrow and specified dtypes
    better_dtypes = {
        "id1": "string[pyarrow]",
        "id2": "string[pyarrow]",
        "id3": "string[pyarrow]",
        "id4": "int64",
        "id5": "int64",
        "id6": "int64",
        "v1": "int64",
        "v2": "int64",
        "v3": "float64",
    }

    df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1'])
    return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})


 def polarsGroupby(lazy=False):
    # Groupby with polars
    # Supports both lazy evaluation (with query optimisation) and eager evaluation
    if lazy:
        df = pl.scan_csv(fl)
    else:
        df = pl.read_csv(fl)
    gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1")
    # If lazy then evaluate the query
    if isinstance(gb, pl.LazyFrame):
        gb = gb.collect()
    return gb


 def daskGroupby():
    # Groupby with Dask
    better_dtypes = {
        "id1": "string[pyarrow]",
        "id2": "string[pyarrow]",
        "id3": "string[pyarrow]",
        "id4": "int64",
        "id5": "int64",
        "id6": "int64",
        "v1": "int64",
        "v2": "int64",
        "v3": "float64",
    }
    ddf = dd.read_csv(fl, dtype=better_dtypes)
    return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()


 # Check the outputs are the same
 pan = pandasGroupby()
 # Convert polars output to pandas and so some minor transformations before comparing
 pol = polarsGroupby().to_pandas().set_index("id1")
 pol.index = pol.index.astype("string[pyarrow]")

 das = daskGroupby()
 # Test that they provide the same output
 pd.testing.assert_frame_equal(pan, das)
 pd.testing.assert_frame_equal(pan, pol)
	import pandas as pd
	import polars as pl
	import dask.dataframe as dd

	# You can get the data from here:
	# s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv
	fl = "data/G1_1e8_1e2_0_0.csv"
	# Pandas and Dask examples based on this blog post
	# https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/


	def pandasGroupby(engine="c"):
	# Group by with pandas using pyarrow and specified dtypes
	better_dtypes = {
	"id1": "string[pyarrow]",
	"id2": "string[pyarrow]",
	"id3": "string[pyarrow]",
	"id4": "int64",
	"id5": "int64",
	"id6": "int64",
	"v1": "int64",
	"v2": "int64",
	"v3": "float64",
	}

	df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1'])
	return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})


	def polarsGroupby(lazy=False):
	# Groupby with polars
	# Supports both lazy evaluation (with query optimisation) and eager evaluation
	if lazy:
	df = pl.scan_csv(fl)
	else:
	df = pl.read_csv(fl)
	gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1")
	# If lazy then evaluate the query
	if isinstance(gb, pl.LazyFrame):
	gb = gb.collect()
	return gb


	def daskGroupby():
	# Groupby with Dask
	better_dtypes = {
	"id1": "string[pyarrow]",
	"id2": "string[pyarrow]",
	"id3": "string[pyarrow]",
	"id4": "int64",
	"id5": "int64",
	"id6": "int64",
	"v1": "int64",
	"v2": "int64",
	"v3": "float64",
	}
	ddf = dd.read_csv(fl, dtype=better_dtypes)
	return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()


	# Check the outputs are the same
	pan = pandasGroupby()
	# Convert polars output to pandas and so some minor transformations before comparing
	pol = polarsGroupby().to_pandas().set_index("id1")
	pol.index = pol.index.astype("string[pyarrow]")

	das = daskGroupby()
	# Test that they provide the same output
	pd.testing.assert_frame_equal(pan, das)
	pd.testing.assert_frame_equal(pan, pol)