Skip to content

Instantly share code, notes, and snippets.

@cpcloud
Last active November 4, 2022 13:33
Show Gist options
  • Select an option

  • Save cpcloud/4676e93f505c9fb9071f6be7d1fc14c9 to your computer and use it in GitHub Desktop.

Select an option

Save cpcloud/4676e93f505c9fb9071f6be7d1fc14c9 to your computer and use it in GitHub Desktop.
comparision of pandas and ibis + duckdb on a real use case
import contextlib
import time
from datetime import timedelta
import ibis
import pandas as pd
from ibis import _
def pandas():
print("pandas")
names = pd.read_csv("name.basics.tsv", sep="\t", na_values="\\N", engine="pyarrow")
names = names.loc[names.primaryProfession.str.contains(".*act(?:or|ress).*")]
titles = pd.read_csv("title.basics.tsv", sep="\t", na_values="\\N")
# principal cast/crew
# only actors/actresses
principals = pd.read_csv("title.principals.tsv.gz", sep="\t", na_values="\\N")
actors = principals.loc[principals.category.isin(["actor", "actress"])]
# assign actor names to titles
named_actors = pd.merge(pd.merge(actors, names, on="nconst"), titles, on="tconst")
# bring in the ratings
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")
return (
pd.merge(
named_actors.assign(imdbId=lambda df: df.tconst.str[2:].astype("int32")),
pd.merge(ratings, links, on="movieId"),
on="imdbId",
)
.groupby("primaryName")
.agg(avg_rating=("rating", "mean"), n=("rating", "count"))
.sort_values(by="n", ascending=False)
)
def duckdb():
opts = dict(quote="", nullstr="\\N")
names = ibis.file("name.basics.tsv", **opts).filter(
_.primaryProfession.re_search(".*act(or|ress).*")
)
titles = ibis.file("title.basics.tsv", **opts)
# principal cast/crew
# only actors/actresses
actors = ibis.file("title.principals.tsv.gz", **opts).filter(
_.category.isin(["actor", "actress"])
)
# assign actor names to titles
named_actors = (
actors.join(names, "nconst")
.join(titles, "tconst")
.mutate(imdbId=_.tconst[2:].cast("int32"))
)
# bring in the ratings
links = ibis.file("links.csv")
ratings = ibis.file("ratings.csv")
rated_movies = ratings.join(links, "movieId")
return (
named_actors.join(rated_movies, "imdbId")
.group_by(name=_.primaryName)
.aggregate(avg_rating=_.rating.mean(), n=_.count())
.order_by(_.n.desc())
)
@contextlib.contextmanager
def timer(label: str) -> None:
start = time.time()
yield
stop = time.time()
seconds = stop - start
print(f"{label}: {timedelta(seconds=seconds)}")
if __name__ == "__main__":
import argparse
import os
from pathlib import Path
ibis.options.interactive = True
p = argparse.ArgumentParser()
p.add_argument("-p", "--pandas", action="store_true")
args = p.parse_args()
d = Path(__file__).parent
os.chdir(d)
func = pandas if args.pandas else duckdb
with timer("execute"):
if not args.pandas:
with timer("build expr"):
joined = func()
else:
joined = func()
print(joined)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment