Last active
March 19, 2023 17:53
-
-
Save mneedham/bd69020cf76dd0baac8282b2cf4265e7 to your computer and use it in GitHub Desktop.
DuckDB Relational API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import duckdb | |
import pandas as pd | |
con = duckdb.connect('atp-matches.db') | |
con.sql("INSTALL httpfs") | |
con.sql("LOAD httpfs") | |
csv_files = [ | |
f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv" | |
for year in range(1968,2024) | |
] | |
con.execute(""" | |
CREATE OR REPLACE TABLE matches AS | |
SELECT * FROM read_csv_auto($1, types={'winner_seed': 'VARCHAR', 'loser_seed': 'VARCHAR'}) | |
""", [csv_files]) | |
df = con.execute(""" | |
SELECT * | |
FROM matches | |
WHERE (loser_name = $1 AND winner_name = $2) OR | |
(loser_name = $2 AND winner_name = $1) | |
ORDER BY tourney_date | |
""", ["Richard Gasquet", "Rafael Nadal"]).fetchdf() | |
rel = con.from_df(df) | |
players = con.from_df(pd.DataFrame({"player": ["Rafael Nadal", "Richard Gasquet"]})).set_alias("players") | |
surfaces = con.sql("select distinct surface from matches WHERE surface is not null").set_alias('surfaces') | |
ps = con.sql('select * from players cross join surfaces').set_alias("ps") | |
winners = rel.aggregate("count(*) AS wins, winner_name, surface").set_alias("winners") | |
(ps.join(winners, | |
condition="""ps.player = winners.winner_name AND | |
ps.surface = winners.surface""", | |
how="left") | |
.project("player, ps.surface as surface, coalesce(wins, 0) AS wins") | |
.aggregate("surface, list(row(player,wins)) AS results") | |
) | |
# Make sure to always use the same DuckDB connection when creating relations, or you'll get this erro: | |
# duckdb.Error: Cannot combine LEFT and RIGHT relations of different connections! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment