Created
March 4, 2025 19:25
-
-
Save amoeba/2482d878c4da4ece90bbe349ca9ab8b3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How do two Substrait producers produce a plan for this relatively simple | |
# query? | |
# | |
# SELECT * FROM penguins LIMIT 10; | |
# | |
# Let's see: | |
import duckdb | |
import json | |
print(duckdb.__version__) | |
con = duckdb.connect() | |
con.sql("INSTALL substrait FROM community") | |
con.sql("LOAD substrait") | |
con.sql("CREATE TABLE 'penguins' AS SELECT * FROM './data/penguins.parquet';") | |
# File above is available at https://files.brycemecum.com/penguins.parquet | |
query_str = "SELECT * FROM penguins LIMIT 10;" | |
con.sql(query_str) | |
# ┌─────────┬───────────┬────────────────┬───┬─────────────┬─────────┬───────┐ | |
# │ species │ island │ bill_length_mm │ … │ body_mass_g │ sex │ year │ | |
# │ varchar │ varchar │ double │ │ int32 │ varchar │ int32 │ | |
# ├─────────┼───────────┼────────────────┼───┼─────────────┼─────────┼───────┤ | |
# │ Adelie │ Torgersen │ 39.1 │ … │ 3750 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.5 │ … │ 3800 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 40.3 │ … │ 3250 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ NULL │ … │ NULL │ NULL │ 2007 │ | |
# │ Adelie │ Torgersen │ 36.7 │ … │ 3450 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.3 │ … │ 3650 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 38.9 │ … │ 3625 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.2 │ … │ 4675 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 34.1 │ … │ 3475 │ NULL │ 2007 │ | |
# │ Adelie │ Torgersen │ 42.0 │ … │ 4250 │ NULL │ 2007 │ | |
# ├─────────┴───────────┴────────────────┴───┴─────────────┴─────────┴───────┤ | |
# │ 10 rows 8 columns (6 shown) │ | |
# └──────────────────────────────────────────────────────────────────────────┘ | |
# Based upon my reading of the Subtsrait spec, the plan for `SELECT * FROM | |
# penguins LIMIT 10;` should structured like this: | |
# Root | |
# |-- Fetch(offset=0, limit=10) | |
# |-- Read | |
# Up first, DuckDB | |
plan_json = con.sql(f"CALL get_substrait_json('{query_str}')").fetchall()[0][0] | |
print(json.dumps(json.loads(plan_json), indent=4)) | |
# Simplified from the above, the result is structured like this: | |
# | |
# Root | |
# |-- Project | |
# |-- Project | |
# |-- Sort | |
# |-- Project | |
# |-- Join | |
# |-- Left | |
# | |-- Read | |
# |-- Right | |
# |-- Fetch(offset=0, limit=10) | |
# |-- Read | |
# Altogether this plan looks very strange and overcomplicated. | |
# Does this query produce the correct result? | |
con.sql(f"CALL from_substrait_json('{plan_json}')") | |
# ┌─────────┬───────────┬────────────────┬───┬─────────────┬─────────┬───────┐ | |
# │ species │ island │ bill_length_mm │ … │ body_mass_g │ sex │ year │ | |
# │ varchar │ varchar │ double │ │ int32 │ varchar │ int32 │ | |
# ├─────────┼───────────┼────────────────┼───┼─────────────┼─────────┼───────┤ | |
# │ Adelie │ Torgersen │ 39.1 │ … │ 3750 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.5 │ … │ 3800 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 40.3 │ … │ 3250 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ NULL │ … │ NULL │ NULL │ 2007 │ | |
# │ Adelie │ Torgersen │ 36.7 │ … │ 3450 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.3 │ … │ 3650 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 38.9 │ … │ 3625 │ female │ 2007 │ | |
# │ Adelie │ Torgersen │ 39.2 │ … │ 4675 │ male │ 2007 │ | |
# │ Adelie │ Torgersen │ 34.1 │ … │ 3475 │ NULL │ 2007 │ | |
# │ Adelie │ Torgersen │ 42.0 │ … │ 4250 │ NULL │ 2007 │ | |
# │ · │ · │ · │ · │ · │ · │ · │ | |
# │ · │ · │ · │ · │ · │ · │ · │ | |
# │ · │ · │ · │ · │ · │ · │ · │ | |
# │ Adelie │ Dream │ 32.1 │ … │ 3050 │ female │ 2009 │ | |
# │ Adelie │ Dream │ 40.7 │ … │ 3725 │ male │ 2009 │ | |
# │ Adelie │ Dream │ 37.3 │ … │ 3000 │ female │ 2009 │ | |
# │ Adelie │ Dream │ 39.0 │ … │ 3650 │ male │ 2009 │ | |
# │ Adelie │ Dream │ 39.2 │ … │ 4250 │ male │ 2009 │ | |
# │ Adelie │ Dream │ 36.6 │ … │ 3475 │ female │ 2009 │ | |
# │ Adelie │ Dream │ 36.0 │ … │ 3450 │ female │ 2009 │ | |
# │ Adelie │ Dream │ 37.8 │ … │ 3750 │ male │ 2009 │ | |
# │ Adelie │ Dream │ 36.0 │ … │ 3700 │ female │ 2009 │ | |
# │ Adelie │ Dream │ 41.5 │ … │ 4000 │ male │ 2009 │ | |
# ├─────────┴───────────┴────────────────┴───┴─────────────┴─────────┴───────┤ | |
# │ 152 rows (20 shown) 8 columns (6 shown) │ | |
# └──────────────────────────────────────────────────────────────────────────┘ | |
# No. This looks like a bug in the DuckDB extension. | |
# Next up, DataFusion | |
import datafusion | |
import pyarrow as pa | |
from substrait.proto import Plan | |
ctx = datafusion.SessionContext(datafusion.SessionConfig()) | |
# Manually specify schema to avoid dictionary columns | |
schema = pa.schema( | |
[ | |
pa.field("species", pa.string()), | |
pa.field("island", pa.string()), | |
pa.field("bill_length_mm", pa.float64()), | |
pa.field("bill_depth_mm", pa.float64()), | |
pa.field("body_mass_g", pa.int32()), | |
pa.field("sex", pa.string()), | |
pa.field("year", pa.int32()), | |
] | |
) | |
ctx.register_parquet("penguins", "./data/penguins.parquet", schema=schema) | |
plan_bytes = datafusion.substrait.Serde.serialize_bytes(query_str, ctx) | |
# Convert to a proto Plan so we can look at it | |
p = Plan() | |
p.ParseFromString(plan_bytes) | |
p | |
# This looks pretty good and the only note I have is that the select projection | |
# on the ReadRel isn't needed. | |
# Now let's see if it produces the right result: | |
df_substrait_plan = datafusion.substrait.Serde.deserialize_bytes(plan_bytes) | |
df_logical_plan = datafusion.substrait.Consumer.from_substrait_plan(ctx, df_substrait_plan) | |
ctx.create_dataframe_from_logical_plan(df_logical_plan) | |
# DataFrame() | |
# +---------+-----------+----------------+---------------+-------------+--------+------+ | |
# | species | island | bill_length_mm | bill_depth_mm | body_mass_g | sex | year | | |
# +---------+-----------+----------------+---------------+-------------+--------+------+ | |
# | Adelie | Torgersen | 39.1 | 18.7 | 3750 | male | 2007 | | |
# | Adelie | Torgersen | 39.5 | 17.4 | 3800 | female | 2007 | | |
# | Adelie | Torgersen | 40.3 | 18.0 | 3250 | female | 2007 | | |
# | Adelie | Torgersen | | | | | 2007 | | |
# | Adelie | Torgersen | 36.7 | 19.3 | 3450 | female | 2007 | | |
# | Adelie | Torgersen | 39.3 | 20.6 | 3650 | male | 2007 | | |
# | Adelie | Torgersen | 38.9 | 17.8 | 3625 | female | 2007 | | |
# | Adelie | Torgersen | 39.2 | 19.6 | 4675 | male | 2007 | | |
# | Adelie | Torgersen | 34.1 | 18.1 | 3475 | | 2007 | | |
# | Adelie | Torgersen | 42.0 | 20.2 | 4250 | | 2007 | | |
# +---------+-----------+----------------+---------------+-------------+--------+------+ | |
# This looks correct. | |
import duckdb | |
con = duckdb.connect() | |
con.sql( | |
"LOAD '/Users/bryce/src/substrait-io/duckdb-substrait-extension/build/release/extension/substrait/substrait.duckdb_extension'" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment