@lmmx
Last active January 17, 2026 22:12
Eliminating a Cartesian product in Polars dataframe handling https://cog.spin.systems/wikidata-digging-avoiding-combinatorial-explosion
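The scripts below first reproduce the problem and then the fix. As a minimal, self-contained sketch of the failure mode (toy data, not the real Wikidata chunk): exploding two list-of-structs columns in sequence yields len(a) * len(b) rows per input row, the cartesian product this gist is about avoiding.

import polars as pl

toy = pl.DataFrame(
    {
        "prop_labels": [
            [{"key": "en", "value": "capital"}, {"key": "de", "value": "Hauptstadt"}]
        ],
        "dv_labels": [
            [{"key": "en", "value": "Paris"}, {"key": "fr", "value": "Paris"}]
        ],
    }
)
# Each explode multiplies rows by that list's length: 1 row -> 2 -> 4
print(toy.explode("prop_labels").explode("dv_labels").height)  # 4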
import polars as pl
from pathlib import Path

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")

lf = pl.scan_parquet(claims_file)
# Peek at a single claim to inspect the schema after unnesting
base = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .head(1)
    .collect()
)
print(base)
print()
print(base.schema)
import polars as pl
from pathlib import Path

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]

lf = pl.scan_parquet(claims_file)
base = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    .head(1)
    .collect()
)
# Exploding property-labels and then dv_labels pairs every property-label
# language with every datavalue-label language: a cartesian product
cartesian = (
    base.lazy()
    .explode("property-labels")
    .with_columns(
        pl.col("property-labels")
        .struct.rename_fields(["prop_lang", "property_label"])
    )
    .unnest("property-labels")
    .with_columns(pl.col("datavalue").struct.field("labels").alias("dv_labels"))
    .explode("dv_labels")
    .with_columns(
        pl.col("dv_labels")
        .struct.rename_fields(["dv_lang", "datavalue_label"])
    )
    .unnest("dv_labels")
    .select(["prop_lang", "dv_lang", "property_label", "datavalue_label"])
    .collect()
)
print(cartesian)
print(f"\nrows produced: {cartesian.height}")
import polars as pl
from pathlib import Path

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")

lf = pl.scan_parquet(claims_file)
df = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .head(3)
    # Extract the language keys from datavalue.labels without exploding:
    # list.eval maps over each list's elements and keeps one row per claim
    .with_columns(
        pl.col("datavalue")
        .struct.field("labels")
        .list.eval(pl.element().struct.field("key"))
        .alias("_dv_langs")
    )
    .select(["property", "_dv_langs"])
    .collect()
)
print(df)
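The list.eval trick above is the key move: it maps an expression over each list's elements without changing the frame's row count. A self-contained toy version:

import polars as pl

demo = pl.DataFrame(
    {"labels": [[{"key": "en", "value": "cat"}, {"key": "fr", "value": "chat"}]]}
)
# Produces a list[str] column (["en", "fr"]) alongside the original row
print(
    demo.with_columns(
        pl.col("labels").list.eval(pl.element().struct.field("key")).alias("langs")
    )
)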
import polars as pl
from pathlib import Path

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]

lf = pl.scan_parquet(claims_file)
base = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    .head(1)
)
# Explode property-labels only, then keep just the languages that also
# appear in datavalue.labels: no second explode, so no cartesian product
efficient_stage = (
    base.with_columns(
        pl.col("datavalue")
        .struct.field("labels")
        .list.eval(pl.element().struct.field("key"))
        .alias("_dv_langs")
    )
    .explode("property-labels")
    .with_columns(
        pl.col("property-labels")
        .struct.rename_fields(["language", "property_label"])
    )
    .unnest("property-labels")
    .filter(pl.col("language").is_in(pl.col("_dv_langs")))
    .select(["language", "property_label"])
    .collect()
)
print(efficient_stage)
print(f"\nrows kept: {efficient_stage.height}")
#!/usr/bin/env python3
"""Compare performance of cartesian vs efficient approach."""
import time
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]


def get_base(lf: pl.LazyFrame, n: int) -> pl.DataFrame:
    return (
        lf.select(pl.col("claims").explode().struct.unnest())
        .drop("key")
        .explode("value")
        .unnest("value")
        .unnest("mainsnak")
        .head(n)
        .collect()
    )


def transform_cartesian(base: pl.DataFrame) -> pl.DataFrame:
    """Original approach - cartesian product then filter."""
    return (
        base.lazy()
        .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
        .explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["prop_lang", "property_label"]
            )
        )
        .unnest("property-labels")
        .with_columns(pl.col("datavalue").struct.field("labels").alias("dv_labels"))
        .explode("dv_labels")
        .with_columns(
            pl.col("dv_labels").struct.rename_fields(["dv_lang", "datavalue_label"])
        )
        .unnest("dv_labels")
        .filter(pl.col("prop_lang") == pl.col("dv_lang"))
        .rename({"prop_lang": "language"})
        .drop("dv_lang")
        .collect()
    )


def transform_efficient(base: pl.DataFrame) -> pl.DataFrame:
    """Optimized - filter before extraction, use map_elements for lookup."""
    return (
        base.lazy()
        .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
        # Get available languages in datavalue.labels
        .with_columns(
            pl.col("datavalue")
            .struct.field("labels")
            .list.eval(pl.element().struct.field("key"))
            .alias("_dv_langs")
        )
        # Explode property-labels only
        .explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["language", "property_label"]
            )
        )
        .unnest("property-labels")
        # Filter to languages that exist in datavalue.labels
        .filter(pl.col("language").is_in(pl.col("_dv_langs")))
        # Extract matching label without exploding
        .with_columns(
            pl.struct(["datavalue", "language"])
            .map_elements(
                lambda row: next(
                    (
                        item["value"]
                        for item in (row["datavalue"]["labels"] or [])
                        if item["key"] == row["language"]
                    ),
                    None,
                ),
                return_dtype=pl.String,
            )
            .alias("datavalue_label")
        )
        .drop("_dv_langs")
        .collect()
    )


# Test with increasing sizes
lf = pl.scan_parquet(claims_file)
for n in [10, 50, 100, 500]:
    print(f"\n{'=' * 60}")
    print(f"Testing with {n} base claims")
    print("=" * 60)
    base = get_base(lf, n)
    wb_count = base.filter(pl.col("datatype").is_in(WIKIBASE_TYPES)).height
    print(f" ({wb_count} are wikibase-item/property)")

    t0 = time.perf_counter()
    r1 = transform_cartesian(base)
    t1 = time.perf_counter()
    print(f"Cartesian: {t1 - t0:.3f}s, {len(r1)} rows")

    t0 = time.perf_counter()
    r2 = transform_efficient(base)
    t1 = time.perf_counter()
    print(f"Efficient: {t1 - t0:.3f}s, {len(r2)} rows")

    # Verify same results
    assert len(r1) == len(r2), f"Row count mismatch: {len(r1)} vs {len(r2)}"
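The remaining cost in transform_efficient is map_elements, which invokes the Python lambda once per row and so cannot be vectorised or parallelised by Polars. A toy contrast between a Python UDF and the equivalent native expression (illustrative only):

import polars as pl

demo = pl.DataFrame({"x": [1, 2, 3]})
py_udf = demo.with_columns(
    pl.col("x").map_elements(lambda v: v * 2, return_dtype=pl.Int64).alias("y")
)
native = demo.with_columns((pl.col("x") * 2).alias("y"))
assert py_udf.equals(native)  # same result; only the native version stays in Rust

The next script benchmarks the map_elements lookup against a pure-Polars join.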
#!/usr/bin/env python3
"""Benchmark map_elements vs join-based lookup for label extraction."""
import time
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]


def get_base(n: int | None = None) -> pl.LazyFrame:
    """Get base claims data, optionally limited to n rows."""
    lf = (
        pl.scan_parquet(claims_file)
        .select(pl.col("claims").explode().struct.unnest())
        .drop("key")
        .explode("value")
        .unnest("value")
        .unnest("mainsnak")
        .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    )
    if n is not None:
        lf = lf.head(n)
    return lf


def transform_map_elements(base: pl.LazyFrame) -> pl.LazyFrame:
    """Current approach using map_elements (slow, no parallelization)."""
    return (
        base.with_columns(
            pl.col("datavalue")
            .struct.field("labels")
            .list.eval(pl.element().struct.field("key"))
            .alias("dv_langs")
        )
        .explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["language", "property_label"]
            )
        )
        .unnest("property-labels")
        .filter(pl.col("language").is_in(pl.col("dv_langs")))
        .with_columns(
            pl.struct(["datavalue", "language"])
            .map_elements(
                lambda row: next(
                    (
                        item["value"]
                        for item in (row["datavalue"]["labels"] or [])
                        if item["key"] == row["language"]
                    ),
                    None,
                ),
                return_dtype=pl.String,
            )
            .alias("datavalue_label")
        )
        .drop("dv_langs")
    )


def transform_join_lookup(base: pl.LazyFrame) -> pl.LazyFrame:
    """Native Polars approach: join-based lookup."""
    # Add row index to correlate after exploding
    indexed = base.with_row_index("_row_id")
    # Create lookup table: explode labels, extract key/value
    labels_lookup = (
        indexed.select(
            "_row_id", pl.col("datavalue").struct.field("labels").alias("dv_labels")
        )
        .explode("dv_labels")
        .with_columns(
            pl.col("dv_labels").struct.field("key").alias("_lang"),
            pl.col("dv_labels").struct.field("value").alias("datavalue_label"),
        )
        .select("_row_id", "_lang", "datavalue_label")
    )
    # Main table: explode property-labels
    main = (
        indexed.explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["language", "property_label"]
            )
        )
        .unnest("property-labels")
    )
    # Join to get matching labels - inner join naturally filters to matches only
    return main.join(
        labels_lookup,
        left_on=["_row_id", "language"],
        right_on=["_row_id", "_lang"],
        how="inner",
    ).drop("_row_id")


# Benchmark
for n in [1, 5, 10, 50, 500, None]:
    print(f"\n{'=' * 60}")
    print(f"Testing with {n} claims")
    print("=" * 60)
    base = get_base(n)
    count = base.select(pl.len()).collect().item()
    print(f" Wikibase claims: {count}")

    # map_elements approach
    t0 = time.perf_counter()
    r1 = transform_map_elements(base).collect()
    t1 = time.perf_counter()
    map_time = t1 - t0
    print(f" map_elements: {map_time:.3f}s, {len(r1)} rows")

    # Join lookup approach
    t0 = time.perf_counter()
    r2 = transform_join_lookup(base).collect()
    t1 = time.perf_counter()
    join_time = t1 - t0
    print(f" join_lookup: {join_time:.3f}s, {len(r2)} rows")

    speedup = map_time / join_time if join_time > 0 else float("inf")
    print(f" Speedup: {speedup:.1f}x")

    # Verify results match
    if len(r1) == len(r2):
        print(" ✓ Row counts match")
        # Check values on a sorted sample
        cols = ["property", "language", "property_label", "datavalue_label"]
        r1_check = r1.sort(cols[:2]).select(cols).head(10)
        r2_check = r2.sort(cols[:2]).select(cols).head(10)
        if r1_check.equals(r2_check):
            print(" ✓ Values match")
        else:
            print(" ⚠️ Values differ!")
            print("map_elements:")
            print(r1_check)
            print("join_lookup:")
            print(r2_check)
    else:
        print(f" ⚠️ Row count mismatch: {len(r1)} vs {len(r2)}")
============================================================
Testing with 1 claims
============================================================
Wikibase claims: 1
map_elements: 1.162s, 158 rows
join_lookup: 1.078s, 158 rows
Speedup: 1.1x
✓ Row counts match
✓ Values match
============================================================
Testing with 5 claims
============================================================
Wikibase claims: 5
map_elements: 1.177s, 533 rows
join_lookup: 1.143s, 533 rows
Speedup: 1.0x
✓ Row counts match
✓ Values match
============================================================
Testing with 10 claims
============================================================
Wikibase claims: 10
map_elements: 1.222s, 741 rows
join_lookup: 1.130s, 741 rows
Speedup: 1.1x
✓ Row counts match
✓ Values match
============================================================
Testing with 50 claims
============================================================
Wikibase claims: 50
map_elements: 1.452s, 3839 rows
join_lookup: 1.223s, 3839 rows
Speedup: 1.2x
✓ Row counts match
✓ Values match
============================================================
Testing with 500 claims
============================================================
Wikibase claims: 500
map_elements: 4.636s, 41417 rows
join_lookup: 2.155s, 41417 rows
Speedup: 2.2x
✓ Row counts match
✓ Values match
============================================================
Testing with None claims
============================================================
Wikibase claims: 6180
map_elements: 44.759s, 558850 rows
join_lookup: 10.567s, 558850 rows
Speedup: 4.2x
✓ Row counts match
✓ Values match
(wikidata) louis 🌟 ~/lab/wikidata $ time python debug_claims5.py
============================================================
Testing with 10 base claims
============================================================
(4 are wikibase-item/property)
Cartesian: 3.361s, 395 rows
Efficient: 0.070s, 395 rows
============================================================
Testing with 50 base claims
============================================================
(28 are wikibase-item/property)
Cartesian: 10.071s, 1993 rows
Efficient: 0.258s, 1993 rows
============================================================
Testing with 100 base claims
============================================================
(47 are wikibase-item/property)
Cartesian: 16.785s, 3717 rows
Efficient: 0.334s, 3717 rows
============================================================
Testing with 500 base claims
============================================================
(159 are wikibase-item/property)
Cartesian: 41.403s, 14579 rows
Efficient: 1.602s, 14579 rows
real 1m23.240s
user 1m10.232s
sys 0m38.544s