Last active
January 17, 2026 22:12
-
-
Save lmmx/1053c35327c72dab07ec25bc736fe1bb to your computer and use it in GitHub Desktop.
Eliminating a Cartesian product in Polars dataframe handling https://cog.spin.systems/wikidata-digging-avoiding-combinatorial-explosion
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Explode the Wikidata claims column and inspect the first flattened row's schema."""
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")

# Lazy scan: only the single row we ask for gets materialised.
scan = pl.scan_parquet(claims_file)

flattened = (
    scan.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")      # one row per individual claim statement
    .unnest("value")
    .unnest("mainsnak")    # lift snak fields (datatype, datavalue, ...) to columns
)

base = flattened.head(1).collect()

print(base)
print()
print(base.schema)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Demonstrate the cartesian blow-up from exploding both label lists independently."""
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]

lf = pl.scan_parquet(claims_file)

# Take a single wikibase-item/property claim to work with.
base = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    .head(1)
    .collect()
)

# First explode the property-label list ...
prop_exploded = (
    base.lazy()
    .explode("property-labels")
    .with_columns(
        pl.col("property-labels").struct.rename_fields(
            ["prop_lang", "property_label"]
        )
    )
    .unnest("property-labels")
)

# ... then explode the datavalue label list on top of it: every pairing of
# property language x datavalue language becomes its own row.
cartesian = (
    prop_exploded.with_columns(
        pl.col("datavalue").struct.field("labels").alias("dv_labels")
    )
    .explode("dv_labels")
    .with_columns(
        pl.col("dv_labels").struct.rename_fields(["dv_lang", "datavalue_label"])
    )
    .unnest("dv_labels")
    .select(["prop_lang", "dv_lang", "property_label", "datavalue_label"])
    .collect()
)

print(cartesian)
print(f"\nrows produced: {cartesian.height}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract the language keys present in datavalue.labels without exploding rows."""
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")

lf = pl.scan_parquet(claims_file)

# list.eval maps over each label struct in place, so there is no row multiplication.
lang_keys = (
    pl.col("datavalue")
    .struct.field("labels")
    .list.eval(pl.element().struct.field("key"))
    .alias("_dv_langs")
)

df = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .head(3)
    .with_columns(lang_keys)
    .select(["property", "_dv_langs"])
    .collect()
)

print(df)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Keep only property-label languages that also appear in datavalue.labels."""
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]

lf = pl.scan_parquet(claims_file)

# One wikibase claim, still lazy.
base = (
    lf.select(pl.col("claims").explode().struct.unnest())
    .drop("key")
    .explode("value")
    .unnest("value")
    .unnest("mainsnak")
    .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    .head(1)
)

# Gather the datavalue label languages into a plain list column, so we can
# filter the exploded property labels against it instead of exploding twice.
dv_langs = (
    pl.col("datavalue")
    .struct.field("labels")
    .list.eval(pl.element().struct.field("key"))
    .alias("_dv_langs")
)

efficient_stage = (
    base.with_columns(dv_langs)
    .explode("property-labels")
    .with_columns(
        pl.col("property-labels").struct.rename_fields(
            ["language", "property_label"]
        )
    )
    .unnest("property-labels")
    .filter(pl.col("language").is_in(pl.col("_dv_langs")))
    .select(["language", "property_label"])
    .collect()
)

print(efficient_stage)
print(f"\nrows kept: {efficient_stage.height}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Compare performance of cartesian vs efficient approach."""
# stdlib imports first, third-party after, per convention.
import time
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]
def get_base(lf: pl.LazyFrame, n: int) -> pl.DataFrame:
    """Materialise the first *n* flattened claim rows from the lazy scan."""
    flattened = (
        lf.select(pl.col("claims").explode().struct.unnest())
        .drop("key")
        .explode("value")
        .unnest("value")
        .unnest("mainsnak")
    )
    return flattened.head(n).collect()
def transform_cartesian(base: pl.DataFrame) -> pl.DataFrame:
    """Baseline approach: build the full cartesian product, then filter matches."""
    # Restrict to wikibase claims before any exploding.
    wb = base.lazy().filter(pl.col("datatype").is_in(WIKIBASE_TYPES))

    # Explode the property-label list into one row per language.
    prop_labels = (
        wb.explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["prop_lang", "property_label"]
            )
        )
        .unnest("property-labels")
    )

    # Explode the datavalue-label list on top, producing the cross product.
    both_labels = (
        prop_labels.with_columns(
            pl.col("datavalue").struct.field("labels").alias("dv_labels")
        )
        .explode("dv_labels")
        .with_columns(
            pl.col("dv_labels").struct.rename_fields(["dv_lang", "datavalue_label"])
        )
        .unnest("dv_labels")
    )

    # Keep only the diagonal where the two languages agree.
    return (
        both_labels.filter(pl.col("prop_lang") == pl.col("dv_lang"))
        .rename({"prop_lang": "language"})
        .drop("dv_lang")
        .collect()
    )
def transform_efficient(base: pl.DataFrame) -> pl.DataFrame:
    """Filter-first approach: never explode datavalue.labels at all."""

    def _matching_label(row: dict):
        # Linear scan of the (small) per-row label list for this row's language;
        # returns None when no label exists for it.
        for entry in row["datavalue"]["labels"] or []:
            if entry["key"] == row["language"]:
                return entry["value"]
        return None

    return (
        base.lazy()
        .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
        # Languages available in datavalue.labels, as a plain list column.
        .with_columns(
            pl.col("datavalue")
            .struct.field("labels")
            .list.eval(pl.element().struct.field("key"))
            .alias("_dv_langs")
        )
        # Explode only the property-labels side.
        .explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["language", "property_label"]
            )
        )
        .unnest("property-labels")
        # Drop languages that have no datavalue label before the lookup.
        .filter(pl.col("language").is_in(pl.col("_dv_langs")))
        # Per-row lookup of the matching label (Python fallback, no explode).
        .with_columns(
            pl.struct(["datavalue", "language"])
            .map_elements(_matching_label, return_dtype=pl.String)
            .alias("datavalue_label")
        )
        .drop("_dv_langs")
        .collect()
    )
# Exercise both implementations on growing sample sizes and compare timings.
lf = pl.scan_parquet(claims_file)
for n in [10, 50, 100, 500]:
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Testing with {n} base claims")
    print(banner)

    base = get_base(lf, n)
    wb_count = base.filter(pl.col("datatype").is_in(WIKIBASE_TYPES)).height
    print(f" ({wb_count} are wikibase-item/property)")

    start = time.perf_counter()
    r1 = transform_cartesian(base)
    elapsed_cartesian = time.perf_counter() - start
    print(f"Cartesian: {elapsed_cartesian:.3f}s, {len(r1)} rows")

    start = time.perf_counter()
    r2 = transform_efficient(base)
    elapsed_efficient = time.perf_counter() - start
    print(f"Efficient: {elapsed_efficient:.3f}s, {len(r2)} rows")

    # Both approaches must agree on the number of result rows.
    assert len(r1) == len(r2), f"Row count mismatch: {len(r1)} vs {len(r2)}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Benchmark map_elements vs join-based lookup for label extraction."""
# stdlib imports first, third-party after, per convention.
import time
from pathlib import Path

import polars as pl

claims_file = Path("results/claims/chunk_0-00400-of-00546.parquet")
WIKIBASE_TYPES = ["wikibase-item", "wikibase-property"]
def get_base(n: int | None = None) -> pl.LazyFrame:
    """Get base claims data, optionally limited to n rows.

    Args:
        n: Maximum number of claim rows to keep; ``None`` means no limit.

    Returns:
        A lazy frame of flattened wikibase-item/property claims.
    """
    lf = (
        pl.scan_parquet(claims_file)
        .select(pl.col("claims").explode().struct.unnest())
        .drop("key")
        .explode("value")
        .unnest("value")
        .unnest("mainsnak")
        .filter(pl.col("datatype").is_in(WIKIBASE_TYPES))
    )
    # BUG FIX: `if n:` treated an explicit n=0 as "no limit" and returned the
    # whole dataset; test identity against None so 0 yields an empty frame.
    if n is not None:
        lf = lf.head(n)
    return lf
def transform_map_elements(base: pl.LazyFrame) -> pl.LazyFrame:
    """Per-row Python lookup via map_elements (slow, no parallelization)."""

    def _label_for_language(row: dict):
        # Scan this row's label list for an entry matching its language.
        for entry in row["datavalue"]["labels"] or []:
            if entry["key"] == row["language"]:
                return entry["value"]
        return None

    with_langs = base.with_columns(
        pl.col("datavalue")
        .struct.field("labels")
        .list.eval(pl.element().struct.field("key"))
        .alias("dv_langs")
    )

    per_language = (
        with_langs.explode("property-labels")
        .with_columns(
            pl.col("property-labels").struct.rename_fields(
                ["language", "property_label"]
            )
        )
        .unnest("property-labels")
        .filter(pl.col("language").is_in(pl.col("dv_langs")))
    )

    return per_language.with_columns(
        pl.struct(["datavalue", "language"])
        .map_elements(_label_for_language, return_dtype=pl.String)
        .alias("datavalue_label")
    ).drop("dv_langs")
def transform_join_lookup(base: pl.LazyFrame) -> pl.LazyFrame:
    """Native Polars approach: long-format label lookup joined back per row."""
    # A row index re-associates exploded labels with their originating claim.
    indexed = base.with_row_index("_row_id")

    # Lookup table in long format: (row, language) -> datavalue label.
    labels_long = indexed.select(
        "_row_id",
        pl.col("datavalue").struct.field("labels").alias("dv_labels"),
    ).explode("dv_labels")
    labels_lookup = labels_long.with_columns(
        pl.col("dv_labels").struct.field("key").alias("_lang"),
        pl.col("dv_labels").struct.field("value").alias("datavalue_label"),
    ).select("_row_id", "_lang", "datavalue_label")

    # Property labels exploded to one row per language.
    renamed_fields = pl.col("property-labels").struct.rename_fields(
        ["language", "property_label"]
    )
    main = (
        indexed.explode("property-labels")
        .with_columns(renamed_fields)
        .unnest("property-labels")
    )

    # Inner join keeps only languages present on both sides — the filtering
    # falls out of the join semantics for free.
    joined = main.join(
        labels_lookup,
        left_on=["_row_id", "language"],
        right_on=["_row_id", "_lang"],
        how="inner",
    )
    return joined.drop("_row_id")
# Benchmark both lookup strategies across sample sizes (None = whole file).
for n in [1, 5, 10, 50, 500, None]:
    print(f"\n{'='*60}")
    print(f"Testing with {n} claims")
    print("=" * 60)
    base = get_base(n)
    count = base.select(pl.len()).collect().item()
    print(f" Wikibase claims: {count}")

    # map_elements approach
    t0 = time.perf_counter()
    r1 = transform_map_elements(base).collect()
    t1 = time.perf_counter()
    map_time = t1 - t0
    print(f" map_elements: {map_time:.3f}s, {len(r1)} rows")

    # Join lookup approach
    t0 = time.perf_counter()
    r2 = transform_join_lookup(base).collect()
    t1 = time.perf_counter()
    join_time = t1 - t0
    print(f" join_lookup: {join_time:.3f}s, {len(r2)} rows")

    speedup = map_time / join_time if join_time > 0 else float("inf")
    print(f" Speedup: {speedup:.1f}x")

    # Verify results match
    if len(r1) == len(r2):
        print(" ✓ Row counts match")
        cols = ["property", "language", "property_label", "datavalue_label"]
        # BUG FIX: sorting by only the first two columns leaves tie order
        # unspecified — the two pipelines can emit identical rows in different
        # orders within a (property, language) group, so head(10) could differ
        # and report a spurious mismatch. Sort by every compared column.
        r1_check = r1.sort(cols).select(cols).head(10)
        r2_check = r2.sort(cols).select(cols).head(10)
        if r1_check.equals(r2_check):
            print(" ✓ Values match")
        else:
            # f-prefixes removed from placeholder-free strings (ruff F541).
            print(" ⚠️ Values differ!")
            print("map_elements:")
            print(r1_check)
            print("join_lookup:")
            print(r2_check)
    else:
        print(f" ⚠️ Row count mismatch: {len(r1)} vs {len(r2)}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ============================================================ | |
| Testing with 1 claims | |
| ============================================================ | |
| Wikibase claims: 1 | |
| map_elements: 1.162s, 158 rows | |
| join_lookup: 1.078s, 158 rows | |
| Speedup: 1.1x | |
| ✓ Row counts match | |
| ✓ Values match | |
| ============================================================ | |
| Testing with 5 claims | |
| ============================================================ | |
| Wikibase claims: 5 | |
| map_elements: 1.177s, 533 rows | |
| join_lookup: 1.143s, 533 rows | |
| Speedup: 1.0x | |
| ✓ Row counts match | |
| ✓ Values match | |
| ============================================================ | |
| Testing with 10 claims | |
| ============================================================ | |
| Wikibase claims: 10 | |
| map_elements: 1.222s, 741 rows | |
| join_lookup: 1.130s, 741 rows | |
| Speedup: 1.1x | |
| ✓ Row counts match | |
| ✓ Values match | |
| ============================================================ | |
| Testing with 50 claims | |
| ============================================================ | |
| Wikibase claims: 50 | |
| map_elements: 1.452s, 3839 rows | |
| join_lookup: 1.223s, 3839 rows | |
| Speedup: 1.2x | |
| ✓ Row counts match | |
| ✓ Values match | |
| ============================================================ | |
| Testing with 500 claims | |
| ============================================================ | |
| Wikibase claims: 500 | |
| map_elements: 4.636s, 41417 rows | |
| join_lookup: 2.155s, 41417 rows | |
| Speedup: 2.2x | |
| ✓ Row counts match | |
| ✓ Values match | |
| ============================================================ | |
| Testing with None claims | |
| ============================================================ | |
| Wikibase claims: 6180 | |
| map_elements: 44.759s, 558850 rows | |
| join_lookup: 10.567s, 558850 rows | |
| Speedup: 4.2x | |
| ✓ Row counts match | |
| ✓ Values match |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (wikidata) louis 🌟 ~/lab/wikidata $ time python debug_claims5.py | |
| ============================================================ | |
| Testing with 10 base claims | |
| ============================================================ | |
| (4 are wikibase-item/property) | |
| Cartesian: 3.361s, 395 rows | |
| Efficient: 0.070s, 395 rows | |
| ============================================================ | |
| Testing with 50 base claims | |
| ============================================================ | |
| (28 are wikibase-item/property) | |
| Cartesian: 10.071s, 1993 rows | |
| Efficient: 0.258s, 1993 rows | |
| ============================================================ | |
| Testing with 100 base claims | |
| ============================================================ | |
| (47 are wikibase-item/property) | |
| Cartesian: 16.785s, 3717 rows | |
| Efficient: 0.334s, 3717 rows | |
| ============================================================ | |
| Testing with 500 base claims | |
| ============================================================ | |
| (159 are wikibase-item/property) | |
| Cartesian: 41.403s, 14579 rows | |
| Efficient: 1.602s, 14579 rows | |
| real 1m23.240s | |
| user 1m10.232s | |
| sys 0m38.544s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment