Last active
March 23, 2026 19:29
-
-
Save ebetica/5e21501b3249e642e66e71f812878be9 to your computer and use it in GitHub Desktop.
Benchmark lance scalar index lookups: sequential, batched, and async
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Benchmark lance scalar index lookups: sequential, batched, and async. | |
| Lance (https://lancedb.github.io/lance/) is a columnar format that supports | |
| BTREE scalar indices, making point lookups fast even over S3 — no local copy | |
| or database server needed. This script benchmarks three lookup patterns against | |
| a 120M-row dataset stored on S3: | |
| - Sequential: one filter query at a time (baseline) | |
| - Batched IN: single query with WHERE protein_hash IN (...) | |
| - Async: N concurrent queries via thread pool | |
| The dataset has BTREE indices on both `header` and `protein_hash`. | |
| This script queries by `protein_hash` (32-char hex string, | |
| e.g. "000099e72d43f68f7466a52654dfb9dc"), but you can swap to `header`. | |
| Usage: | |
| python lance_query_bench.py | |
| python lance_query_bench.py /local/path/to/dataset.lance | |
| Example results (120M rows, 256 queries, over S3): | |
| Sequential: 141.15s (551 ms/query) | |
| Batched IN: 4.20s (16.4 ms/query) | |
| Async: 0.98s (3.8 ms/query effective) | |
| Speedup vs sequential: batch=34x async=143x | |
| """ | |
| import asyncio | |
| import random | |
| import sys | |
| import time | |
| from concurrent.futures import ThreadPoolExecutor | |
| import lance | |
| import pyarrow as pa | |
# Number of random keys sampled and queried in each benchmark pattern.
N = 256
# Columns fetched by every lookup (keys plus two metric columns).
COLS = ["header", "protein_hash", "ptm", "mean_plddt"]
# Default dataset URI: the 120M-row table on S3 (see module docstring);
# override with argv[1] to benchmark a local copy instead.
DEFAULT = "s3://es-shared-aws-storage/data/esmatlas_v2/260323-atlas-small-test.lance"
def lookup(ds: lance.LanceDataset, protein_hash: str) -> pa.Table:
    """Point-lookup of a single key via the scalar index on `protein_hash`.

    Args:
        ds: Open lance dataset (indexed on `protein_hash` per the module docstring).
        protein_hash: Key to match, e.g. a 32-char hex string.

    Returns:
        pa.Table holding the COLS columns of every matching row.
    """
    # Escape single quotes so the value cannot break out of the quoted
    # literal in the SQL-style filter. Hex hashes never contain quotes,
    # but this keeps the helper safe for arbitrary string keys.
    safe = protein_hash.replace("'", "''")
    return ds.to_table(filter=f"protein_hash = '{safe}'", columns=COLS)
def lookup_batch(ds: lance.LanceDataset, protein_hashes: list[str]) -> pa.Table:
    """Fetch rows for many keys with one `IN (...)` filter (single query).

    Args:
        ds: Open lance dataset (indexed on `protein_hash`).
        protein_hashes: Keys to match; an empty list yields `IN ()`.

    Returns:
        pa.Table with the COLS columns of every row whose key is in the list.
    """
    # Quote each key, escaping embedded single quotes so a value cannot
    # break out of its literal in the SQL-style filter string.
    quoted = ", ".join("'" + h.replace("'", "''") + "'" for h in protein_hashes)
    return ds.to_table(filter=f"protein_hash IN ({quoted})", columns=COLS)
async def lookup_async(ds: lance.LanceDataset, protein_hashes: list[str]) -> list[pa.Table]:
    """Run one lookup() per key concurrently on a thread pool.

    One worker thread per key, so all queries are in flight at once;
    presumably lance's native reader releases the GIL while waiting on
    S3 (the measured speedup in the module docstring suggests the waits
    do overlap — confirm against lance internals if it matters).

    Args:
        ds: Open lance dataset (shared across worker threads).
        protein_hashes: Keys to look up, one thread-pool task each.

    Returns:
        List of pa.Table results, in the same order as `protein_hashes`.
    """
    # get_running_loop() is the supported call from inside a coroutine;
    # get_event_loop() is deprecated in that context since Python 3.10.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=len(protein_hashes)) as pool:
        futures = [loop.run_in_executor(pool, lookup, ds, h) for h in protein_hashes]
        return await asyncio.gather(*futures)
def bench(fn):
    """Time a zero-argument callable.

    Args:
        fn: Callable taking no arguments.

    Returns:
        Tuple of (elapsed wall-clock seconds, fn's return value).
    """
    start = time.perf_counter()
    outcome = fn()
    elapsed = time.perf_counter() - start
    return elapsed, outcome
def main():
    """Sample N random keys from the dataset and time all three lookup patterns."""
    # A local path may be passed as argv[1]; otherwise hit the S3 default.
    if len(sys.argv) > 1:
        uri = sys.argv[1]
    else:
        uri = DEFAULT
    ds = lance.dataset(uri)

    # Draw N distinct random row positions, then fetch their keys to query by.
    row_ids = random.sample(range(ds.count_rows()), N)
    keys: list[str] = ds.take(row_ids, columns=["protein_hash"]).column("protein_hash").to_pylist()

    # Time each pattern: one-at-a-time, single IN query, and N concurrent threads.
    seq_t, _ = bench(lambda: [lookup(ds, k) for k in keys])
    batch_t, _ = bench(lambda: lookup_batch(ds, keys))
    async_t, _ = bench(lambda: asyncio.run(lookup_async(ds, keys)))

    print(f"Dataset: {uri}")
    print(f"Rows: {ds.count_rows():,} Queries: {N}\n")
    print(f"Sequential: {seq_t:6.2f}s ({seq_t/N*1000:.0f} ms/query)")
    print(f"Batched IN: {batch_t:6.2f}s ({batch_t/N*1000:.1f} ms/query)")
    print(f"Async: {async_t:6.2f}s ({async_t/N*1000:.1f} ms/query effective)")
    print(f"\nSpeedup vs sequential: batch={seq_t/batch_t:.0f}x async={seq_t/async_t:.0f}x")
# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment