Skip to content

Instantly share code, notes, and snippets.

@ebetica
Last active March 23, 2026 19:29
Show Gist options
  • Select an option

  • Save ebetica/5e21501b3249e642e66e71f812878be9 to your computer and use it in GitHub Desktop.

Select an option

Save ebetica/5e21501b3249e642e66e71f812878be9 to your computer and use it in GitHub Desktop.
Benchmark lance scalar index lookups: sequential, batched, and async
"""
Benchmark lance scalar index lookups: sequential, batched, and async.
Lance (https://lancedb.github.io/lance/) is a columnar format that supports
BTREE scalar indices, making point lookups fast even over S3 — no local copy
or database server needed. This script benchmarks three lookup patterns against
a 120M-row dataset stored on S3:
- Sequential: one filter query at a time (baseline)
- Batched IN: single query with WHERE protein_hash IN (...)
- Async: N concurrent queries via thread pool
The dataset has BTREE indices on both `header` and `protein_hash`.
This script queries by `protein_hash` (32-char hex string,
e.g. "000099e72d43f68f7466a52654dfb9dc"), but you can swap to `header`.
Usage:
python lance_query_bench.py
python lance_query_bench.py /local/path/to/dataset.lance
Example results (120M rows, 256 queries, over S3):
Sequential: 141.15s (551 ms/query)
Batched IN: 4.20s (16.4 ms/query)
Async: 0.98s (3.8 ms/query effective)
Speedup vs sequential: batch=34x async=143x
"""
import asyncio
import random
import sys
import time
from concurrent.futures import ThreadPoolExecutor
import lance
import pyarrow as pa
N = 256
COLS = ["header", "protein_hash", "ptm", "mean_plddt"]
DEFAULT = "s3://es-shared-aws-storage/data/esmatlas_v2/260323-atlas-small-test.lance"
def lookup(ds: lance.LanceDataset, protein_hash: str) -> pa.Table:
    """Point lookup: return the row(s) matching one protein_hash.

    The equality predicate lets Lance serve the query from the BTREE
    scalar index on `protein_hash`; only the COLS columns are fetched.
    """
    predicate = f"protein_hash = '{protein_hash}'"
    return ds.to_table(filter=predicate, columns=COLS)
def lookup_batch(ds: lance.LanceDataset, protein_hashes: list[str]) -> pa.Table:
    """Fetch every requested hash with a single IN-list filter query.

    One round trip instead of len(protein_hashes) — the BTREE index still
    applies, so this amortizes per-query overhead (see module docstring).
    """
    in_list = ", ".join("'" + h + "'" for h in protein_hashes)
    predicate = f"protein_hash IN ({in_list})"
    return ds.to_table(filter=predicate, columns=COLS)
async def lookup_async(ds: lance.LanceDataset, protein_hashes: list[str]) -> list[pa.Table]:
    """Run one `lookup` per hash concurrently on a thread pool.

    Each blocking `lookup` call gets its own worker thread so the I/O-bound
    reads overlap; results are returned in input order (asyncio.gather
    preserves argument order).

    Args:
        ds: open Lance dataset to query.
        protein_hashes: hashes to look up; may be empty.

    Returns:
        One pyarrow Table per input hash, in the same order.
    """
    # ThreadPoolExecutor(max_workers=0) raises ValueError — short-circuit.
    if not protein_hashes:
        return []
    # get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() here has been deprecated since Python 3.10.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=len(protein_hashes)) as pool:
        return await asyncio.gather(
            *(loop.run_in_executor(pool, lookup, ds, h) for h in protein_hashes)
        )
def bench(fn):
    """Time a zero-argument callable.

    Returns a (elapsed_seconds, result) pair, using perf_counter for a
    monotonic high-resolution clock.
    """
    start = time.perf_counter()
    out = fn()
    elapsed = time.perf_counter() - start
    return elapsed, out
def main():
    """Benchmark sequential, batched, and async lookups; print a summary.

    Dataset URI comes from argv[1] when given, otherwise DEFAULT (S3).
    """
    uri = DEFAULT if len(sys.argv) <= 1 else sys.argv[1]
    ds = lance.dataset(uri)

    # Sample N random row ids, then pull their protein_hash values to query by.
    total_rows = ds.count_rows()
    row_ids = random.sample(range(total_rows), N)
    hashes: list[str] = (
        ds.take(row_ids, columns=["protein_hash"]).column("protein_hash").to_pylist()
    )

    t_seq, _ = bench(lambda: [lookup(ds, h) for h in hashes])
    t_batch, _ = bench(lambda: lookup_batch(ds, hashes))
    t_async, _ = bench(lambda: asyncio.run(lookup_async(ds, hashes)))

    print(f"Dataset: {uri}")
    print(f"Rows: {total_rows:,} Queries: {N}\n")
    print(f"Sequential: {t_seq:6.2f}s ({t_seq/N*1000:.0f} ms/query)")
    print(f"Batched IN: {t_batch:6.2f}s ({t_batch/N*1000:.1f} ms/query)")
    print(f"Async: {t_async:6.2f}s ({t_async/N*1000:.1f} ms/query effective)")
    print(f"\nSpeedup vs sequential: batch={t_seq/t_batch:.0f}x async={t_seq/t_async:.0f}x")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment