Last active
March 23, 2026 19:29
-
-
Save ebetica/5e21501b3249e642e66e71f812878be9 to your computer and use it in GitHub Desktop.
Benchmark lance scalar index lookups: sequential, batched, and async
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Benchmark lance scalar index lookups: sequential, batched, and async. | |
| Lance (https://lancedb.github.io/lance/) is a columnar format that supports | |
| BTREE scalar indices, making point lookups fast even over S3 — no local copy | |
| or database server needed. This script benchmarks three lookup patterns against | |
| a 120M-row dataset stored on S3: | |
| - Sequential: one filter query at a time (baseline) | |
| - Batched IN: single query with WHERE protein_hash IN (...) | |
| - Async: N concurrent queries via thread pool | |
| The dataset has BTREE indices on both `header` and `protein_hash`. | |
| This script queries by `protein_hash` (32-char hex string, | |
| e.g. "000099e72d43f68f7466a52654dfb9dc"), but you can swap to `header`. | |
| Usage: | |
| python lance_query_bench.py | |
| python lance_query_bench.py /local/path/to/dataset.lance | |
| Example results (120M rows, 256 queries, over S3): | |
| Sequential: 141.15s (551 ms/query) | |
| Batched IN: 4.20s (16.4 ms/query) | |
| Async: 0.98s (3.8 ms/query effective) | |
| Speedup vs sequential: batch=34x async=143x | |
| """ | |
| import asyncio | |
| import random | |
| import sys | |
| import time | |
| from concurrent.futures import ThreadPoolExecutor | |
| import lance | |
| import pyarrow as pa | |
# Number of random keys sampled and queried in each benchmark pattern.
N = 256
# Columns fetched by every lookup (keys plus two metric columns).
COLS = ["header", "protein_hash", "ptm", "mean_plddt"]
# Default dataset URI: the 120M-row table on S3 (see module docstring);
# override with argv[1] to benchmark a local copy instead.
DEFAULT = "s3://es-shared-aws-storage/data/esmatlas_v2/260323-atlas-small-test.lance"
def lookup(ds: lance.LanceDataset, protein_hash: str) -> pa.Table:
    """Point-lookup of a single key via the scalar index on `protein_hash`.

    Args:
        ds: Open lance dataset (indexed on `protein_hash` per the module docstring).
        protein_hash: Key to match, e.g. a 32-char hex string.

    Returns:
        pa.Table holding the COLS columns of every matching row.
    """
    # Escape single quotes so the value cannot break out of the quoted
    # literal in the SQL-style filter. Hex hashes never contain quotes,
    # but this keeps the helper safe for arbitrary string keys.
    safe = protein_hash.replace("'", "''")
    return ds.to_table(filter=f"protein_hash = '{safe}'", columns=COLS)
def lookup_batch(ds: lance.LanceDataset, protein_hashes: list[str]) -> pa.Table:
    """Fetch rows for many keys with one `IN (...)` filter (single query).

    Args:
        ds: Open lance dataset (indexed on `protein_hash`).
        protein_hashes: Keys to match; an empty list yields `IN ()`.

    Returns:
        pa.Table with the COLS columns of every row whose key is in the list.
    """
    # Quote each key, escaping embedded single quotes so a value cannot
    # break out of its literal in the SQL-style filter string.
    quoted = ", ".join("'" + h.replace("'", "''") + "'" for h in protein_hashes)
    return ds.to_table(filter=f"protein_hash IN ({quoted})", columns=COLS)
async def lookup_async(ds: lance.LanceDataset, protein_hashes: list[str]) -> list[pa.Table]:
    """Run one lookup() per key concurrently on a thread pool.

    One worker thread per key, so all queries are in flight at once;
    presumably lance's native reader releases the GIL while waiting on
    S3 (the measured speedup in the module docstring suggests the waits
    do overlap — confirm against lance internals if it matters).

    Args:
        ds: Open lance dataset (shared across worker threads).
        protein_hashes: Keys to look up, one thread-pool task each.

    Returns:
        List of pa.Table results, in the same order as `protein_hashes`.
    """
    # get_running_loop() is the supported call from inside a coroutine;
    # get_event_loop() is deprecated in that context since Python 3.10.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=len(protein_hashes)) as pool:
        futures = [loop.run_in_executor(pool, lookup, ds, h) for h in protein_hashes]
        return await asyncio.gather(*futures)
def bench(fn):
    """Time a zero-argument callable.

    Args:
        fn: Callable taking no arguments.

    Returns:
        Tuple of (elapsed wall-clock seconds, fn's return value).
    """
    start = time.perf_counter()
    outcome = fn()
    elapsed = time.perf_counter() - start
    return elapsed, outcome
def main():
    """Sample N random keys from the dataset and time all three lookup patterns."""
    # A local path may be passed as argv[1]; otherwise hit the S3 default.
    if len(sys.argv) > 1:
        uri = sys.argv[1]
    else:
        uri = DEFAULT
    ds = lance.dataset(uri)

    # Draw N distinct random row positions, then fetch their keys to query by.
    row_ids = random.sample(range(ds.count_rows()), N)
    keys: list[str] = ds.take(row_ids, columns=["protein_hash"]).column("protein_hash").to_pylist()

    # Time each pattern: one-at-a-time, single IN query, and N concurrent threads.
    seq_t, _ = bench(lambda: [lookup(ds, k) for k in keys])
    batch_t, _ = bench(lambda: lookup_batch(ds, keys))
    async_t, _ = bench(lambda: asyncio.run(lookup_async(ds, keys)))

    print(f"Dataset: {uri}")
    print(f"Rows: {ds.count_rows():,} Queries: {N}\n")
    print(f"Sequential: {seq_t:6.2f}s ({seq_t/N*1000:.0f} ms/query)")
    print(f"Batched IN: {batch_t:6.2f}s ({batch_t/N*1000:.1f} ms/query)")
    print(f"Async: {async_t:6.2f}s ({async_t/N*1000:.1f} ms/query effective)")
    print(f"\nSpeedup vs sequential: batch={seq_t/batch_t:.0f}x async={seq_t/async_t:.0f}x")
# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment