Skip to content

Instantly share code, notes, and snippets.

@FauxFaux
Created March 20, 2026 13:16
Show Gist options
  • Select an option

  • Save FauxFaux/637200c8b91c6d7e01ca07c852ca497e to your computer and use it in GitHub Desktop.

Select an option

Save FauxFaux/637200c8b91c6d7e01ca07c852ca497e to your computer and use it in GitHub Desktop.
"""
Benchmark: sorting dates by line number with file-seek access.
Simulates the production workflow where we hold only line numbers in memory
and must open the file, seek to the correct line, read the date string, and
parse it for every comparison the sort makes.
"""
from datetime import datetime, timedelta
from time import perf_counter
import random
import tempfile
import os
NUM_DATES = 28_000
# --- Step 1: generate the data file (one ISO date per line) ---
base = datetime(2020, 1, 1)
dates = [
(base + timedelta(seconds=random.randint(0, 200_000_000))).isoformat()
for _ in range(NUM_DATES)
]
data_path = os.path.join(tempfile.gettempdir(), "sort_bench_dates.txt")
with open(data_path, "w") as f:
for d in dates:
f.write(d + "\n")
# Build an index of byte-offsets for each line so we can seek directly.
line_offsets = []
with open(data_path, "rb") as f:
while True:
offset = f.tell()
line = f.readline()
if not line:
break
line_offsets.append(offset)
print(f"Wrote {len(line_offsets)} dates to {data_path}")
print()
# --- helpers ---
def read_date_at_line(fh, line_no):
"""Seek to `line_no` in the file, read the line, parse to datetime."""
fh.seek(line_offsets[line_no])
raw = fh.readline().strip()
return datetime.fromisoformat(raw)
# --- Benchmark 1: sort using a key function (one seek+parse per key call) ---
line_numbers = list(range(NUM_DATES))
with open(data_path, "r") as fh:
start = perf_counter()
sorted_lines_key = sorted(
line_numbers,
key=lambda ln: read_date_at_line(fh, ln),
)
elapsed_key = perf_counter() - start
print(f"[key] Sorted {NUM_DATES} line numbers in {elapsed_key:.6f} s")
# --- Benchmark 2: sort with a cmp-style approach (two seeks per comparison) ---
import functools
def compare_lines(fh, a, b):
da = read_date_at_line(fh, a)
db = read_date_at_line(fh, b)
return (da > db) - (da < db)
with open(data_path, "r") as fh:
start = perf_counter()
sorted_lines_cmp = sorted(
line_numbers,
key=functools.cmp_to_key(lambda a, b: compare_lines(fh, a, b)),
)
elapsed_cmp = perf_counter() - start
print(f"[cmp] Sorted {NUM_DATES} line numbers in {elapsed_cmp:.6f} s")
# --- Reference: the original in-memory sorts for comparison ---
start = perf_counter()
sorted_str = sorted(dates)
elapsed_str = perf_counter() - start
print(f"[ref-str] In-memory string sort: {elapsed_str:.6f} s")
start = perf_counter()
sorted_parsed = sorted(dates, key=datetime.fromisoformat)
elapsed_parsed = perf_counter() - start
print(f"[ref-parse] In-memory parsed sort: {elapsed_parsed:.6f} s")
# --- Sanity check ---
expected = [dates[i] for i in sorted_lines_key]
assert expected == sorted_parsed, "key-sort result doesn't match reference"
expected_cmp = [dates[i] for i in sorted_lines_cmp]
assert expected_cmp == sorted_parsed, "cmp-sort result doesn't match reference"
print("\nSanity checks passed.")
os.unlink(data_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment