FauxFaux · March 20, 2026 13:16
diff --git a/sort_by_file.py b/sort_by_file.py
 """
 Benchmark: sorting dates by line number with file-seek access.

 Simulates the production workflow where we hold only line numbers in memory
 and must open the file, seek to the correct line, read the date string, and
 parse it for every comparison the sort makes.
 """

 from datetime import datetime, timedelta
 from time import perf_counter
 import random
 import tempfile
 import os


 NUM_DATES = 28_000


 # --- Step 1: generate the data file (one ISO date per line) ---

 base = datetime(2020, 1, 1)
 dates = [
    (base + timedelta(seconds=random.randint(0, 200_000_000))).isoformat()
    for _ in range(NUM_DATES)
 ]

 data_path = os.path.join(tempfile.gettempdir(), "sort_bench_dates.txt")
 with open(data_path, "w") as f:
    for d in dates:
        f.write(d + "\n")

 # Build an index of byte-offsets for each line so we can seek directly.
 line_offsets = []
 with open(data_path, "rb") as f:
    while True:
        offset = f.tell()
        line = f.readline()
        if not line:
            break
        line_offsets.append(offset)

 print(f"Wrote {len(line_offsets)} dates to {data_path}")
 print()


 # --- helpers ---

 def read_date_at_line(fh, line_no):
    """Seek to `line_no` in the file, read the line, parse to datetime."""
    fh.seek(line_offsets[line_no])
    raw = fh.readline().strip()
    return datetime.fromisoformat(raw)


 # --- Benchmark 1: sort using a key function (one seek+parse per key call) ---

 line_numbers = list(range(NUM_DATES))

 with open(data_path, "r") as fh:
    start = perf_counter()
    sorted_lines_key = sorted(
        line_numbers,
        key=lambda ln: read_date_at_line(fh, ln),
    )
    elapsed_key = perf_counter() - start

 print(f"[key]  Sorted {NUM_DATES} line numbers in {elapsed_key:.6f} s")


 # --- Benchmark 2: sort with a cmp-style approach (two seeks per comparison) ---

 import functools

 def compare_lines(fh, a, b):
    da = read_date_at_line(fh, a)
    db = read_date_at_line(fh, b)
    return (da > db) - (da < db)

 with open(data_path, "r") as fh:
    start = perf_counter()
    sorted_lines_cmp = sorted(
        line_numbers,
        key=functools.cmp_to_key(lambda a, b: compare_lines(fh, a, b)),
    )
    elapsed_cmp = perf_counter() - start

 print(f"[cmp]  Sorted {NUM_DATES} line numbers in {elapsed_cmp:.6f} s")


 # --- Reference: the original in-memory sorts for comparison ---

 start = perf_counter()
 sorted_str = sorted(dates)
 elapsed_str = perf_counter() - start
 print(f"[ref-str]  In-memory string sort:      {elapsed_str:.6f} s")

 start = perf_counter()
 sorted_parsed = sorted(dates, key=datetime.fromisoformat)
 elapsed_parsed = perf_counter() - start
 print(f"[ref-parse] In-memory parsed sort:      {elapsed_parsed:.6f} s")

 # --- Sanity check ---
 expected = [dates[i] for i in sorted_lines_key]
 assert expected == sorted_parsed, "key-sort result doesn't match reference"
 expected_cmp = [dates[i] for i in sorted_lines_cmp]
 assert expected_cmp == sorted_parsed, "cmp-sort result doesn't match reference"
 print("\nSanity checks passed.")

 os.unlink(data_path)
	"""
	Benchmark: sorting dates by line number with file-seek access.

	Simulates the production workflow where we hold only line numbers in memory
	and must open the file, seek to the correct line, read the date string, and
	parse it for every comparison the sort makes.
	"""

	from datetime import datetime, timedelta
	from time import perf_counter
	import random
	import tempfile
	import os


	NUM_DATES = 28_000


	# --- Step 1: generate the data file (one ISO date per line) ---

	base = datetime(2020, 1, 1)
	dates = [
	(base + timedelta(seconds=random.randint(0, 200_000_000))).isoformat()
	for _ in range(NUM_DATES)
	]

	data_path = os.path.join(tempfile.gettempdir(), "sort_bench_dates.txt")
	with open(data_path, "w") as f:
	for d in dates:
	f.write(d + "\n")

	# Build an index of byte-offsets for each line so we can seek directly.
	line_offsets = []
	with open(data_path, "rb") as f:
	while True:
	offset = f.tell()
	line = f.readline()
	if not line:
	break
	line_offsets.append(offset)

	print(f"Wrote {len(line_offsets)} dates to {data_path}")
	print()


	# --- helpers ---

	def read_date_at_line(fh, line_no):
	"""Seek to `line_no` in the file, read the line, parse to datetime."""
	fh.seek(line_offsets[line_no])
	raw = fh.readline().strip()
	return datetime.fromisoformat(raw)


	# --- Benchmark 1: sort using a key function (one seek+parse per key call) ---

	line_numbers = list(range(NUM_DATES))

	with open(data_path, "r") as fh:
	start = perf_counter()
	sorted_lines_key = sorted(
	line_numbers,
	key=lambda ln: read_date_at_line(fh, ln),
	)
	elapsed_key = perf_counter() - start

	print(f"[key] Sorted {NUM_DATES} line numbers in {elapsed_key:.6f} s")


	# --- Benchmark 2: sort with a cmp-style approach (two seeks per comparison) ---

	import functools

	def compare_lines(fh, a, b):
	da = read_date_at_line(fh, a)
	db = read_date_at_line(fh, b)
	return (da > db) - (da < db)

	with open(data_path, "r") as fh:
	start = perf_counter()
	sorted_lines_cmp = sorted(
	line_numbers,
	key=functools.cmp_to_key(lambda a, b: compare_lines(fh, a, b)),
	)
	elapsed_cmp = perf_counter() - start

	print(f"[cmp] Sorted {NUM_DATES} line numbers in {elapsed_cmp:.6f} s")


	# --- Reference: the original in-memory sorts for comparison ---

	start = perf_counter()
	sorted_str = sorted(dates)
	elapsed_str = perf_counter() - start
	print(f"[ref-str] In-memory string sort: {elapsed_str:.6f} s")

	start = perf_counter()
	sorted_parsed = sorted(dates, key=datetime.fromisoformat)
	elapsed_parsed = perf_counter() - start
	print(f"[ref-parse] In-memory parsed sort: {elapsed_parsed:.6f} s")

	# --- Sanity check ---
	expected = [dates[i] for i in sorted_lines_key]
	assert expected == sorted_parsed, "key-sort result doesn't match reference"
	expected_cmp = [dates[i] for i in sorted_lines_cmp]
	assert expected_cmp == sorted_parsed, "cmp-sort result doesn't match reference"
	print("\nSanity checks passed.")

	os.unlink(data_path)
No results found