Created
March 20, 2026 13:16
-
-
Save FauxFaux/637200c8b91c6d7e01ca07c852ca497e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Benchmark: sorting dates by line number with file-seek access. | |
| Simulates the production workflow where we hold only line numbers in memory | |
| and must open the file, seek to the correct line, read the date string, and | |
| parse it for every comparison the sort makes. | |
| """ | |
| from datetime import datetime, timedelta | |
| from time import perf_counter | |
| import random | |
| import tempfile | |
| import os | |
| NUM_DATES = 28_000 | |
| # --- Step 1: generate the data file (one ISO date per line) --- | |
| base = datetime(2020, 1, 1) | |
| dates = [ | |
| (base + timedelta(seconds=random.randint(0, 200_000_000))).isoformat() | |
| for _ in range(NUM_DATES) | |
| ] | |
| data_path = os.path.join(tempfile.gettempdir(), "sort_bench_dates.txt") | |
| with open(data_path, "w") as f: | |
| for d in dates: | |
| f.write(d + "\n") | |
| # Build an index of byte-offsets for each line so we can seek directly. | |
| line_offsets = [] | |
| with open(data_path, "rb") as f: | |
| while True: | |
| offset = f.tell() | |
| line = f.readline() | |
| if not line: | |
| break | |
| line_offsets.append(offset) | |
| print(f"Wrote {len(line_offsets)} dates to {data_path}") | |
| print() | |
| # --- helpers --- | |
| def read_date_at_line(fh, line_no): | |
| """Seek to `line_no` in the file, read the line, parse to datetime.""" | |
| fh.seek(line_offsets[line_no]) | |
| raw = fh.readline().strip() | |
| return datetime.fromisoformat(raw) | |
| # --- Benchmark 1: sort using a key function (one seek+parse per key call) --- | |
| line_numbers = list(range(NUM_DATES)) | |
| with open(data_path, "r") as fh: | |
| start = perf_counter() | |
| sorted_lines_key = sorted( | |
| line_numbers, | |
| key=lambda ln: read_date_at_line(fh, ln), | |
| ) | |
| elapsed_key = perf_counter() - start | |
| print(f"[key] Sorted {NUM_DATES} line numbers in {elapsed_key:.6f} s") | |
| # --- Benchmark 2: sort with a cmp-style approach (two seeks per comparison) --- | |
| import functools | |
| def compare_lines(fh, a, b): | |
| da = read_date_at_line(fh, a) | |
| db = read_date_at_line(fh, b) | |
| return (da > db) - (da < db) | |
| with open(data_path, "r") as fh: | |
| start = perf_counter() | |
| sorted_lines_cmp = sorted( | |
| line_numbers, | |
| key=functools.cmp_to_key(lambda a, b: compare_lines(fh, a, b)), | |
| ) | |
| elapsed_cmp = perf_counter() - start | |
| print(f"[cmp] Sorted {NUM_DATES} line numbers in {elapsed_cmp:.6f} s") | |
| # --- Reference: the original in-memory sorts for comparison --- | |
| start = perf_counter() | |
| sorted_str = sorted(dates) | |
| elapsed_str = perf_counter() - start | |
| print(f"[ref-str] In-memory string sort: {elapsed_str:.6f} s") | |
| start = perf_counter() | |
| sorted_parsed = sorted(dates, key=datetime.fromisoformat) | |
| elapsed_parsed = perf_counter() - start | |
| print(f"[ref-parse] In-memory parsed sort: {elapsed_parsed:.6f} s") | |
| # --- Sanity check --- | |
| expected = [dates[i] for i in sorted_lines_key] | |
| assert expected == sorted_parsed, "key-sort result doesn't match reference" | |
| expected_cmp = [dates[i] for i in sorted_lines_cmp] | |
| assert expected_cmp == sorted_parsed, "cmp-sort result doesn't match reference" | |
| print("\nSanity checks passed.") | |
| os.unlink(data_path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment