Created
June 11, 2026 13:08
-
-
Save TomAugspurger/069abda3bbfe7728b910187a09ff7e5f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Compare two benchmark markdown tables (on.md vs off.md).
Flags rows where the CPU time in on.md falls outside the noise band of off.md.
"""
| import re | |
| import pandas as pd | |
# Microseconds per supported unit. "\u00b5s" (micro sign) and "\u03bcs"
# (Greek mu) are alternate spellings of "us".
_US_PER_UNIT = {
    "ns": 1e-3,
    "us": 1.0,
    "\u00b5s": 1.0,
    "\u03bcs": 1.0,
    "ms": 1e3,
    "s": 1e6,
}


def parse_time_to_us(s: str) -> float:
    """Convert a time string like '1.361 ms' or '725.928 us' to microseconds.

    Supported units are ns, us (also spelled with a micro sign), ms and s.
    Leading/trailing whitespace is ignored and the space between the number
    and the unit is optional.

    Raises:
        ValueError: if the string does not start with a number followed by
            one of the supported units, or the number is malformed.
    """
    s = s.strip()
    # Multi-character units are listed before the bare "s" so that "ms"/"ns"
    # are never mistaken for seconds.
    m = re.match(r"([\d.]+)\s*(ns|[u\u00b5\u03bc]s|ms|s)", s)
    if not m:
        raise ValueError(f"Cannot parse time: {s!r}")
    value, unit = float(m.group(1)), m.group(2)
    return value * _US_PER_UNIT[unit]
def parse_noise(s: str) -> float:
    """Turn a percentage string into a fraction, e.g. '7.87%' -> 0.0787."""
    trimmed = s.strip()
    # Drop the trailing percent sign(s); float() tolerates leftover spaces.
    number_text = trimmed.rstrip("%")
    percent = float(number_text)
    return percent / 100
def parse_md(path: str) -> pd.DataFrame:
    """Parse a benchmark markdown table file into a DataFrame.

    The first two lines of the file (header row and separator row) are
    skipped. Every remaining non-blank line is split on ``|`` and read
    positionally, expecting the cells in this order:

        io_type, page_index, num_cols, num_row_groups, Samples,
        CPU Time, Noise (cpu), GPU Time, Noise (gpu),
        colchunks_per_sec, peak_memory_usage

    Rows may be written with or without the outer ``|`` delimiters.

    Args:
        path: Path to the markdown file.

    Returns:
        A DataFrame with columns io_type, page_index, num_cols,
        num_row_groups, cpu_time_us and cpu_noise (one row per table row).
        The columns are present even when the table has no data rows.

    Raises:
        ValueError: if an integer, time, or noise cell cannot be parsed.
        IndexError: if a data row has fewer cells than expected.
    """
    columns = [
        "io_type",
        "page_index",
        "num_cols",
        "num_row_groups",
        "cpu_time_us",
        "cpu_noise",
    ]
    rows = []
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines[2:]:  # skip header + separator
        text = line.strip()
        # Drop at most one outer pipe on each side so that "| a | b |" and
        # "a | b |" both put the first cell at index 0.
        if text.startswith("|"):
            text = text[1:]
        if text.endswith("|"):
            text = text[:-1]
        if not text.strip():
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        parts = [p.strip() for p in text.split("|")]
        rows.append(
            {
                "io_type": parts[0],
                "page_index": int(parts[1]),
                "num_cols": int(parts[2]),
                "num_row_groups": int(parts[3]),
                # parts[4] is the sample count, which is not used here.
                "cpu_time_us": parse_time_to_us(parts[5]),
                "cpu_noise": parse_noise(parts[6]),
            }
        )
    return pd.DataFrame(rows, columns=columns)
# Columns that identify one benchmark configuration in both tables.
# NOTE(review): io_type is parsed but is not part of the join key; if a table
# holds several io_type values for the same configuration, the merge pairs
# every combination of them -- confirm the inputs contain a single io_type.
KEY = ["page_index", "num_cols", "num_row_groups"]

baseline = parse_md("off.md")
new = parse_md("on.md")
merged = baseline.merge(new, on=KEY, suffixes=("_off", "_on"))

# Noise band around the baseline CPU time: off * (1 -/+ relative noise).
off_time = merged["cpu_time_us_off"]
off_noise = merged["cpu_noise_off"]
on_time = merged["cpu_time_us_on"]
merged["noise_lower"] = off_time * (1 - off_noise)
merged["noise_upper"] = off_time * (1 + off_noise)

# A row is flagged when the new CPU time is strictly below or above the band.
merged["outside_noise"] = on_time.lt(merged["noise_lower"]) | on_time.gt(
    merged["noise_upper"]
)

# Relative change of the new CPU time versus the baseline, in percent.
merged["delta_pct"] = (on_time - off_time) / off_time * 100

flagged = merged.loc[merged["outside_noise"]].copy()

display_cols = [
    *KEY,
    "cpu_time_us_off",
    "cpu_noise_off",
    "noise_lower",
    "noise_upper",
    "cpu_time_us_on",
    "delta_pct",
]

pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)

if not flagged.empty:
    print(f"{len(flagged)} row(s) outside the noise band:\n")
    flagged_table = flagged[display_cols].to_string(index=False)
    print(flagged_table)
else:
    print("No rows where on.md CPU time falls outside off.md noise band.")

# Full comparison, largest slowdown first.
print("\n--- All rows (delta_pct vs baseline) ---")
all_rows_cols = [
    *KEY,
    "cpu_time_us_off",
    "cpu_noise_off",
    "cpu_time_us_on",
    "delta_pct",
    "outside_noise",
]
all_rows = merged[all_rows_cols].sort_values("delta_pct", ascending=False)
print(all_rows.to_string(index=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment