# Source: GitHub Gist TomAugspurger/069abda3bbfe7728b910187a09ff7e5f
# Author: @TomAugspurger
# Created: June 11, 2026 13:08
"""
Compare two benchmark markdown tables (on.md vs off.md).
Flags rows where the CPU time in on.md falls outside the noise band of off.md.
"""
import re
import pandas as pd
def parse_time_to_us(s: str) -> float:
    """Convert a time string like '1.361 ms' or '725.928 us' to microseconds.

    Recognized unit suffixes are ``ns``, ``us``, ``ms`` and ``s``; whitespace
    between the number and the unit is optional.

    Args:
        s: Text made of a decimal number followed by a unit suffix.
            Surrounding whitespace is ignored.

    Returns:
        The duration expressed in microseconds.

    Raises:
        ValueError: If ``s`` does not start with a number followed by one of
            the recognized units.
    """
    s = s.strip()
    # The bare "s" alternative is listed last so the two-letter units are
    # tried first.
    m = re.match(r"([\d.]+)\s*(ns|us|ms|s)", s)
    if not m:
        raise ValueError(f"Cannot parse time: {s!r}")
    value, unit = float(m.group(1)), m.group(2)
    if unit == "ns":
        return value / 1_000
    if unit == "ms":
        return value * 1_000
    if unit == "s":
        return value * 1_000_000
    # Remaining unit is "us": already in microseconds.
    return value
def parse_noise(s: str) -> float:
    """Turn a percentage string such as '7.87%' into a fraction (0.0787).

    Surrounding whitespace and any trailing ``%`` characters are removed
    before the number is converted.
    """
    percent_text = s.strip().rstrip("%")
    percent_value = float(percent_text)
    return percent_value / 100
def parse_md(path: str) -> pd.DataFrame:
    """Parse a benchmark markdown table into a DataFrame.

    The first two lines of the file (header row and separator row) are
    skipped. Every following non-blank line is read as one data row whose
    cells are separated by ``|``. Rows may be written with or without the
    outer pipes (``| a | b |`` or ``a | b``).

    Expected cell order: io_type, page_index, num_cols, num_row_groups,
    Samples, CPU Time, Noise (cpu), GPU Time, Noise (gpu),
    colchunks_per_sec, peak_memory_usage.

    Args:
        path: Path of the markdown file to read.

    Returns:
        A DataFrame with the columns ``io_type``, ``page_index``,
        ``num_cols``, ``num_row_groups``, ``cpu_time_us`` and ``cpu_noise``.
    """
    rows = []
    with open(path) as f:
        lines = f.readlines()
    for line in lines[2:]:  # skip header + separator
        text = line.strip()
        if not text:
            # A blank line (e.g. at the end of the file) carries no data.
            continue
        # Drop one leading and one trailing pipe so the first cell lands at
        # index 0 regardless of whether the row has outer pipes.
        if text.startswith("|"):
            text = text[1:]
        if text.endswith("|"):
            text = text[:-1]
        parts = [p.strip() for p in text.split("|")]
        rows.append(
            {
                "io_type": parts[0],
                "page_index": int(parts[1]),
                "num_cols": int(parts[2]),
                "num_row_groups": int(parts[3]),
                "cpu_time_us": parse_time_to_us(parts[5]),
                "cpu_noise": parse_noise(parts[6]),
            }
        )
    return pd.DataFrame(rows)
# Columns that together identify one benchmark configuration.  io_type is part
# of the key: without it, files holding several io_type values would have each
# row paired with every row of the other file that shares the numeric axes,
# producing comparisons across unrelated io types.
KEY = ["io_type", "page_index", "num_cols", "num_row_groups"]

baseline = parse_md("off.md")
new = parse_md("on.md")

# Inner join: only configurations present in both files are compared.
merged = baseline.merge(new, on=KEY, suffixes=("_off", "_on"))

# Noise band around the baseline CPU time: time * (1 -/+ relative noise).
merged["noise_lower"] = merged["cpu_time_us_off"] * (1 - merged["cpu_noise_off"])
merged["noise_upper"] = merged["cpu_time_us_off"] * (1 + merged["cpu_noise_off"])
merged["outside_noise"] = (merged["cpu_time_us_on"] < merged["noise_lower"]) | (
    merged["cpu_time_us_on"] > merged["noise_upper"]
)

# Relative change of on.md versus off.md, in percent (positive = slower).
merged["delta_pct"] = (
    (merged["cpu_time_us_on"] - merged["cpu_time_us_off"]) / merged["cpu_time_us_off"] * 100
)

flagged = merged[merged["outside_noise"]].copy()
display_cols = KEY + [
    "cpu_time_us_off",
    "cpu_noise_off",
    "noise_lower",
    "noise_upper",
    "cpu_time_us_on",
    "delta_pct",
]

pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)

if flagged.empty:
    print("No rows where on.md CPU time falls outside off.md noise band.")
else:
    print(f"{len(flagged)} row(s) outside the noise band:\n")
    print(flagged[display_cols].to_string(index=False))

# Full listing, largest slowdown first.
print("\n--- All rows (delta_pct vs baseline) ---")
print(
    merged[KEY + ["cpu_time_us_off", "cpu_noise_off", "cpu_time_us_on", "delta_pct", "outside_noise"]]
    .sort_values("delta_pct", ascending=False)
    .to_string(index=False)
)
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)