Skip to content

Instantly share code, notes, and snippets.

@lemire
Created October 8, 2025 19:16
Show Gist options
  • Save lemire/fabe642b256987f94695c03b3d7c2515 to your computer and use it in GitHub Desktop.
Save lemire/fabe642b256987f94695c03b3d7c2515 to your computer and use it in GitHub Desktop.
Python script to explore a potential performance regression in CRoaring
import subprocess
import json
import os
import sys
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import defaultdict
import tempfile
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Commit that anchors the range handed to get_commits(); it is also the
# revision checked out before the initial CMake configure step.
START_COMMIT = "d3b85c149b2d3c9e3343cfac860732a640486bba"
# Working directory for every shell command; the script is expected to be
# launched from the repository root.
REPO_DIR = "."
# Directory that receives one benchmark JSON file per processed commit.
RESULTS_DIR = "benchmark_results"
# CMake build tree, and the sub-directory in which the "bench" executable
# is looked up.
BUILD_DIR = "build"
BENCHMARKS_DIR = os.path.join(BUILD_DIR, "microbenchmarks")
def run_command(cmd, cwd=REPO_DIR, check=True):
    """Run a shell command and return its standard output.

    Args:
        cmd: Command line, handed to the shell unchanged (``shell=True``).
        cwd: Directory in which the command runs; defaults to ``REPO_DIR``.
        check: When true, a non-zero exit status aborts the whole script.

    Returns:
        The captured standard output with surrounding whitespace stripped.

    Exits:
        Calls ``sys.exit(1)`` if ``check`` is true and the command fails.
    """
    try:
        result = subprocess.run(
            cmd, shell=True, cwd=cwd, capture_output=True, text=True, check=check
        )
    except subprocess.CalledProcessError as e:
        # Diagnostics go to stderr so they are not mixed into (or lost with)
        # redirected normal output.
        print(f"Error running '{cmd}': {e.stderr}", file=sys.stderr)
        # Some build drivers relay compiler errors on stdout; without this the
        # message above can be empty and the failure impossible to diagnose.
        if e.stdout:
            print(e.stdout, file=sys.stderr)
        sys.exit(1)
    return result.stdout.strip()
def get_commits(start_commit, end_branch="master"):
    """Return the commit hashes in ``start_commit..end_branch``, oldest first.

    The list comes from ``git rev-list --reverse``; an empty list is returned
    when git prints nothing for the range.
    """
    rev_list = run_command(f"git rev-list --reverse {start_commit}..{end_branch}")
    if not rev_list:
        return []
    return rev_list.splitlines()
def get_commit_timestamp(commit_hash):
    """Return the ``%ct`` (committer date) Unix timestamp of a commit.

    Falls back to 0 when git produces no output for the commit.
    """
    raw_seconds = run_command(f"git log -1 --format=%ct {commit_hash}")
    if raw_seconds:
        return int(raw_seconds)
    return 0
def get_commit_datetime(commit_hash):
    """Return the commit's timestamp as a naive, local-time ``datetime``."""
    return datetime.fromtimestamp(get_commit_timestamp(commit_hash))
# Multipliers that convert a Google Benchmark "time_unit" into nanoseconds.
_NS_PER_UNIT = {"ns": 1.0, "us": 1e3, "ms": 1e6, "s": 1e9}


def _safe_filename(name):
    """Return *name* with every character unsuitable for a file name replaced by '_'."""
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in name)


def _collect_results(json_file, commit_dt, benchmark_data):
    """Append the timings found in one benchmark JSON file to *benchmark_data*."""
    with open(json_file, 'r') as f:
        data = json.load(f)
    # Google Benchmark JSON structure: top-level "benchmarks" array.
    for benchmark in data.get("benchmarks", []):
        name = benchmark.get("name", "")
        if benchmark.get("error_occurred"):
            # A run that reported an error carries no meaningful timing.
            print(f"Warning: {name} reported an error in {json_file}")
            continue
        # Use cpu_time if available, else real_time.
        time_value = benchmark.get("cpu_time", benchmark.get("real_time", 0))
        if isinstance(time_value, (int, float)):
            # Times are expressed in the entry's own unit; normalise to
            # nanoseconds so the plot's axis label is accurate. An unknown
            # or missing unit is treated as nanoseconds.
            scale = _NS_PER_UNIT.get(benchmark.get("time_unit", "ns"), 1.0)
            benchmark_data[name].append((commit_dt, float(time_value) * scale))
        else:
            print(f"Warning: Invalid time for {name} in {json_file}")


def _benchmark_commit(commit, benchmark_data):
    """Check out, build and benchmark *commit*, recording its timings."""
    run_command(f"git checkout {commit}")
    run_command(f"cmake --build {BUILD_DIR} --target bench")

    # One git call provides both the plot's x value and the file-name stamp.
    commit_dt = datetime.fromtimestamp(get_commit_timestamp(commit))
    timestamp_str = commit_dt.strftime("%Y%m%d_%H%M%S")
    # The short hash keeps commits that share a one-second timestamp from
    # overwriting each other's result file.
    json_file = os.path.join(RESULTS_DIR, f"results{timestamp_str}_{commit[:12]}.json")

    bench_cmd = f"./{BENCHMARKS_DIR}/bench --benchmark_out_format=json --benchmark_out={json_file}"
    run_command(bench_cmd)

    if os.path.exists(json_file):
        _collect_results(json_file, commit_dt, benchmark_data)
    else:
        print(f"Warning: JSON file {json_file} not created.")


def _plot_benchmark(bench_name, points):
    """Plot one benchmark's (datetime, nanoseconds) series to a PDF and return its name."""
    points.sort(key=lambda x: x[0])  # Sort by datetime
    dates, times = zip(*points)
    plt.figure(figsize=(10, 6))
    plt.plot(dates, times, marker='o', linestyle='-', markersize=4)
    plt.title(f"Benchmark: {bench_name}")
    plt.xlabel("Commit Date")
    plt.ylabel("Time (nanoseconds)")
    plt.ylim(bottom=0)  # Start y-axis at zero
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator())
    plt.xticks(rotation=45)
    plt.tight_layout()
    # Benchmark names may contain '/' or other characters that are invalid
    # (or create unintended sub-paths) in a file name.
    pdf_name = f"{_safe_filename(bench_name)}_benchmark.pdf"
    plt.savefig(pdf_name, format='pdf')
    plt.close()
    return pdf_name


def main():
    """Benchmark every commit after START_COMMIT and plot each benchmark over time.

    For each commit returned by ``get_commits(START_COMMIT)`` the script checks
    the commit out, builds the ``bench`` target, runs it with JSON output into
    ``RESULTS_DIR`` and collects the per-benchmark times. Afterwards one PDF
    per benchmark (with at least two data points) is written to the current
    directory. The revision that was checked out when the script started is
    restored before plotting, including when a step fails.

    Exits with status 1 when there are no commits to process or when a
    checked command fails (via ``run_command``).
    """
    os.makedirs(BUILD_DIR, exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)

    commits = get_commits(START_COMMIT)
    if not commits:
        print("No commits found between start and master.")
        sys.exit(1)
    print(f"Found {len(commits)} commits to process.")

    # Remember where the user was so the repository is not left on a detached
    # HEAD. "--abbrev-ref" prints the literal "HEAD" when already detached, in
    # which case the exact commit is recorded instead.
    original_ref = run_command("git rev-parse --abbrev-ref HEAD")
    if original_ref == "HEAD":
        original_ref = run_command("git rev-parse HEAD")

    benchmark_data = defaultdict(list)  # benchmark_name -> list of (datetime, time_ns)
    try:
        # Checkout start commit and initial cmake
        run_command(f"git checkout {START_COMMIT}")
        run_command(f"cmake -B {BUILD_DIR} -D ENABLE_ROARING_MICROBENCHMARKS=ON")
        for i, commit in enumerate(commits, start=1):
            print(f"Processing commit {i}/{len(commits)}: {commit}")
            _benchmark_commit(commit, benchmark_data)
    finally:
        # Best effort: check=False so a failed restore cannot mask the
        # original error or abort the plotting below.
        run_command(f"git checkout {original_ref}", check=False)

    # Generate plots
    pdf_names = []
    for bench_name, points in benchmark_data.items():
        if len(points) < 2:
            print(f"Skipping {bench_name}: insufficient data points.")
            continue
        pdf_name = _plot_benchmark(bench_name, points)
        pdf_names.append(pdf_name)
        print(f"Generated: {pdf_name}")

    if pdf_names:
        print("\nGenerated PDF files:")
        for name in pdf_names:
            print(f"- {name}")
    else:
        print("No plots generated.")
# Run the benchmark sweep only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment