Created
October 8, 2025 19:16
-
-
Save lemire/fabe642b256987f94695c03b3d7c2515 to your computer and use it in GitHub Desktop.
Python script to explore a potential performance regression in CRoaring
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import subprocess | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
| import matplotlib.pyplot as plt | |
| import matplotlib.dates as mdates | |
| from collections import defaultdict | |
| import tempfile | |
# Configuration
# Commit that bounds the range: passed to `git rev-list START..master`, and
# also checked out by main() before the initial cmake configure.
START_COMMIT = "d3b85c149b2d3c9e3343cfac860732a640486bba"
# Default working directory for every command started via run_command().
REPO_DIR = "." # Assuming script runs in repo root
# CMake build tree; created by main() if missing.
BUILD_DIR = "build"
# Directory in which main() expects to find the `bench` executable.
BENCHMARKS_DIR = os.path.join(BUILD_DIR, "microbenchmarks")
# Directory receiving one JSON result file per benchmarked commit.
RESULTS_DIR = "benchmark_results"
def run_command(cmd, cwd=REPO_DIR, check=True):
    """Run a shell command and return its stripped standard output.

    Args:
        cmd: Command line, interpreted by the shell (``shell=True``). Only
            pass trusted text; nothing is quoted or escaped here.
        cwd: Directory in which the command is executed.
        check: When true, a non-zero exit status aborts the whole script
            with exit code 1 after printing a diagnostic. When false, the
            exit status is ignored and whatever was written to stdout is
            returned.

    Returns:
        The command's stdout with leading/trailing whitespace removed.
    """
    try:
        result = subprocess.run(
            cmd,
            shell=True,
            cwd=cwd,
            capture_output=True,
            text=True,
            check=check,
        )
    except subprocess.CalledProcessError as e:
        # Some tools report failures on stdout only; fall back to it so the
        # diagnostic is never empty when stderr was not used.
        details = e.stderr or e.stdout
        # Diagnostics belong on stderr so they do not mix with normal output.
        print(f"Error running '{cmd}': {details}", file=sys.stderr)
        sys.exit(1)
    return result.stdout.strip()
def get_commits(start_commit, end_branch="master"):
    """List the commit hashes in ``start_commit..end_branch``, oldest first.

    The range is the one `git rev-list` computes for ``A..B`` and the
    ``--reverse`` flag flips git's default newest-first ordering. An empty
    list is returned when git prints nothing.
    """
    revisions = run_command(
        f"git rev-list --reverse {start_commit}..{end_branch}"
    )
    if not revisions:
        return []
    return revisions.splitlines()
def get_commit_timestamp(commit_hash):
    """Return the commit's Unix timestamp as an int.

    Uses git's ``%ct`` format placeholder; yields 0 when git prints nothing.
    """
    raw = run_command(f"git log -1 --format=%ct {commit_hash}")
    if not raw:
        return 0
    return int(raw)
def get_commit_datetime(commit_hash):
    """Return the commit's timestamp as a naive, local-time ``datetime``."""
    return datetime.fromtimestamp(get_commit_timestamp(commit_hash))
# Multipliers converting a Google Benchmark "time_unit" value to nanoseconds.
_NS_PER_TIME_UNIT = {"ns": 1.0, "us": 1e3, "ms": 1e6, "s": 1e9}


def _safe_filename(name):
    """Return *name* with every character unsafe in a file name replaced by '_'."""
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in name)


def _current_git_ref():
    """Return the checked-out branch name, or the commit hash if HEAD is detached."""
    ref = run_command("git rev-parse --abbrev-ref HEAD")
    if ref == "HEAD":
        # Detached HEAD: there is no branch name, so remember the exact commit.
        ref = run_command("git rev-parse HEAD")
    return ref


def _record_benchmarks(json_file, commit_dt, benchmark_data):
    """Append (commit_dt, time in ns) for each benchmark found in *json_file*."""
    with open(json_file, "r") as f:
        data = json.load(f)
    # Google Benchmark JSON structure: top-level "benchmarks" array.
    for benchmark in data.get("benchmarks", []):
        name = benchmark.get("name", "")
        # Use cpu_time if available, else real_time.
        value = benchmark.get("cpu_time", benchmark.get("real_time", 0))
        if not isinstance(value, (int, float)):
            print(f"Warning: Invalid time for {name} in {json_file}")
            continue
        # Times are expressed in the entry's own time_unit; normalise so
        # every plotted point really is in nanoseconds. Unknown or missing
        # units are taken as nanoseconds.
        scale = _NS_PER_TIME_UNIT.get(benchmark.get("time_unit", "ns"), 1.0)
        benchmark_data[name].append((commit_dt, float(value) * scale))


def _plot_benchmark(bench_name, points):
    """Plot one benchmark's timings over commit date; return the PDF file name."""
    points.sort(key=lambda x: x[0])  # Sort by datetime
    dates, times = zip(*points)
    plt.figure(figsize=(10, 6))
    plt.plot(dates, times, marker='o', linestyle='-', markersize=4)
    plt.title(f"Benchmark: {bench_name}")
    plt.xlabel("Commit Date")
    plt.ylabel("Time (nanoseconds)")
    plt.ylim(bottom=0)  # Start y-axis at zero
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator())
    plt.xticks(rotation=45)
    plt.tight_layout()
    # Benchmark names may contain '/' (argument suffixes) and other
    # characters that would be read as a path or rejected by the file system.
    pdf_name = f"{_safe_filename(bench_name)}_benchmark.pdf"
    plt.savefig(pdf_name, format='pdf')
    plt.close()
    return pdf_name


def main():
    """Benchmark every commit after START_COMMIT and plot the timings.

    For each commit in ``START_COMMIT..master`` (oldest first) the commit is
    checked out, the ``bench`` target is built, and the benchmark is run with
    JSON output written into RESULTS_DIR. One PDF per benchmark is then
    written to the current directory, for benchmarks with at least two data
    points. The ref that was checked out when the script started is restored
    afterwards, including when a step fails.
    """
    os.makedirs(BUILD_DIR, exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)

    commits = get_commits(START_COMMIT)
    if not commits:
        print("No commits found between start and master.")
        sys.exit(1)
    print(f"Found {len(commits)} commits to process.")

    benchmark_data = defaultdict(list)  # benchmark_name -> list of (datetime, time_ns)

    # Remember where we started so the repository is not left on a detached
    # HEAD pointing at the last benchmarked commit.
    original_ref = _current_git_ref()
    try:
        # Checkout start commit and initial cmake
        run_command(f"git checkout {START_COMMIT}")
        run_command(f"cmake -B {BUILD_DIR} -D ENABLE_ROARING_MICROBENCHMARKS=ON")

        for index, commit in enumerate(commits, start=1):
            print(f"Processing commit {index}/{len(commits)}: {commit}")
            run_command(f"git checkout {commit}")
            run_command(f"cmake --build {BUILD_DIR} --target bench")

            # One git query per commit serves both the file name and the plot.
            commit_dt = get_commit_datetime(commit)
            timestamp_str = commit_dt.strftime("%Y%m%d_%H%M%S")
            json_file = os.path.join(RESULTS_DIR, f"results{timestamp_str}.json")

            # Run benchmark
            bench_cmd = f"./{BENCHMARKS_DIR}/bench --benchmark_out_format=json --benchmark_out={json_file}"
            run_command(bench_cmd)

            if os.path.exists(json_file):
                _record_benchmarks(json_file, commit_dt, benchmark_data)
            else:
                print(f"Warning: JSON file {json_file} not created.")
    finally:
        # check=False: a failed restore must not mask the original error.
        run_command(f"git checkout {original_ref}", check=False)

    # Generate plots
    pdf_names = []
    for bench_name, points in benchmark_data.items():
        if len(points) < 2:
            print(f"Skipping {bench_name}: insufficient data points.")
            continue
        pdf_name = _plot_benchmark(bench_name, points)
        pdf_names.append(pdf_name)
        print(f"Generated: {pdf_name}")

    if pdf_names:
        print("\nGenerated PDF files:")
        for name in pdf_names:
            print(f"- {name}")
    else:
        print("No plots generated.")
# Run the benchmark sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment