Created
April 16, 2025 16:03
-
-
Save fzakaria/92c87dc02b530c8b0818deab7a5c2d89 to your computer and use it in GitHub Desktop.
Find the number of git commits that have changed public signatures in Java code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
import subprocess | |
import re | |
import datetime | |
from collections import defaultdict, Counter | |
import argparse | |
import os | |
import sys | |
# --- Configuration ---
# Size of the history window, in days, used when -d/--days is not supplied.
DEFAULT_DAYS_BACK = 30

# Heuristic regex for a line that looks like a public Java method signature:
#   * optional leading whitespace and any number of simple annotations (@Foo),
#   * the keyword 'public',
#   * a next token that does not begin with 'class', 'interface' or 'enum',
#   * a parenthesised parameter list somewhere later on the same line.
# It is deliberately simple and is applied to ADDED and REMOVED diff lines, so
# expect both false positives and false negatives.
SIGNATURE_PATTERN = re.compile(
    r'^\s*(?:@\w+\s*)*public\s+(?!class|interface|enum)\S+.*\([^)]*\)'
)

# Regexes that mark a path as test code (adjust as needed for your project).
# A path is treated as a test file when any one of them matches.
TEST_FILE_PATTERNS = [
    re.compile(expression)
    for expression in (
        r'Test\.java$',      # unit tests named *Test.java
        r'IT\.java$',        # integration tests named *IT.java
        r'/test/',           # common Maven/Gradle test source folder
        r'src/test/java/',   # explicit path
    )
]
# --- Helper Functions --- | |
def run_git_command(command, repo_path='.'):
    """Execute *command* (an argv list) in *repo_path* and return its stdout.

    Returns:
        The captured standard output with surrounding whitespace stripped,
        or None when the command exits with a non-zero status or any other
        unexpected exception is raised.

    A FileNotFoundError is reported as a missing 'git' executable and
    terminates the process with exit status 1.
    """
    run_options = {
        'cwd': repo_path,
        'capture_output': True,
        'text': True,
        'check': True,
        'encoding': 'utf-8',
        # Drop undecodable bytes rather than failing on odd commit
        # messages or diffs.
        'errors': 'ignore',
    }
    try:
        completed = subprocess.run(command, **run_options)
    except FileNotFoundError:
        print("Error: 'git' command not found. Is Git installed and in your PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        joined = ' '.join(command)
        print(f"Error running command '{joined}':", file=sys.stderr)
        print(f"Return Code: {err.returncode}", file=sys.stderr)
        print(f"Stderr: {err.stderr}", file=sys.stderr)
        # Not fatal here: callers decide whether a failed command should
        # abort the run or just be skipped.
        return None
    except Exception as err:
        print(f"An unexpected error occurred running git: {err}", file=sys.stderr)
        return None
    return completed.stdout.strip()
def is_test_file(filepath):
    """Return True if *filepath* matches at least one TEST_FILE_PATTERNS regex.

    Backslashes are replaced with forward slashes before matching, so the
    patterns only need to be written with '/' separators.
    """
    normalized = filepath.replace('\\', '/')
    for candidate in TEST_FILE_PATTERNS:
        if candidate.search(normalized):
            return True
    return False
def analyze_diff_for_public_change(diff_text, pattern=None):
    """Heuristically decide whether a diff changes a public method signature.

    The diff (e.g. the output of 'git show HASH -- file') is scanned line by
    line. Removed ('-') and added ('+') lines whose stripped content matches
    the signature pattern are collected into two sets. A change is reported
    only when at least one signature exists solely on the removed side AND at
    least one exists solely on the added side.

    Signatures that appear identically on both sides are ignored: that is what
    a method which was merely moved or re-indented looks like, and it is not a
    signature change.

    This remains a rough upper bound: removing one public method and adding an
    unrelated one in the same diff is still reported.

    Args:
        diff_text: Unified diff text. Empty or None yields False.
        pattern: Optional compiled regex used to recognise signature lines.
            Defaults to the module-level SIGNATURE_PATTERN.

    Returns:
        True if the diff looks like it modifies a public signature, else False.
    """
    if not diff_text:
        return False
    if pattern is None:
        pattern = SIGNATURE_PATTERN

    removed_sigs = set()
    added_sigs = set()
    for line in diff_text.splitlines():
        # '--- a/path' / '+++ b/path' are file headers, not content lines.
        if line.startswith(('--- ', '+++ ')):
            continue
        if line.startswith('-'):
            bucket = removed_sigs
        elif line.startswith('+'):
            bucket = added_sigs
        else:
            continue
        content = line[1:].strip()
        # Basic comment check: skip '//' comments and javadoc continuation lines.
        if content.startswith(('//', '*')):
            continue
        if pattern.search(content):
            bucket.add(content)

    # Require a side-unique signature on BOTH sides; identical text on both
    # sides cancels out (moved / re-indented method).
    return bool(removed_sigs - added_sigs) and bool(added_sigs - removed_sigs)
# --- Main Logic --- | |
def _parse_commit_log(log_output):
    """Parse 'HASH ISO-DATE' log lines into [{'hash': str, 'date': date or None}]."""
    commits = []
    for line in log_output.splitlines():
        if not line.strip():
            continue
        parts = line.split(' ', 1)
        if len(parts) != 2:
            continue
        commit_hash, date_str = parts
        # Handle a 'Z' suffix for compatibility with Python < 3.11 by
        # replacing it with the equivalent UTC offset.
        if date_str.endswith('Z'):
            date_str = date_str[:-1] + '+00:00'
        try:
            commit_date = datetime.datetime.fromisoformat(date_str).date()
        except ValueError:
            print(f"Warning: Could not parse date '{date_str}' for commit {commit_hash}. Skipping date part.", file=sys.stderr)
            commit_date = None
        commits.append({'hash': commit_hash, 'date': commit_date})
    return commits


def _print_report(num_commits_changed, total_commits, sorted_files, changes_by_day):
    """Print the textual summary: commit ratio, per-file histogram, per-day counts."""
    print("\n--- Results ---")
    # Percentage of commits
    if total_commits > 0:
        percentage = (num_commits_changed / total_commits) * 100
        print(f"Commits with potential public Java method signature changes: {num_commits_changed} / {total_commits} ({percentage:.2f}%)")
    else:
        print("No commits analyzed.")

    # Histogram of changed files
    print("\nFiles with potential public method signature changes (Top 20):")
    if sorted_files:
        top_files = sorted_files[:20]
        max_len = max(len(path) for path, _ in top_files)
        max_count = sorted_files[0][1]  # list is sorted by count, descending
        hist_scale = 50  # Width of the histogram bars
        for filepath, count in top_files:
            bar = '#' * int((count / max_count) * hist_scale)
            print(f" {filepath:<{max_len}} | {count:<5} | {bar}")
        if len(sorted_files) > 20:
            print(f" ... and {len(sorted_files) - 20} more files.")
    else:
        print(" No files detected with changes.")

    # Per-day data (simple text output)
    print("\nPotential changes per day:")
    if changes_by_day:
        for day, count in sorted(changes_by_day.items()):
            print(f" {day}: {count}")
    else:
        print(" No changes detected.")


def _save_plots(sorted_files, changes_by_day):
    """Save the two PNG charts; Matplotlib is optional and its absence is only noted."""
    try:
        import matplotlib.pyplot as plt
        import matplotlib.dates as mdates
        print("\nGenerating plots...")

        # File Histogram Plot
        if sorted_files:
            top_files = sorted_files[:15]  # Plot top 15 files
            # Show only the file name for brevity.
            filenames = [path.split('/')[-1] for path, _ in top_files]
            counts = [count for _, count in top_files]
            plt.figure(figsize=(12, 8))
            plt.barh(range(len(filenames)), counts, tick_label=filenames)
            plt.xlabel('Number of Commits with Changes')
            plt.ylabel('File')
            plt.title('Top 15 Files with Potential Public Method Signature Changes')
            plt.gca().invert_yaxis()  # Show highest count at top
            plt.tight_layout()
            plt.savefig('signature_changes_file_histogram.png')
            print(" - Saved file histogram to signature_changes_file_histogram.png")

        # Daily Changes Line Graph Plot
        if changes_by_day:
            dates = sorted(changes_by_day.keys())
            counts = [changes_by_day[d] for d in dates]
            datetimes = [datetime.datetime.strptime(d, '%Y-%m-%d') for d in dates]
            plt.figure(figsize=(12, 6))
            plt.plot(datetimes, counts, marker='o', linestyle='-')
            plt.xlabel('Date')
            plt.ylabel('Number of Commits with Changes')
            plt.title('Commits with Potential Signature Changes Per Day')
            plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator(maxticks=15))
            plt.gcf().autofmt_xdate()  # Rotate date labels
            plt.grid(True, axis='y', linestyle=':')
            plt.tight_layout()
            plt.savefig('signature_changes_daily_trend.png')
            print(" - Saved daily trend graph to signature_changes_daily_trend.png")

        plt.close('all')  # Close plot figures
        print("Plotting complete.")
    except ImportError:
        print("\nNote: Matplotlib not found. Skipping plot generation.")
        print(" Install it using: pip install matplotlib")
    except Exception as e:
        # Plotting is a best-effort extra; report the problem and carry on.
        print(f"\nAn error occurred during plotting: {e}", file=sys.stderr)


def main(repo_path, days_back):
    """Analyze recent history of a Git repository for public Java signature changes.

    Steps:
      1. List commits from the last *days_back* days ('git log --since').
      2. For each commit, list its Modified/Added files, keep non-test '.java'
         files, and run analyze_diff_for_public_change() on each file's diff.
      3. Print a text report and, if Matplotlib is importable, save two PNGs
         into the current working directory.

    Args:
        repo_path: Path to the root of the Git repository.
        days_back: Number of days of history to inspect.

    Exits with status 1 if *repo_path* has no '.git' entry or the log cannot
    be retrieved. Returns None otherwise.
    """
    # '.git' is a directory in a normal clone but a plain file in linked
    # worktrees and submodules, so test for existence rather than isdir().
    if not os.path.exists(os.path.join(repo_path, '.git')):
        print(f"Error: '{repo_path}' does not appear to be a Git repository.", file=sys.stderr)
        sys.exit(1)
    print(f"Analyzing Git history for the last {days_back} days in '{repo_path}'...")

    # 1. Get commits within the timeframe
    since_date = (datetime.datetime.now() - datetime.timedelta(days=days_back)).strftime('%Y-%m-%d')
    # The argument list is handed to git without a shell, so the date must
    # not be wrapped in quotes (they would reach git as literal characters).
    # Output per commit: hash, committer date (strict ISO 8601).
    log_output = run_git_command(
        ['git', 'log', f'--since={since_date}', '--pretty=format:%H %cI'],
        repo_path=repo_path
    )
    if log_output is None:
        print("Error retrieving Git log. Exiting.", file=sys.stderr)
        sys.exit(1)
    if not log_output:
        print(f"No commits found in the last {days_back} days.")
        return

    commits = _parse_commit_log(log_output)
    total_commits_in_period = len(commits)
    print(f"Found {total_commits_in_period} commits since {since_date}.")

    # 2. Analyze each commit
    commits_with_sig_changes = set()
    changed_files_count = Counter()      # file path -> number of commits flagged
    changes_by_day = defaultdict(int)    # 'YYYY-MM-DD' -> number of commits flagged
    print("Analyzing commits for public method signature changes...")
    for i, commit in enumerate(commits):
        commit_hash = commit['hash']
        commit_date = commit['date']
        # '\r' keeps the progress indicator on a single terminal line.
        print(f" Checking commit {i+1}/{total_commits_in_period}: {commit_hash[:8]}...", end='\r')

        # Files changed in this commit:
        #   --diff-filter=MA  -> only Modified or Added files
        #   --pretty=format:  -> suppress the commit message itself
        diff_output = run_git_command(
            ['git', 'show', commit_hash, '--name-status', '--oneline', '--diff-filter=MA', '--pretty=format:'],
            repo_path=repo_path
        )
        if diff_output is None:
            print(f"\nWarning: Could not get diff for commit {commit_hash}. Skipping.", file=sys.stderr)
            continue  # Skip commit if diff fails

        commit_had_change = False
        for line in diff_output.splitlines():
            if not line.strip():
                continue
            try:
                _status, filepath = line.split('\t', 1)
            except ValueError:
                # Not a 'STATUS<TAB>path' line (e.g. a summary line); ignore it.
                continue
            if not filepath.endswith('.java') or is_test_file(filepath):
                continue  # Skip non-java or test files

            # --unified=0 drops context lines so only real +/- lines remain.
            file_diff = run_git_command(
                ['git', 'show', commit_hash, '--unified=0', '--', filepath],
                repo_path=repo_path
            )
            if file_diff is None:
                print(f"\nWarning: Could not get diff for file {filepath} in commit {commit_hash}. Skipping file.", file=sys.stderr)
                continue  # Skip file if diff fails

            if analyze_diff_for_public_change(file_diff):
                commits_with_sig_changes.add(commit_hash)
                changed_files_count[filepath] += 1
                commit_had_change = True

        # Count each commit once per day, however many of its files were
        # flagged, so the daily series really is "commits with changes".
        if commit_had_change and commit_date:
            changes_by_day[commit_date.strftime('%Y-%m-%d')] += 1

    print("\nAnalysis complete.")

    # 3. Report Results
    # most_common() with no argument returns every entry, highest count first.
    sorted_files = changed_files_count.most_common()
    _print_report(len(commits_with_sig_changes), total_commits_in_period, sorted_files, changes_by_day)

    # Optional: Plotting with Matplotlib (requires installation: pip install matplotlib)
    _save_plots(sorted_files, changes_by_day)
if __name__ == "__main__":
    # Command-line entry point: an optional day count plus an optional
    # repository path, both handed straight to main().
    cli = argparse.ArgumentParser(
        description="Analyze Git history for potential public Java method signature changes."
    )
    cli.add_argument(
        "-d", "--days",
        type=int,
        default=DEFAULT_DAYS_BACK,
        help=f"Number of days back to analyze (default: {DEFAULT_DAYS_BACK})",
    )
    # nargs='?' makes the positional optional; it falls back to the
    # current directory.
    cli.add_argument(
        "repo_path",
        nargs='?',
        default='.',
        help="Path to the Git repository (default: current directory)",
    )
    options = cli.parse_args()
    main(options.repo_path, options.days)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you add / remove two distinct methods, it will get flagged as an ABI change.
That's okay, as this is best effort and it yields a higher upper bound (an overestimate).