Skip to content

Instantly share code, notes, and snippets.

@fzakaria
Created April 16, 2025 16:03
Show Gist options
  • Save fzakaria/92c87dc02b530c8b0818deab7a5c2d89 to your computer and use it in GitHub Desktop.
Save fzakaria/92c87dc02b530c8b0818deab7a5c2d89 to your computer and use it in GitHub Desktop.
Find the number of git commits that have changed public signatures in Java code
#! /usr/bin/env python3
import subprocess
import re
import datetime
from collections import defaultdict, Counter
import argparse
import os
import sys
# --- Configuration ---
# Size of the history window, in days, used when --days is not given.
DEFAULT_DAYS_BACK = 30

# Heuristic matcher for a line that looks like a public Java method signature:
# optional annotations, the 'public' keyword, a following token that is not
# 'class' / 'interface' / 'enum', and a parenthesised parameter list.
# It is deliberately simple and is applied to ADDED or REMOVED diff lines,
# so expect both false positives and false negatives.
SIGNATURE_PATTERN = re.compile(
    r'^\s*(?:@\w+\s*)*public\s+(?!class|interface|enum)\S+.*\([^)]*\)'
)

# Path patterns that mark a file as test code (adjust for your project).
TEST_FILE_PATTERNS = [
    re.compile(expression)
    for expression in (
        r'Test\.java$',      # unit tests
        r'IT\.java$',        # integration tests
        r'/test/',           # common Maven/Gradle test source folder
        r'src/test/java/',   # explicit path
    )
]
# --- Helper Functions ---
def run_git_command(command, repo_path='.'):
    """Run a Git command and return its stripped standard output.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell is
            involved), e.g. ``['git', 'log', '-1']``.
        repo_path: Directory the command is executed in.

    Returns:
        The command's stdout with surrounding whitespace removed, or ``None``
        when the command exits with a non-zero status or an unexpected error
        occurs. Callers use ``None`` to decide whether the failure is fatal.

    Raises:
        SystemExit: With exit code 1 when the executable cannot be found or
            ``repo_path`` does not exist.
    """
    try:
        result = subprocess.run(
            command,
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8',
            # Ignore decoding errors for potentially weird commit messages/diffs
            errors='ignore'
        )
        return result.stdout.strip()
    except FileNotFoundError:
        # subprocess raises FileNotFoundError both for a missing executable
        # and for a missing working directory; report the one that applies.
        if not os.path.isdir(repo_path):
            print(f"Error: repository path '{repo_path}' does not exist or is not a directory.", file=sys.stderr)
        else:
            print("Error: 'git' command not found. Is Git installed and in your PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error running command '{' '.join(command)}':", file=sys.stderr)
        print(f"Return Code: {e.returncode}", file=sys.stderr)
        print(f"Stderr: {e.stderr}", file=sys.stderr)
        # Not fatal here: a single odd commit should not abort the whole
        # analysis, so the caller decides what to do with None.
        return None
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller continue.
        print(f"An unexpected error occurred running git: {e}", file=sys.stderr)
        return None
def is_test_file(filepath):
    """Return True when *filepath* matches one of TEST_FILE_PATTERNS.

    Backslashes are converted to forward slashes first so the patterns also
    apply to Windows-style paths.
    """
    normalized = filepath.replace('\\', '/')
    for candidate in TEST_FILE_PATTERNS:
        if candidate.search(normalized):
            return True
    return False
def analyze_diff_for_public_change(diff_text, signature_pattern=None):
    """Report whether a diff appears to modify a public method signature.

    Heuristic: collect the removed ('-') and added ('+') lines that look like
    public method signatures, cancel out signatures that occur identically on
    both sides (code that was merely moved or re-indented), and flag the diff
    only if signature lines remain on BOTH sides.

    This is still an upper bound: removing one public method and adding an
    unrelated one in the same file is reported as a change.

    Args:
        diff_text: Output of ``git show HASH -- file``.
        signature_pattern: Optional compiled regex used to recognise
            signature lines; defaults to the module-level SIGNATURE_PATTERN.

    Returns:
        True if at least one signature line was removed and at least one
        different signature line was added, otherwise False.
    """
    pattern = SIGNATURE_PATTERN if signature_pattern is None else signature_pattern

    removed_sigs = Counter()
    added_sigs = Counter()
    for line in diff_text.splitlines():
        if line.startswith('-'):
            bucket = removed_sigs
        elif line.startswith('+'):
            bucket = added_sigs
        else:
            continue
        # Strip the diff marker and indentation so a re-indented signature
        # compares equal to its previous form.
        content = line[1:].strip()
        if content.startswith(('//', '*')):
            # Basic comment check
            continue
        if pattern.search(content):
            bucket[content] += 1

    # Counter subtraction keeps only positive counts, i.e. signatures that
    # exist on one side of the diff but not (or less often) on the other.
    only_removed = removed_sigs - added_sigs
    only_added = added_sigs - removed_sigs
    return bool(only_removed) and bool(only_added)
# --- Main Logic ---
def _parse_commit_log(log_output):
    """Turn ``git log --pretty=format:'%H %cI'`` output into commit records.

    Returns a list of ``{'hash': str, 'date': datetime.date or None}`` dicts in
    the order the lines appear. Lines without a hash/date pair are skipped; an
    unparseable date is reported on stderr and stored as ``None``.
    """
    commits = []
    for line in log_output.splitlines():
        if not line.strip():
            continue
        parts = line.split(' ', 1)
        if len(parts) != 2:
            continue
        commit_hash, date_str = parts
        # datetime.fromisoformat() only accepts a trailing 'Z' from Python
        # 3.11 on, so rewrite it as the equivalent UTC offset.
        if date_str.endswith('Z'):
            date_str = date_str[:-1] + '+00:00'
        try:
            commit_date = datetime.datetime.fromisoformat(date_str).date()
        except ValueError:
            print(f"Warning: Could not parse date '{date_str}' for commit {commit_hash}. Skipping date part.", file=sys.stderr)
            commit_date = None
        commits.append({'hash': commit_hash, 'date': commit_date})
    return commits


def _files_with_signature_changes(commit_hash, repo_path):
    """List non-test .java files in a commit whose diff looks like a signature change.

    Returns ``None`` when the list of changed files cannot be obtained, so the
    caller can tell a failed lookup from a commit without matches.
    """
    # '--diff-filter=MA' -> only Modified or Added files
    # '--pretty=format:' -> suppress the commit message itself
    name_status = run_git_command(
        ['git', 'show', commit_hash, '--name-status', '--oneline', '--diff-filter=MA', '--pretty=format:'],
        repo_path=repo_path
    )
    if name_status is None:
        return None

    flagged = []
    for line in name_status.splitlines():
        if not line.strip():
            continue
        _status, tab, filepath = line.partition('\t')
        if not tab:
            # Merge commit summaries or other lines that are not "STATUS<TAB>path".
            continue
        if not filepath.endswith('.java') or is_test_file(filepath):
            continue
        # unified=0 drops context lines, which keeps the diff small; only the
        # '+' / '-' lines are inspected anyway.
        file_diff = run_git_command(
            ['git', 'show', commit_hash, '--unified=0', '--', filepath],
            repo_path=repo_path
        )
        if file_diff is None:
            print(f"\nWarning: Could not get diff for file {filepath} in commit {commit_hash}. Skipping file.", file=sys.stderr)
            continue
        if analyze_diff_for_public_change(file_diff):
            flagged.append(filepath)
    return flagged


def _print_report(num_commits_changed, total_commits, sorted_files, changes_by_day):
    """Print the textual summary: commit ratio, per-file histogram, per-day counts."""
    print("\n--- Results ---")
    if total_commits > 0:
        percentage = (num_commits_changed / total_commits) * 100
        print(f"Commits with potential public Java method signature changes: {num_commits_changed} / {total_commits} ({percentage:.2f}%)")
    else:
        print("No commits analyzed.")

    print("\nFiles with potential public method signature changes (Top 20):")
    if sorted_files:
        top_files = sorted_files[:20]
        max_len = max(len(path) for path, _count in top_files)
        max_count = sorted_files[0][1]
        hist_scale = 50  # Width of the histogram bars
        for filepath, count in top_files:
            bar_len = int((count / max_count) * hist_scale) if max_count > 0 else 0
            bar = '#' * bar_len
            print(f" {filepath:<{max_len}} | {count:<5} | {bar}")
        if len(sorted_files) > 20:
            print(f" ... and {len(sorted_files) - 20} more files.")
    else:
        print(" No files detected with changes.")

    print("\nPotential changes per day:")
    if changes_by_day:
        for day, count in sorted(changes_by_day.items()):
            print(f" {day}: {count}")
    else:
        print(" No changes detected.")


def _plot_results(sorted_files, changes_by_day):
    """Save the two PNG charts; skipped with a note when Matplotlib is missing."""
    # Optional dependency (pip install matplotlib), hence the local import.
    try:
        import matplotlib.pyplot as plt
        import matplotlib.dates as mdates
    except ImportError:
        print("\nNote: Matplotlib not found. Skipping plot generation.")
        print(" Install it using: pip install matplotlib")
        return

    try:
        print("\nGenerating plots...")
        # File histogram plot
        if sorted_files:
            top_files = sorted_files[:15]
            # Show only the file name for brevity.
            filenames = [path.split('/')[-1] for path, _count in top_files]
            counts = [count for _path, count in top_files]
            plt.figure(figsize=(12, 8))
            plt.barh(range(len(filenames)), counts, tick_label=filenames)
            plt.xlabel('Number of Commits with Changes')
            plt.ylabel('File')
            plt.title('Top 15 Files with Potential Public Method Signature Changes')
            plt.gca().invert_yaxis()  # Show highest count at top
            plt.tight_layout()
            plt.savefig('signature_changes_file_histogram.png')
            print(" - Saved file histogram to signature_changes_file_histogram.png")
        # Daily trend line plot
        if changes_by_day:
            dates = sorted(changes_by_day)
            counts = [changes_by_day[d] for d in dates]
            datetimes = [datetime.datetime.strptime(d, '%Y-%m-%d') for d in dates]
            plt.figure(figsize=(12, 6))
            plt.plot(datetimes, counts, marker='o', linestyle='-')
            plt.xlabel('Date')
            plt.ylabel('Number of Commits with Changes')
            plt.title('Commits with Potential Signature Changes Per Day')
            plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator(maxticks=15))
            plt.gcf().autofmt_xdate()  # Rotate date labels
            plt.grid(True, axis='y', linestyle=':')
            plt.tight_layout()
            plt.savefig('signature_changes_daily_trend.png')
            print(" - Saved daily trend graph to signature_changes_daily_trend.png")
        print("Plotting complete.")
    except Exception as e:
        # Plotting is a best-effort extra; the text report has already been printed.
        print(f"\nAn error occurred during plotting: {e}", file=sys.stderr)
    finally:
        plt.close('all')  # Release figures even when plotting failed part-way


def main(repo_path, days_back):
    """Analyze recent Git history for public Java method signature changes.

    Lists the commits of the last *days_back* days, inspects every added or
    modified non-test ``.java`` file in each of them, prints a summary to
    stdout and, if Matplotlib is importable, saves two PNG charts under
    relative file names.

    Args:
        repo_path: Path to the root of the Git working tree to analyze.
        days_back: Number of days of history to look at.

    Raises:
        SystemExit: With exit code 1 when *repo_path* is not a Git repository
            or the commit log cannot be retrieved.
    """
    # '.git' is a directory in a normal clone but a plain file in linked
    # worktrees and submodules, so test for existence rather than isdir().
    if not os.path.exists(os.path.join(repo_path, '.git')):
        print(f"Error: '{repo_path}' does not appear to be a Git repository.", file=sys.stderr)
        sys.exit(1)
    print(f"Analyzing Git history for the last {days_back} days in '{repo_path}'...")

    # 1. Get commits within the timeframe
    since_date = (datetime.datetime.now() - datetime.timedelta(days=days_back)).strftime('%Y-%m-%d')
    # No shell is involved, so the value must not be wrapped in quote
    # characters: they would be passed to git literally.
    log_output = run_git_command(
        ['git', 'log', f'--since={since_date}', '--pretty=format:%H %cI'],
        repo_path=repo_path
    )
    if log_output is None:
        print("Error retrieving Git log. Exiting.", file=sys.stderr)
        sys.exit(1)
    if not log_output:
        print(f"No commits found in the last {days_back} days.")
        return

    commits = _parse_commit_log(log_output)
    total_commits_in_period = len(commits)
    print(f"Found {total_commits_in_period} commits since {since_date}.")

    # 2. Analyze each commit
    commits_with_sig_changes = set()
    changed_files_count = Counter()  # file path -> number of commits flagged for it
    changes_by_day = defaultdict(int)  # 'YYYY-MM-DD' -> number of flagged commits
    print("Analyzing commits for public method signature changes...")
    for position, commit in enumerate(commits, start=1):
        commit_hash = commit['hash']
        commit_date = commit['date']
        print(f" Checking commit {position}/{total_commits_in_period}: {commit_hash[:8]}...", end='\r')
        flagged_files = _files_with_signature_changes(commit_hash, repo_path)
        if flagged_files is None:
            print(f"\nWarning: Could not get diff for commit {commit_hash}. Skipping.", file=sys.stderr)
            continue
        if not flagged_files:
            continue
        commits_with_sig_changes.add(commit_hash)
        changed_files_count.update(flagged_files)
        # Count the commit once per day, however many files it touched, so the
        # daily numbers match the "Number of Commits" labels in the report.
        if commit_date:
            changes_by_day[commit_date.strftime('%Y-%m-%d')] += 1
    print("\nAnalysis complete.")

    # 3. Report results
    # most_common() sorts by count, descending; ties keep insertion order.
    sorted_files = changed_files_count.most_common()
    _print_report(len(commits_with_sig_changes), total_commits_in_period, sorted_files, changes_by_day)
    _plot_results(sorted_files, changes_by_day)
if __name__ == "__main__":
    # Command-line entry point: an optional repository path plus an optional
    # -d/--days window, both forwarded to main().
    cli = argparse.ArgumentParser(
        description="Analyze Git history for potential public Java method signature changes."
    )
    cli.add_argument(
        "-d",
        "--days",
        type=int,
        default=DEFAULT_DAYS_BACK,
        help=f"Number of days back to analyze (default: {DEFAULT_DAYS_BACK})",
    )
    # nargs='?' makes the positional optional; it falls back to the current directory.
    cli.add_argument(
        "repo_path",
        nargs='?',
        default='.',
        help="Path to the Git repository (default: current directory)",
    )
    options = cli.parse_args()
    main(options.repo_path, options.days)
@fzakaria
Copy link
Author

If you add / remove two distinct methods, it will get flagged as an ABI change.
That's okay as this is best effort, and it is a higher upper bound.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment