Skip to content

Instantly share code, notes, and snippets.

@psychoticbeef
Last active February 5, 2025 11:57
Show Gist options
  • Save psychoticbeef/d945bcdf1569da2214b791f92ca498d7 to your computer and use it in GitHub Desktop.
Save psychoticbeef/d945bcdf1569da2214b791f92ca498d7 to your computer and use it in GitHub Desktop.
Calculate how many lines stayed unchanged between commits
#!/usr/bin/env python3
"""
compare_commits.py
This script compares two Git commits (old and new) in the current repository,
taking into account renames. It does the following:
- Lists all Python and C++ files in each commit.
- Uses Git’s rename detection to map files that have been renamed.
- Reads the content of each file from the two commits.
- Uses difflib to count the number of unchanged lines between each paired file.
- Sums up the total lines (for the selected files) in both commits and the number of unchanged lines.
- Prints a summary report.
Usage:
python compare_commits.py <old_commit> <new_commit>
Example:
python compare_commits.py HEAD~1 HEAD
"""
import subprocess
import difflib
import sys
import argparse
def get_files_in_commit(commit, extensions):
    """
    List the files tracked in a commit whose names end with a given extension.

    Args:
        commit: Any revision git can resolve to a tree (e.g. "HEAD~1").
        extensions: Iterable of filename suffixes to keep (e.g. [".py", ".h"]).

    Returns:
        A list of file paths relative to the repository root, in the order
        git reports them.

    Prints git's error output and exits with status 1 if ``git ls-tree``
    fails (for example, when the commit cannot be resolved).
    """
    try:
        result = subprocess.run(
            [
                "git",
                "ls-tree",
                "-r",
                "--name-only",
                # --full-tree: always list the whole tree with root-relative
                # paths, even when the script is started from a subdirectory,
                # so the paths work with `git show <commit>:<path>`.
                "--full-tree",
                # -z: NUL-terminated, unquoted paths. Without it git wraps
                # paths containing non-ASCII or special characters in quotes
                # and escapes them, so they would never match the suffix
                # filter below.
                "-z",
                commit,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running git ls-tree for commit {commit}: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    # str.endswith accepts a tuple, which replaces the per-extension loop.
    suffixes = tuple(extensions)
    return [
        path
        for path in result.stdout.split("\0")
        if path and path.endswith(suffixes)
    ]
def get_file_lines(commit, file_path):
    """
    Return the contents of a file at a given commit as a list of lines.

    The file is read with ``git show <commit>:<file_path>``. Line endings are
    kept on each returned line.

    Args:
        commit: Revision to read the file from.
        file_path: Path of the file inside that revision.

    Returns:
        A list of lines, or an empty list (after printing a warning to
        stderr) if git could not produce the file, e.g. because it does not
        exist in that commit.
    """
    try:
        result = subprocess.run(
            ["git", "show", f"{commit}:{file_path}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # Decode explicitly as UTF-8 and replace undecodable bytes.
            # Relying on the locale encoding in strict mode raised an
            # uncaught UnicodeDecodeError on the first source file that was
            # not valid in that encoding, aborting the whole comparison.
            encoding="utf-8",
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # The file might not exist in the commit.
        print(f"Warning: could not get {file_path} from {commit}: {e.stderr.strip()}", file=sys.stderr)
        return []
    return result.stdout.splitlines(keepends=True)
def get_rename_mapping(old_commit, new_commit):
    """
    Build a mapping of renamed files between two commits.

    Runs ``git diff --name-status -M -z`` and collects every entry whose
    status starts with "R" (rename).

    Args:
        old_commit: Revision to diff from.
        new_commit: Revision to diff to.

    Returns:
        A dictionary ``{old_path: new_path}`` for renamed files.

    Prints git's error output and exits with status 1 if ``git diff`` fails.
    """
    try:
        result = subprocess.run(
            # -z: fields are NUL-terminated and paths are emitted verbatim.
            # Splitting the default output on whitespace broke every path
            # containing a space, and quoted paths kept their quotes.
            ["git", "diff", "--name-status", "-M", "-z", old_commit, new_commit],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running git diff for rename detection: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    # With -z the stream is: <status> NUL <path> NUL, except that rename and
    # copy entries carry two paths: R<score> NUL <old> NUL <new> NUL.
    fields = iter(result.stdout.split("\0"))
    mapping = {}
    for status in fields:
        if not status:
            # Trailing empty field after the final NUL terminator.
            continue
        if status[0] in ("R", "C"):
            old_path = next(fields, "")
            new_path = next(fields, "")
            # Only renames are recorded; a copy leaves the old file in place.
            if status[0] == "R" and old_path and new_path:
                mapping[old_path] = new_path
        else:
            # Every other status is followed by exactly one path; skip it.
            next(fields, None)
    return mapping
def count_unchanged_lines(lines_old, lines_new):
    """
    Count the lines that are unchanged between two versions of a file.

    Uses difflib.SequenceMatcher to find the matching blocks between the two
    line sequences and sums their sizes.

    Args:
        lines_old: Sequence of lines from the old version.
        lines_new: Sequence of lines from the new version.

    Returns:
        The total number of lines covered by matching blocks.
    """
    # autojunk=False: by default SequenceMatcher ignores "popular" items
    # (those making up more than about 1% of a sequence of 200+ items) when
    # looking for matches. For source files this affects blank lines, braces
    # and similar, and can make large unchanged regions go undetected.
    # Exact matching may be slower on very large files.
    matcher = difflib.SequenceMatcher(None, lines_old, lines_new, autojunk=False)
    # The final block is a zero-size sentinel, so it does not affect the sum.
    return sum(block.size for block in matcher.get_matching_blocks())
def main():
    """
    Compare two commits and print a summary of unchanged lines.

    Command-line arguments: ``old_commit`` and ``new_commit``.

    For the selected Python and C++ files, each file of the old commit is
    paired with the file at the same path in the new commit or, failing
    that, with its rename target. The summary reports:
    - the number of paired files,
    - the total lines of all selected files in each commit (including files
      that exist in only one of them),
    - the number of unchanged lines across the paired files,
    - that number as a percentage of the new commit's total lines.
    """
    parser = argparse.ArgumentParser(
        description="Compare two Git commits (taking into account renames) for Python and C++ files."
    )
    parser.add_argument("old_commit", help="Old commit reference (e.g., HEAD~1)")
    parser.add_argument("new_commit", help="New commit reference (e.g., HEAD)")
    args = parser.parse_args()

    # File extensions of interest.
    extensions = [".py", ".cpp", ".hpp", ".h", ".cc", ".cxx"]

    old_files = get_files_in_commit(args.old_commit, extensions)
    new_files = get_files_in_commit(args.new_commit, extensions)
    new_files_set = set(new_files)

    rename_mapping = get_rename_mapping(args.old_commit, args.new_commit)

    total_lines_old = 0
    total_lines_new = 0
    unchanged_lines_total = 0
    common_files_count = 0
    # New-commit paths already counted as the counterpart of an old file.
    # Tracking them explicitly (rather than checking the rename targets)
    # ensures a new file is skipped below only if it really was paired.
    paired_new_paths = set()

    for old_path in old_files:
        old_lines = get_file_lines(args.old_commit, old_path)
        # Every selected old file counts toward the old total, including
        # files that were deleted in the new commit.
        total_lines_old += len(old_lines)

        # Prefer the same path in the new commit; otherwise follow a rename.
        if old_path in new_files_set:
            new_path = old_path
        else:
            new_path = rename_mapping.get(old_path)
        if new_path is None or new_path not in new_files_set:
            # Deleted, or renamed to a file outside the selected extensions.
            continue

        common_files_count += 1
        paired_new_paths.add(new_path)

        new_lines = get_file_lines(args.new_commit, new_path)
        total_lines_new += len(new_lines)
        unchanged_lines_total += count_unchanged_lines(old_lines, new_lines)

    # Files with no counterpart among the old commit's selected files still
    # add to the new commit's total. This includes files renamed from a
    # path that was not selected in the old commit (e.g. a.txt -> a.py).
    for new_path in new_files:
        if new_path not in paired_new_paths:
            total_lines_new += len(get_file_lines(args.new_commit, new_path))

    # Report the results.
    print("=== Comparison Summary ===")
    print("Common files (compared in both commits):", common_files_count)
    print("Total lines in old commit (selected files):", total_lines_old)
    print("Total lines in new commit (selected files):", total_lines_new)
    print("Total unchanged lines in common files:", unchanged_lines_total)
    if total_lines_new > 0:
        perc = (unchanged_lines_total / total_lines_new) * 100
        print(f"Percentage of unchanged lines (relative to new commit): {perc:.2f}%")
    else:
        print("No lines found in new commit for the selected files.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment