Last active
February 5, 2025 11:57
-
-
Save psychoticbeef/d945bcdf1569da2214b791f92ca498d7 to your computer and use it in GitHub Desktop.
Calculate how many lines stayed unchanged between commits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
compare_commits.py

This script compares two Git commits (old and new) in the current repository,
taking into account renames. It does the following:
  - Lists all Python and C++ files in each commit.
  - Uses Git’s rename detection to map files that have been renamed.
  - Reads the content of each file from the two commits.
  - Uses difflib to count the number of unchanged lines between each paired file.
  - Sums up the total lines (for the selected files) in both commits and the
    number of unchanged lines.
  - Prints a summary report.

Usage:
    python compare_commits.py <old_commit> <new_commit>

Example:
    python compare_commits.py HEAD~1 HEAD
"""
import subprocess | |
import difflib | |
import sys | |
import argparse | |
def get_files_in_commit(commit, extensions):
    """
    List the files recorded in ``commit`` whose names end with a given suffix.

    Args:
        commit: Any revision accepted by ``git ls-tree`` (e.g. ``HEAD~1``).
        extensions: Iterable of filename suffixes to keep (e.g. ``[".py"]``).

    Returns:
        A list of matching paths, relative to the repository root, in the
        order git reports them.

    If ``git ls-tree`` fails, git's error output is printed to stderr and the
    process exits with status 1.
    """
    try:
        result = subprocess.run(
            [
                "git",
                "ls-tree",
                "-r",
                "--name-only",
                # NUL-terminated, unquoted output. Without -z git C-quotes
                # paths that contain non-ASCII or special characters, and the
                # suffix test below would silently drop those files.
                "-z",
                # Always list the whole tree with root-relative names, even
                # when invoked from a subdirectory, so the paths stay valid
                # for "git show <commit>:<path>".
                "--full-tree",
                commit,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running git ls-tree for commit {commit}: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    suffixes = tuple(extensions)
    # The output ends with a NUL, so the final split field is empty; skip it.
    return [path for path in result.stdout.split("\0") if path and path.endswith(suffixes)]
def get_file_lines(commit, file_path):
    """
    Return the contents of ``file_path`` at ``commit`` as a list of lines.

    The blob is read with ``git show <commit>:<file_path>``. Line endings are
    normalised to ``"\\n"`` and each returned line keeps its terminator.

    Args:
        commit: Revision to read from.
        file_path: Path of the file inside that revision.

    Returns:
        A list of ``str`` lines, or ``[]`` (after printing a warning to
        stderr) if git could not produce the file.
    """
    try:
        # Capture raw bytes instead of text=True: decoding with the locale
        # codec raises UnicodeDecodeError on files in another encoding.
        result = subprocess.run(
            ["git", "show", f"{commit}:{file_path}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # The file might not exist in the commit.
        reason = e.stderr.decode("utf-8", errors="replace").strip()
        print(f"Warning: could not get {file_path} from {commit}: {reason}", file=sys.stderr)
        return []

    # Normalise CRLF / CR to LF so a pure line-ending change is not reported
    # as every line having changed.
    data = result.stdout.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
    # bytes.splitlines only breaks on real newlines. str.splitlines would also
    # break on form feeds, U+2028, etc., inflating the line count.
    return [raw.decode("utf-8", errors="replace") for raw in data.splitlines(keepends=True)]
def get_rename_mapping(old_commit, new_commit):
    """
    Map renamed files from their path in ``old_commit`` to their path in
    ``new_commit`` using git's rename detection (``git diff -M``).

    Args:
        old_commit: Revision on the "before" side of the diff.
        new_commit: Revision on the "after" side of the diff.

    Returns:
        A dictionary ``{old_path: new_path}`` containing only renamed files.

    If ``git diff`` fails, git's error output is printed to stderr and the
    process exits with status 1.
    """
    try:
        result = subprocess.run(
            # -z gives NUL-separated fields with unquoted paths. Parsing the
            # default output with str.split() breaks on paths that contain
            # whitespace, and quoted paths never match the file lists.
            ["git", "diff", "--name-status", "-M", "-z", old_commit, new_commit],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running git diff for rename detection: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    mapping = {}
    # With -z every record is "<status>\0<path>\0", except renames and copies,
    # which are "<status><score>\0<old_path>\0<new_path>\0".
    fields = iter(result.stdout.split("\0"))
    for status in fields:
        if not status:
            # Empty field left behind by the trailing NUL.
            continue
        old_path = next(fields, "")
        if status[0] in "RC":
            # Two-path record: always consume the second path so the field
            # stream stays aligned, but only record genuine renames.
            new_path = next(fields, "")
            if status[0] == "R" and old_path and new_path:
                mapping[old_path] = new_path
    return mapping
def count_unchanged_lines(lines_old, lines_new):
    """
    Count the lines that are unchanged between two versions of a file.

    Args:
        lines_old: Sequence of lines from the old version.
        lines_new: Sequence of lines from the new version.

    Returns:
        The total size of all matching blocks found by
        ``difflib.SequenceMatcher``.
    """
    # autojunk must be off: with the default heuristic, any line that occurs
    # often in a file of 200+ lines (blank lines, braces, ...) is treated as
    # junk and cannot anchor a match, so unchanged lines get undercounted.
    # The trade-off is slower matching on large, repetitive files.
    matcher = difflib.SequenceMatcher(None, lines_old, lines_new, autojunk=False)
    # The last block is a dummy with size 0, so summing every block is fine.
    return sum(block.size for block in matcher.get_matching_blocks())
def main():
    """
    Command-line entry point.

    Parses the two commit references, pairs the Python/C++ files of the old
    commit with their counterpart in the new commit (same path, or the path
    git reports as the rename target), counts the unchanged lines of every
    pair, and prints a summary to stdout.

    Totals cover every selected file of the respective commit: old files with
    no counterpart count toward the old total, and new files with no
    counterpart count toward the new total.
    """
    parser = argparse.ArgumentParser(
        description="Compare two Git commits (taking into account renames) for Python and C++ files."
    )
    parser.add_argument("old_commit", help="Old commit reference (e.g., HEAD~1)")
    parser.add_argument("new_commit", help="New commit reference (e.g., HEAD)")
    args = parser.parse_args()

    # File extensions of interest.
    extensions = [".py", ".cpp", ".hpp", ".h", ".cc", ".cxx"]

    old_files = get_files_in_commit(args.old_commit, extensions)
    new_files = get_files_in_commit(args.new_commit, extensions)
    new_files_set = set(new_files)

    rename_mapping = get_rename_mapping(args.old_commit, args.new_commit)

    total_lines_old = 0
    total_lines_new = 0
    unchanged_lines_total = 0
    common_files_count = 0
    # New-commit paths that were matched with an old file. Tracking them
    # directly (instead of testing rename_mapping.values()) keeps lookups O(1)
    # and ensures a new file is only skipped below if it was really counted.
    paired_new_paths = set()

    for old_path in old_files:
        old_lines = get_file_lines(args.old_commit, old_path)
        # Every selected old file contributes to the old total, including
        # files that were deleted in the new commit.
        total_lines_old += len(old_lines)

        # Prefer the same path; otherwise fall back to the rename target.
        if old_path in new_files_set:
            new_path = old_path
        else:
            new_path = rename_mapping.get(old_path)
            if new_path not in new_files_set:
                # Deleted, or renamed to a file outside the selected set.
                continue

        common_files_count += 1
        paired_new_paths.add(new_path)

        new_lines = get_file_lines(args.new_commit, new_path)
        total_lines_new += len(new_lines)
        unchanged_lines_total += count_unchanged_lines(old_lines, new_lines)

    # Files of the new commit without a counterpart still add to its total.
    # This includes files renamed from an old file that was not selected.
    for new_path in new_files:
        if new_path not in paired_new_paths:
            total_lines_new += len(get_file_lines(args.new_commit, new_path))

    # Report the results.
    print("=== Comparison Summary ===")
    print("Common files (compared in both commits):", common_files_count)
    print("Total lines in old commit (selected files):", total_lines_old)
    print("Total lines in new commit (selected files):", total_lines_new)
    print("Total unchanged lines in common files:", unchanged_lines_total)
    if total_lines_new > 0:
        perc = (unchanged_lines_total / total_lines_new) * 100
        print(f"Percentage of unchanged lines (relative to new commit): {perc:.2f}%")
    else:
        print("No lines found in new commit for the selected files.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment