Skip to content

Instantly share code, notes, and snippets.

@simon-mo
Created September 30, 2024 19:08
Show Gist options
  • Save simon-mo/08d6857e2021560aff7f8b46bda64e6d to your computer and use it in GitHub Desktop.
Save simon-mo/08d6857e2021560aff7f8b46bda64e6d to your computer and use it in GitHub Desktop.
import pandas as pd
# Aggregate per-commit git statistics into monthly totals per organization.
#
# Inputs (both read from the current working directory):
#   git_log_summary.csv           - one row per commit: Author, Email, Date,
#                                   Total Lines Changed
#   git_log_grouped_by_author.csv - one row per author: Author, Email,
#                                   Organization, and other fields
# Output:
#   git_log_grouped_by_month_and_org.csv - Year-Month, Organization,
#                                          Commit_Count, Total_Lines_Changed

# Load the two CSV files
commit_data = pd.read_csv('git_log_summary.csv')
author_org_data = pd.read_csv('git_log_grouped_by_author.csv')

# Keep exactly one organization row per (Author, Email) pair. If the author
# file lists a pair more than once, a left merge would repeat every commit of
# that author once per duplicate and inflate the counts and line totals below.
author_orgs = author_org_data[['Author', 'Email', 'Organization']].drop_duplicates(
    subset=['Author', 'Email']
)

# Attach the organization to every commit; commits whose author is not listed
# keep a missing Organization, which is filled in on the next line.
merged_data = pd.merge(commit_data, author_orgs, on=['Author', 'Email'], how='left')

# Fill missing organization names with "Community"
merged_data['Organization'] = merged_data['Organization'].fillna('Community')

# Coerce unparseable dates to NaT, then drop those rows so that only commits
# with a valid date are counted.
merged_data['Date'] = pd.to_datetime(merged_data['Date'], errors='coerce')
merged_data = merged_data.dropna(subset=['Date'])

# Add a new column holding the calendar month of each commit
merged_data['Year-Month'] = merged_data['Date'].dt.to_period('M')

# Group by month and organization to get the commit count and total lines changed
grouped_data = merged_data.groupby(['Year-Month', 'Organization']).agg(
    Commit_Count=('Date', 'size'),
    Total_Lines_Changed=('Total Lines Changed', 'sum')
).reset_index()

# Save the final result to a new CSV file
grouped_data.to_csv('git_log_grouped_by_month_and_org.csv', index=False)
print("CSV file 'git_log_grouped_by_month_and_org.csv' has been created.")
import subprocess
import re
import csv
from collections import defaultdict
# Function to get git log data
def get_git_log():
    """Run ``git log`` in the current directory and return its output lines.

    Each commit is printed as one ``author,email,YYYY-MM-DD`` header line
    (from ``--pretty=format:%an,%ae,%ad --date=short``), followed by the
    ``--shortstat`` summary line when git emits one.

    Returns:
        list[str]: The lines of git's standard output, without line endings.
        The list is empty when git prints nothing, including when git fails.
    """
    import sys  # local import: only needed to report git failures

    log_output = subprocess.run(
        ['git', 'log', '--shortstat', '--pretty=format:%an,%ae,%ad', '--date=short'],
        capture_output=True,
        text=True,
        # Decode explicitly instead of relying on the locale encoding, so
        # non-ASCII author names neither raise nor get garbled on systems
        # whose default encoding is not UTF-8.
        encoding='utf-8',
        errors='replace',
    )
    if log_output.returncode != 0:
        # stderr is captured above, so without this git's error message
        # (e.g. "not a git repository") would be lost. Report it but keep
        # returning whatever was printed, as before.
        print(
            f"git log exited with status {log_output.returncode}: "
            f"{log_output.stderr.strip()}",
            file=sys.stderr,
        )
    return log_output.stdout.splitlines()
# Function to parse the git log and create CSV files
# Header line written by --pretty=format:%an,%ae,%ad with --date=short.
# The author part is greedy, so a name that itself contains commas is kept
# whole; the email is the last comma-free field before the date.
_HEADER_RE = re.compile(r"^(?P<author>.*),(?P<email>[^,]*),(?P<date>\d{4}-\d{2}-\d{2})$")
# --shortstat summary line, e.g. " 1 file changed, 2 insertions(+)".
_SHORTSTAT_RE = re.compile(r"^\s*\d+ files? changed\b")
# git omits a part whose count is zero and uses the singular for a count of
# one, so insertions and deletions are matched independently.
_INSERTIONS_RE = re.compile(r"(\d+) insertions?\(\+\)")
_DELETIONS_RE = re.compile(r"(\d+) deletions?\(-\)")


def _parse_git_log_lines(lines):
    """Parse ``git log --shortstat`` lines into per-commit and per-author data.

    Returns a tuple ``(rows, commit_counts, change_totals)``:
    ``rows`` is a list of ``[author, email, date, total_changes]``, one per
    commit that has a shortstat line; ``commit_counts`` maps
    ``(author, email)`` to the number of header lines seen; ``change_totals``
    maps ``(author, email)`` to the summed insertions plus deletions.
    """
    rows = []
    commit_counts = defaultdict(int)
    change_totals = defaultdict(int)
    author = email = date = ""

    for line in lines:
        header = _HEADER_RE.match(line)
        if header:
            author = header.group("author").strip()
            email = header.group("email").strip()
            date = header.group("date")
            commit_counts[(author, email)] += 1
            continue

        if not _SHORTSTAT_RE.match(line):
            continue  # blank separator or unrelated line

        inserted = _INSERTIONS_RE.search(line)
        deleted = _DELETIONS_RE.search(line)
        total_changes = (
            (int(inserted.group(1)) if inserted else 0)
            + (int(deleted.group(1)) if deleted else 0)
        )
        # The stat line belongs to the most recently seen header.
        rows.append([author, email, date, total_changes])
        change_totals[(author, email)] += total_changes

    return rows, commit_counts, change_totals


def parse_git_log_to_csv(
    lines=None,
    summary_path='git_log_summary.csv',
    grouped_path='git_log_grouped_by_author.csv',
):
    """Parse git log output and write the per-commit and per-author CSV files.

    Args:
        lines: Iterable of ``git log --shortstat`` output lines whose commit
            headers look like ``author,email,YYYY-MM-DD``. When ``None``
            (the default) the lines are obtained from ``get_git_log()``.
        summary_path: Destination of the per-commit CSV with columns
            Author, Email, Date, Total Lines Changed.
        grouped_path: Destination of the per-author CSV with columns
            Author, Email, Number of Commits, Organization (left empty so it
            can be filled in later), Total Lines Changed. Rows are ordered by
            commit count, highest first.

    Both files are written as UTF-8 and a confirmation line is printed for
    each of them.
    """
    if lines is None:
        lines = get_git_log()

    rows, commit_counts, change_totals = _parse_git_log_lines(lines)

    # Write the per-commit data. UTF-8 is explicit so that the files do not
    # depend on the platform's locale encoding.
    with open(summary_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Email", "Date", "Total Lines Changed"])
        writer.writerows(rows)
    print(f"CSV file '{summary_path}' has been created.")

    # Prepare data for the grouped CSV, adding an empty 'organization' field.
    # sorted() is stable, so authors with equal counts keep first-seen order.
    ranked_authors = sorted(commit_counts.items(), key=lambda item: item[1], reverse=True)
    grouped_data = [
        [author, email, commit_count, "", change_totals[(author, email)]]
        for (author, email), commit_count in ranked_authors
    ]

    # Write the grouped data to a CSV file
    with open(grouped_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Email", "Number of Commits", "Organization", "Total Lines Changed"])
        writer.writerows(grouped_data)
    print(f"CSV file '{grouped_path}' has been created.")
# Script entry point: runs at module level (also on import), reads the git
# history via `git log` in the current working directory, and writes
# git_log_summary.csv and git_log_grouped_by_author.csv.
parse_git_log_to_csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment