Created
September 30, 2024 19:08
-
-
Save simon-mo/08d6857e2021560aff7f8b46bda64e6d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Aggregate per-commit statistics into per-month, per-organization totals.
#
# Inputs (read from the current working directory):
#   git_log_summary.csv           - one row per commit: Author, Email, Date,
#                                   Total Lines Changed
#   git_log_grouped_by_author.csv - one row per author: Author, Email,
#                                   Organization (plus other columns)
# Output:
#   git_log_grouped_by_month_and_org.csv

# Load the two CSV files
commit_data = pd.read_csv('git_log_summary.csv')
author_org_data = pd.read_csv('git_log_grouped_by_author.csv')

# Keep a single organization per (Author, Email) pair. In a left merge every
# duplicate key on the right side would repeat the matching commit rows and
# inflate both the commit count and the line totals.
author_orgs = author_org_data[['Author', 'Email', 'Organization']].drop_duplicates(
    subset=['Author', 'Email']
)

# Attach the organization to every commit; commits whose author is missing
# from the lookup keep a NaN organization for now.
merged_data = pd.merge(commit_data, author_orgs, on=['Author', 'Email'], how='left')

# Fill missing organization names with "Community"
merged_data['Organization'] = merged_data['Organization'].fillna('Community')

# Coerce unparseable dates to NaT, then drop those rows so that only commits
# with a valid date are aggregated.
merged_data['Date'] = pd.to_datetime(merged_data['Date'], errors='coerce')
merged_data = merged_data.dropna(subset=['Date'])

# Add a new column for year-month
merged_data['Year-Month'] = merged_data['Date'].dt.to_period('M')

# Per (month, organization): number of commit rows and summed lines changed.
grouped_data = merged_data.groupby(['Year-Month', 'Organization']).agg(
    Commit_Count=('Date', 'size'),
    Total_Lines_Changed=('Total Lines Changed', 'sum')
).reset_index()

# Save the final result to a new CSV file
grouped_data.to_csv('git_log_grouped_by_month_and_org.csv', index=False)
print("CSV file 'git_log_grouped_by_month_and_org.csv' has been created.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import re | |
import csv | |
from collections import defaultdict | |
# Function to get git log data
def get_git_log():
    """Return the lines printed by ``git log`` for the current directory.

    The log is requested with ``--shortstat`` and a ``%an,%ae,%ad`` header
    using short dates, so each commit contributes an
    ``author,email,YYYY-MM-DD`` line, followed by a shortstat summary line
    when git reports one.

    Returns:
        list[str]: git's standard output split into lines. The exit status
        is not checked, so a failed invocation simply returns whatever was
        written to stdout (possibly nothing).
    """
    log_output = subprocess.run(
        ['git', 'log', '--shortstat', '--pretty=format:%an,%ae,%ad', '--date=short'],
        capture_output=True,
        text=True,
        # git emits log output as UTF-8 by default, while text=True alone
        # decodes with the locale's preferred encoding. Decode explicitly so
        # non-ASCII author names are neither mangled nor able to abort the run.
        encoding='utf-8',
        errors='replace',
    )
    return log_output.stdout.splitlines()
# Commit header line: "<author>,<email>,<YYYY-MM-DD>". The author part is
# greedy so that names containing commas stay intact; the email and the date
# are always the last two comma-separated fields.
_HEADER_RE = re.compile(r"^(?P<author>.*),(?P<email>[^,]*),(?P<date>\d{4}-\d{2}-\d{2})$")
# Shortstat line: git prints "1 file changed" or "N files changed".
_FILES_CHANGED_RE = re.compile(r"\d+ files? changed")
# Either count may be absent, and each is singular when the count is 1.
_INSERTIONS_RE = re.compile(r"(\d+) insertions?\(\+\)")
_DELETIONS_RE = re.compile(r"(\d+) deletions?\(-\)")


def _stat_count(pattern, line):
    """Return the integer captured by *pattern* in *line*, or 0 if absent."""
    found = pattern.search(line)
    return int(found.group(1)) if found else 0


def _parse_git_log_lines(lines):
    """Parse header/shortstat lines into per-commit rows and per-author totals.

    Returns a tuple ``(rows, commit_counts, lines_changed)`` where ``rows`` is
    a list of ``[author, email, date, total_changes]`` (one per shortstat
    line), and the two dicts are keyed by ``(author, email)``.
    """
    rows = []
    commit_counts = defaultdict(int)
    lines_changed = defaultdict(int)
    author = email = date = ""

    for line in lines:
        header = _HEADER_RE.match(line)
        if header:
            author = header.group("author").strip()
            email = header.group("email").strip()
            date = header.group("date").strip()
            commit_counts[(author, email)] += 1
        elif _FILES_CHANGED_RE.search(line):
            # The stat line belongs to the most recently seen header.
            total_changes = _stat_count(_INSERTIONS_RE, line) + _stat_count(_DELETIONS_RE, line)
            rows.append([author, email, date, total_changes])
            lines_changed[(author, email)] += total_changes

    return rows, commit_counts, lines_changed


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as UTF-8 CSV."""
    # UTF-8 keeps non-ASCII author names portable across platforms.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)


# Function to parse the git log and create CSV files
def parse_git_log_to_csv(lines=None):
    """Parse git log output and write two CSV summaries to the working directory.

    Args:
        lines: Optional iterable of log lines in the format produced by
            ``git log --shortstat --pretty=format:%an,%ae,%ad --date=short``.
            When omitted, the lines are obtained from ``get_git_log()``.

    Files written:
        git_log_summary.csv: one row per commit that has a shortstat line
            (Author, Email, Date, Total Lines Changed), where the total is
            insertions plus deletions.
        git_log_grouped_by_author.csv: one row per (author, email) pair,
            ordered by commit count descending, with an empty Organization
            column left to be filled in.
    """
    if lines is None:
        lines = get_git_log()

    rows, commit_counts, lines_changed = _parse_git_log_lines(lines)

    _write_csv('git_log_summary.csv', ["Author", "Email", "Date", "Total Lines Changed"], rows)
    print("CSV file 'git_log_summary.csv' has been created.")

    # Prepare data for the grouped CSV, adding an empty 'organization' field
    grouped_data = [
        [author, email, commit_count, "", lines_changed[(author, email)]]
        for (author, email), commit_count in sorted(
            commit_counts.items(), key=lambda item: item[1], reverse=True
        )
    ]
    _write_csv(
        'git_log_grouped_by_author.csv',
        ["Author", "Email", "Number of Commits", "Organization", "Total Lines Changed"],
        grouped_data,
    )
    print("CSV file 'git_log_grouped_by_author.csv' has been created.")
# Parse the git log and create the CSVs only when executed as a script, so
# that importing this module does not run git or write files.
if __name__ == "__main__":
    parse_git_log_to_csv()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment