simon-mo · September 30, 2024 19:08
diff --git a/git-analysis.py b/git-analysis.py
 import pandas as pd

 # Read the per-commit summary (author, email, date, total lines changed)
 # and the per-author table that carries the organization column
 commit_data = pd.read_csv('git_log_summary.csv')
 author_org_data = pd.read_csv('git_log_grouped_by_author.csv')

 # Left-join every commit to its author's organization, keyed on (Author, Email)
 org_columns = author_org_data[['Author', 'Email', 'Organization']]
 merged_data = commit_data.merge(org_columns, how='left', on=['Author', 'Email'])

 # Authors with no organization recorded are labelled "Community"
 merged_data['Organization'] = merged_data['Organization'].fillna('Community')

 # Parse the dates; unparseable values become NaT and those rows are discarded
 merged_data['Date'] = pd.to_datetime(merged_data['Date'], errors='coerce')
 merged_data = merged_data.dropna(subset=['Date'])

 # Bucket each commit into its calendar month
 merged_data['Year-Month'] = merged_data['Date'].dt.to_period('M')

 # Per (month, organization): number of rows and summed line changes
 monthly_by_org = merged_data.groupby(['Year-Month', 'Organization'])
 grouped_data = monthly_by_org.agg(
     Commit_Count=('Date', 'size'),
     Total_Lines_Changed=('Total Lines Changed', 'sum'),
 ).reset_index()

 # Persist the aggregated table without the positional index
 output_path = 'git_log_grouped_by_month_and_org.csv'
 grouped_data.to_csv(output_path, index=False)

 print("CSV file 'git_log_grouped_by_month_and_org.csv' has been created.")
diff --git a/git-summary.py b/git-summary.py
 import subprocess
 import re
 import csv
 from collections import defaultdict

 # Function to get git log data
 def get_git_log():
    """Return the output of ``git log --shortstat`` as a list of lines.

    Each commit is emitted as an ``author,email,date`` header line (date in
    ``YYYY-MM-DD`` form because of ``--date=short``), optionally followed by
    a shortstat line. The command runs in the current working directory.

    Returns:
        list[str]: the captured standard output split into lines.

    Raises:
        subprocess.CalledProcessError: if ``git`` exits with a non-zero
            status (for example when run outside a repository). Without
            this, a failed run would silently produce empty CSV files.
    """
    log_output = subprocess.run(
        ['git', 'log', '--shortstat', '--pretty=format:%an,%ae,%ad', '--date=short'],
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8 instead of relying on the locale; any
        # undecodable byte is replaced rather than aborting the whole run.
        encoding='utf-8',
        errors='replace',
        check=True,
    )
    return log_output.stdout.splitlines()

 # Function to parse the git log and create CSV files
 def parse_git_log_to_csv():
    """Parse the git log and write two CSV summaries to the working directory.

    Files written:

    * ``git_log_summary.csv`` - one row per commit that has a shortstat
      line: Author, Email, Date, Total Lines Changed (insertions plus
      deletions).
    * ``git_log_grouped_by_author.csv`` - one row per (author, email) pair,
      ordered by commit count (highest first), with an empty Organization
      column and the pair's summed line changes.
    """
    rows, commit_count_by_author, changes_by_author = _parse_git_log_lines(get_git_log())

    # UTF-8 is requested explicitly so non-ASCII author names do not depend
    # on the platform's default encoding.
    with open('git_log_summary.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Email", "Date", "Total Lines Changed"])
        writer.writerows(rows)

    print("CSV file 'git_log_summary.csv' has been created.")

    # The Organization column is left blank on purpose; authors that never
    # produced a shortstat line get a total of 0 from the defaultdict.
    grouped_data = [
        [author, email, commit_count, "", changes_by_author[(author, email)]]
        for (author, email), commit_count in sorted(
            commit_count_by_author.items(), key=lambda item: item[1], reverse=True
        )
    ]

    with open('git_log_grouped_by_author.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Email", "Number of Commits", "Organization", "Total Lines Changed"])
        writer.writerows(grouped_data)

    print("CSV file 'git_log_grouped_by_author.csv' has been created.")


 def _parse_git_log_lines(lines):
    """Turn raw git log lines into per-commit rows and per-author totals.

    Args:
        lines: iterable of strings; header lines have the form
            ``author,email,YYYY-MM-DD`` and may be followed by a shortstat
            line such as ``3 files changed, 10 insertions(+), 2 deletions(-)``.

    Returns:
        tuple: ``(rows, commit_counts, change_totals)`` where ``rows`` is a
        list of ``[author, email, date, total_changes]`` lists, and the other
        two are defaultdicts keyed by ``(author, email)`` holding the number
        of header lines seen and the summed line changes.
    """
    # Anchor on the trailing date and let the author group be greedy, so an
    # author name containing commas stays in one piece. The email is not
    # required to look like "x@y.tld", so "user@localhost" is still a header.
    header_pattern = re.compile(r"(.*),([^,]*),(\d{4}-\d{2}-\d{2})")
    # git uses singular wording for a count of one and omits a side that has
    # no changes, so insertions and deletions are matched independently.
    stat_pattern = re.compile(r"\d+ files? changed")
    insertions_pattern = re.compile(r"(\d+) insertions?\(\+\)")
    deletions_pattern = re.compile(r"(\d+) deletions?\(-\)")

    rows = []
    commit_counts = defaultdict(int)
    change_totals = defaultdict(int)
    author = email = date = ""

    for line in lines:
        header = header_pattern.fullmatch(line.strip())
        if header:
            author, email, date = (part.strip() for part in header.groups())
            commit_counts[(author, email)] += 1
        elif stat_pattern.search(line):
            added = insertions_pattern.search(line)
            removed = deletions_pattern.search(line)
            insertions = int(added.group(1)) if added else 0
            deletions = int(removed.group(1)) if removed else 0
            total_changes = insertions + deletions
            # The stat line belongs to the most recently seen header.
            rows.append([author, email, date, total_changes])
            change_totals[(author, email)] += total_changes

    return rows, commit_counts, change_totals

 # Script entry point: this call runs as soon as the file is executed,
 # fetching the git log and writing both CSV files via relative paths.
 parse_git_log_to_csv()
	import pandas as pd

	# Read the per-commit summary (author, email, date, total lines changed)
	# and the per-author table that carries the organization column
	commit_data = pd.read_csv('git_log_summary.csv')
	author_org_data = pd.read_csv('git_log_grouped_by_author.csv')

	# Left-join every commit to its author's organization, keyed on (Author, Email)
	org_columns = author_org_data[['Author', 'Email', 'Organization']]
	merged_data = commit_data.merge(org_columns, how='left', on=['Author', 'Email'])

	# Authors with no organization recorded are labelled "Community"
	merged_data['Organization'] = merged_data['Organization'].fillna('Community')

	# Parse the dates; unparseable values become NaT and those rows are discarded
	merged_data['Date'] = pd.to_datetime(merged_data['Date'], errors='coerce')
	merged_data = merged_data.dropna(subset=['Date'])

	# Bucket each commit into its calendar month
	merged_data['Year-Month'] = merged_data['Date'].dt.to_period('M')

	# Per (month, organization): number of rows and summed line changes
	monthly_by_org = merged_data.groupby(['Year-Month', 'Organization'])
	grouped_data = monthly_by_org.agg(
	    Commit_Count=('Date', 'size'),
	    Total_Lines_Changed=('Total Lines Changed', 'sum'),
	).reset_index()

	# Persist the aggregated table without the positional index
	output_path = 'git_log_grouped_by_month_and_org.csv'
	grouped_data.to_csv(output_path, index=False)

	print("CSV file 'git_log_grouped_by_month_and_org.csv' has been created.")
	import subprocess
	import re
	import csv
	from collections import defaultdict

	# Function to get git log data
	def get_git_log():
	    """Return the output of ``git log --shortstat`` as a list of lines.

	    Each commit is emitted as an ``author,email,date`` header line (date
	    in ``YYYY-MM-DD`` form because of ``--date=short``), optionally
	    followed by a shortstat line. The command runs in the current
	    working directory.

	    Returns:
	        list[str]: the captured standard output split into lines.

	    Raises:
	        subprocess.CalledProcessError: if ``git`` exits with a non-zero
	            status (for example when run outside a repository). Without
	            this, a failed run would silently produce empty CSV files.
	    """
	    log_output = subprocess.run(
	        ['git', 'log', '--shortstat', '--pretty=format:%an,%ae,%ad', '--date=short'],
	        capture_output=True,
	        text=True,
	        # Decode explicitly as UTF-8 instead of relying on the locale; any
	        # undecodable byte is replaced rather than aborting the whole run.
	        encoding='utf-8',
	        errors='replace',
	        check=True,
	    )
	    return log_output.stdout.splitlines()

	# Function to parse the git log and create CSV files
	def parse_git_log_to_csv():
	    """Parse the git log and write two CSV summaries to the working directory.

	    Files written:

	    * ``git_log_summary.csv`` - one row per commit that has a shortstat
	      line: Author, Email, Date, Total Lines Changed (insertions plus
	      deletions).
	    * ``git_log_grouped_by_author.csv`` - one row per (author, email)
	      pair, ordered by commit count (highest first), with an empty
	      Organization column and the pair's summed line changes.
	    """
	    rows, commit_count_by_author, changes_by_author = _parse_git_log_lines(get_git_log())

	    # UTF-8 is requested explicitly so non-ASCII author names do not
	    # depend on the platform's default encoding.
	    with open('git_log_summary.csv', 'w', newline='', encoding='utf-8') as file:
	        writer = csv.writer(file)
	        writer.writerow(["Author", "Email", "Date", "Total Lines Changed"])
	        writer.writerows(rows)

	    print("CSV file 'git_log_summary.csv' has been created.")

	    # The Organization column is left blank on purpose; authors that never
	    # produced a shortstat line get a total of 0 from the defaultdict.
	    grouped_data = [
	        [author, email, commit_count, "", changes_by_author[(author, email)]]
	        for (author, email), commit_count in sorted(
	            commit_count_by_author.items(), key=lambda item: item[1], reverse=True
	        )
	    ]

	    with open('git_log_grouped_by_author.csv', 'w', newline='', encoding='utf-8') as file:
	        writer = csv.writer(file)
	        writer.writerow(["Author", "Email", "Number of Commits", "Organization", "Total Lines Changed"])
	        writer.writerows(grouped_data)

	    print("CSV file 'git_log_grouped_by_author.csv' has been created.")


	def _parse_git_log_lines(lines):
	    """Turn raw git log lines into per-commit rows and per-author totals.

	    Args:
	        lines: iterable of strings; header lines have the form
	            ``author,email,YYYY-MM-DD`` and may be followed by a shortstat
	            line such as ``3 files changed, 10 insertions(+), 2 deletions(-)``.

	    Returns:
	        tuple: ``(rows, commit_counts, change_totals)`` where ``rows`` is
	        a list of ``[author, email, date, total_changes]`` lists, and the
	        other two are defaultdicts keyed by ``(author, email)`` holding
	        the number of header lines seen and the summed line changes.
	    """
	    # Anchor on the trailing date and let the author group be greedy, so
	    # an author name containing commas stays in one piece. The email is
	    # not required to look like "x@y.tld", so "user@localhost" still counts.
	    header_pattern = re.compile(r"(.*),([^,]*),(\d{4}-\d{2}-\d{2})")
	    # git uses singular wording for a count of one and omits a side that
	    # has no changes, so insertions and deletions are matched independently.
	    stat_pattern = re.compile(r"\d+ files? changed")
	    insertions_pattern = re.compile(r"(\d+) insertions?\(\+\)")
	    deletions_pattern = re.compile(r"(\d+) deletions?\(-\)")

	    rows = []
	    commit_counts = defaultdict(int)
	    change_totals = defaultdict(int)
	    author = email = date = ""

	    for line in lines:
	        header = header_pattern.fullmatch(line.strip())
	        if header:
	            author, email, date = (part.strip() for part in header.groups())
	            commit_counts[(author, email)] += 1
	        elif stat_pattern.search(line):
	            added = insertions_pattern.search(line)
	            removed = deletions_pattern.search(line)
	            insertions = int(added.group(1)) if added else 0
	            deletions = int(removed.group(1)) if removed else 0
	            total_changes = insertions + deletions
	            # The stat line belongs to the most recently seen header.
	            rows.append([author, email, date, total_changes])
	            change_totals[(author, email)] += total_changes

	    return rows, commit_counts, change_totals

	# Script entry point: this call runs as soon as the file is executed,
	# fetching the git log and writing both CSV files via relative paths.
	parse_git_log_to_csv()