Skip to content

Instantly share code, notes, and snippets.

@mishudark
Created June 1, 2025 09:32
Show Gist options
  • Save mishudark/94c55b7974f870a32057ff0f781810c0 to your computer and use it in GitHub Desktop.
Save mishudark/94c55b7974f870a32057ff0f781810c0 to your computer and use it in GitHub Desktop.
import numpy as np
from datetime import date, timedelta
from collections import defaultdict
# --- Configuration ---
# A day whose total of added + deleted lines is greater than this value is
# classified as "High Churn". Tune it to whatever counts as a "large" change
# in your project.
LOC_THRESHOLD = 100

# Display labels for the states; the list index is the numeric state id used
# throughout the analysis (0 = Stable, 1 = Active Dev, 2 = High Churn).
STATE_NAMES = [
    "Stable",
    "Active Dev",
    "High Churn",
]
def parse_git_log(filepath):
    """Parse a git log dump (generated with --numstat) into per-day statistics.

    The file is read line by line:

    * A line that is exactly a valid ``YYYY-MM-DD`` date starts a new commit
      on that day and increments the day's commit counter.
    * A line starting with a digit and made of three tab-separated fields
      (``added<TAB>deleted<TAB>path``) adds ``added + deleted`` to the
      changed-line total of the most recently seen date.
    * Every other line (blank lines, commit message text, lines that do not
      start with a digit) is ignored. Numstat lines that appear before any
      date line are ignored as well.

    Args:
        filepath: Path of the text file containing the git log output.

    Returns:
        A ``defaultdict`` mapping ISO date strings to
        ``{'commits': int, 'loc_changed': int}``, or ``None`` if the file
        does not exist (an error message is printed in that case).
    """
    daily_stats = defaultdict(lambda: {'commits': 0, 'loc_changed': 0})
    try:
        # Decode explicitly as UTF-8 and replace undecodable bytes: commit
        # messages and paths may contain bytes in other encodings, and one
        # stray byte must not abort the whole analysis with a
        # UnicodeDecodeError. Only dates and digits matter to the parser.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            current_date_str = None
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue

                # Candidate date line (YYYY-MM-DD shape).
                if len(line) == 10 and line[4] == '-' and line[7] == '-':
                    try:
                        current_date_str = date.fromisoformat(line).isoformat()
                    except ValueError:
                        # Date-shaped but not a real date: treat it as
                        # commit message text and move on.
                        continue
                    daily_stats[current_date_str]['commits'] += 1
                    continue

                # Candidate numstat line (e.g. "10\t5\tfile.js").
                if not line[0].isdigit():
                    continue
                parts = line.split('\t')
                if len(parts) != 3:
                    continue
                try:
                    added = int(parts[0])
                    deleted = int(parts[1])
                except ValueError:
                    # Not a valid numstat line after all.
                    continue
                if current_date_str:
                    daily_stats[current_date_str]['loc_changed'] += added + deleted
    except FileNotFoundError:
        print(f"Error: File not found at '{filepath}'. Please generate it first.")
        return None
    return daily_stats
def get_state(stats):
    """Map one day's statistics to a numeric state id.

    Returns 0 (Stable) when ``stats`` is missing/empty or records no commits,
    2 (High Churn) when the day's changed-line total is greater than
    LOC_THRESHOLD, and 1 (Active Development) otherwise.
    """
    had_commits = bool(stats) and stats['commits'] != 0
    if not had_commits:
        return 0  # Stable
    # 2 = High Churn, 1 = Active Development
    return 2 if stats['loc_changed'] > LOC_THRESHOLD else 1
def analyze_history(daily_stats):
    """Build the day-to-day state transition matrix and its long-run distribution.

    Every calendar day between the earliest and latest date in
    ``daily_stats`` (inclusive) is classified with ``get_state``; days without
    an entry are classified from ``None``. Consecutive-day pairs are counted
    into a transition matrix whose rows are then normalised to probabilities.
    Rows for states with no observed outgoing transition stay all-zero.

    Args:
        daily_stats: Mapping of ISO date strings to per-day statistics, as
            produced by ``parse_git_log``.

    Returns:
        ``(transition_probabilities, steady_state)`` where
        ``transition_probabilities[i, j]`` is the observed probability of
        moving from state ``i`` to state ``j`` and ``steady_state`` is a
        probability vector (summing to 1) obtained by power iteration.
        Returns ``(None, None)`` when ``daily_stats`` is empty or ``None``.
    """
    if not daily_stats:
        return None, None

    # Sort dates to find the span of the continuous timeline.
    sorted_dates = sorted(daily_stats.keys())
    start_date = date.fromisoformat(sorted_dates[0])
    end_date = date.fromisoformat(sorted_dates[-1])

    # One state per calendar day, including days with no recorded commits.
    total_days = (end_date - start_date).days + 1
    daily_states = [
        get_state(daily_stats.get((start_date + timedelta(offset)).isoformat()))
        for offset in range(total_days)
    ]

    # Count transitions between consecutive days.
    num_states = len(STATE_NAMES)
    transition_counts = np.zeros((num_states, num_states))
    for from_state, to_state in zip(daily_states, daily_states[1:]):
        transition_counts[from_state, to_state] += 1

    # Normalise each row to probabilities, skipping rows with no outgoing
    # transitions to avoid division by zero.
    transition_probabilities = np.zeros_like(transition_counts, dtype=float)
    row_sums = transition_counts.sum(axis=1)
    non_zero_rows = row_sums > 0
    transition_probabilities[non_zero_rows] = (
        transition_counts[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
    )

    # Steady state via power iteration (eigenvector for eigenvalue 1).
    # All-zero rows make the matrix sub-stochastic, so every multiplication
    # can lose probability mass; renormalising keeps the vector a proper
    # distribution instead of letting it shrink towards zero.
    p_matrix = transition_probabilities.T
    state_vector = np.full(num_states, 1.0 / num_states)
    for _ in range(100):  # 100 iterations is usually more than enough for convergence
        next_vector = np.dot(p_matrix, state_vector)
        total_mass = next_vector.sum()
        if total_mass <= 0:
            # No observed transition carries any mass forward (e.g. a
            # single-day history): keep the last valid distribution.
            break
        state_vector = next_vector / total_mass
    return transition_probabilities, state_vector
def print_results_table(ag_grid_results, tanstack_results):
    """Print a Markdown table comparing the two analyses on stdout.

    Each argument is a ``(transition_matrix, steady_state)`` pair; either
    element may be ``None``, in which case "N/A" is printed for its cells.
    The table lists every state-to-state transition probability, grouped by
    source state, followed by the long-term probability of each state with
    the "High Churn" row shown in bold.
    """
    ag_grid_p, ag_grid_ss = ag_grid_results
    tanstack_p, tanstack_ss = tanstack_results

    def _pct(values, index):
        # Percentage with two decimals, or a placeholder when there is no data.
        return "N/A" if values is None else f"{values[index]:.2%}"

    separator = "|-----------------------------|--------------|------------------|------------------|"

    print("## Component Stability Analysis: ag-grid vs. TanStack Table\n")
    print(f"Based on a 'High Churn' threshold of **{LOC_THRESHOLD} lines of code** changed per day.\n")
    print("| Metric | State | ag-grid | TanStack Table |")
    print(separator)

    # Transition probabilities, one group of rows per source state.
    last_group = len(STATE_NAMES) - 1
    for i, from_state in enumerate(STATE_NAMES):
        for j, to_state in enumerate(STATE_NAMES):
            metric_name = f"Prob( {from_state} → {to_state} )"
            ag_val = _pct(ag_grid_p, (i, j))
            ts_val = _pct(tanstack_p, (i, j))
            print(f"| {metric_name:<27} | → {to_state:<10} | {ag_val:<16} | {ts_val:<16} |")
        # Separate the groups; the final group is closed by the separator below.
        if i != last_group:
            print(separator)

    # Steady-state (long-term) probabilities.
    print(separator)
    metric_name = "Long-Term Probability"
    for i, state_name in enumerate(STATE_NAMES):
        ag_val = _pct(ag_grid_ss, i)
        ts_val = _pct(tanstack_ss, i)
        # Emphasise the risky state.
        if state_name == "High Churn":
            ag_val, ts_val = f"**{ag_val}**", f"**{ts_val}**"
        print(f"| {metric_name:<27} | {state_name:<12} | {ag_val:<16} | {ts_val:<16} |")
if __name__ == "__main__":
    # Parse both histories up front so a missing file is reported for each
    # of them before deciding whether the comparison can run.
    ag_grid_stats, tanstack_stats = (
        parse_git_log(log_path)
        for log_path in ('ag_grid_history.txt', 'tanstack_history.txt')
    )
    # Only compare when both logs were found and contain at least one commit day.
    if all((ag_grid_stats, tanstack_stats)):
        print_results_table(
            analyze_history(ag_grid_stats),
            analyze_history(tanstack_stats),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment