import numpy as np
from datetime import date, timedelta
from collections import defaultdict

# --- Configuration ---
# Adjust this threshold based on what you consider a "large" change for your project.
LOC_THRESHOLD = 100
STATE_NAMES = ["Stable", "Active Dev", "High Churn"]
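
# Note on input data (illustrative, the exact git flags are an assumption rather than
# part of this gist): the parser below expects one YYYY-MM-DD line per commit followed
# by tab-separated "added<TAB>deleted<TAB>path" numstat lines, which a command along
# these lines would produce:
#
#   git log --pretty=format:%ad --date=short --numstat > ag_grid_history.txt
#
# e.g. a fragment of such a file (paths are made up):
#
#   2025-05-30
#   12	4	src/table/core.ts
#   3	1	src/table/utils.ts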
def parse_git_log(filepath):
    """Parses the git log file generated with --numstat."""
    daily_stats = defaultdict(lambda: {'commits': 0, 'loc_changed': 0})
    try:
        with open(filepath, 'r') as f:
            current_date_str = None
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Check if the line is a date (YYYY-MM-DD format)
                if len(line) == 10 and line[4] == '-' and line[7] == '-':
                    try:
                        current_date_str = date.fromisoformat(line).isoformat()
                        daily_stats[current_date_str]['commits'] += 1
                    except ValueError:
                        # Not a date, likely part of a commit message
                        pass
                # Check if the line is a numstat line (e.g., "10\t5\tfile.js")
                elif line[0].isdigit():
                    parts = line.split('\t')
                    if len(parts) == 3:
                        try:
                            added = int(parts[0])
                            deleted = int(parts[1])
                            if current_date_str:
                                daily_stats[current_date_str]['loc_changed'] += added + deleted
                        except (ValueError, IndexError):
                            # Not a valid numstat line
                            pass
    except FileNotFoundError:
        print(f"Error: File not found at '{filepath}'. Please generate it first.")
        return None
    return daily_stats

def get_state(stats):
    """Determines the state based on daily stats."""
    if not stats or stats['commits'] == 0:
        return 0  # Stable
    if stats['loc_changed'] > LOC_THRESHOLD:
        return 2  # High Churn
    return 1  # Active Development
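
# Illustrative examples of the mapping above (the stats values are hypothetical):
#   get_state(None)                               -> 0  (no commits that day: Stable)
#   get_state({'commits': 2, 'loc_changed': 40})  -> 1  (Active Dev, under the LOC_THRESHOLD)
#   get_state({'commits': 3, 'loc_changed': 150}) -> 2  (High Churn, over the threshold)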

def analyze_history(daily_stats):
    """Builds the transition matrix and calculates steady state."""
    if not daily_stats:
        return None, None

    # Sort dates to create a continuous timeline
    sorted_dates = sorted(daily_stats.keys())
    start_date = date.fromisoformat(sorted_dates[0])
    end_date = date.fromisoformat(sorted_dates[-1])

    # Create a complete sequence of daily states
    daily_states = []
    for single_date in (start_date + timedelta(n) for n in range((end_date - start_date).days + 1)):
        date_str = single_date.isoformat()
        daily_states.append(get_state(daily_stats.get(date_str)))

    # Count transitions
    num_states = len(STATE_NAMES)
    transition_counts = np.zeros((num_states, num_states))
    for i in range(len(daily_states) - 1):
        from_state = daily_states[i]
        to_state = daily_states[i + 1]
        transition_counts[from_state, to_state] += 1

    # Normalize to get probability matrix
    transition_probabilities = np.zeros_like(transition_counts, dtype=float)
    row_sums = transition_counts.sum(axis=1)
    # Avoid division by zero for states with no recorded outgoing transitions
    non_zero_rows = row_sums > 0
    transition_probabilities[non_zero_rows] = transition_counts[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]

    # Calculate steady-state distribution using power iteration
    # (finding the eigenvector for the eigenvalue 1)
    p_matrix = transition_probabilities.T
    state_vector = np.full(num_states, 1.0 / num_states)
    for _ in range(100):  # 100 iterations is usually more than enough for convergence
        state_vector = np.dot(p_matrix, state_vector)
        # Re-normalize so the vector stays a probability distribution even if some
        # state has no recorded outgoing transitions (its row sums to zero).
        if state_vector.sum() > 0:
            state_vector /= state_vector.sum()
    return transition_probabilities, state_vector
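
# Sanity-check sketch (not part of the original gist, filenames are the ones assumed
# in __main__ below): the steady-state vector pi returned above should satisfy
# pi = pi @ P for the row-stochastic transition matrix P, provided every state has
# at least one recorded outgoing transition:
#
#   P, pi = analyze_history(parse_git_log('ag_grid_history.txt'))
#   assert np.allclose(pi, pi @ P, atol=1e-6)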

def print_results_table(ag_grid_results, tanstack_results):
    """Prints a formatted Markdown table with the analysis results."""
    ag_grid_p, ag_grid_ss = ag_grid_results
    tanstack_p, tanstack_ss = tanstack_results

    print("## Component Stability Analysis: ag-grid vs. TanStack Table\n")
    print(f"Based on a 'High Churn' threshold of **{LOC_THRESHOLD} lines of code** changed per day.\n")

    header = "| Metric | State | ag-grid | TanStack Table |"
    separator = "|-----------------------------|--------------|------------------|------------------|"
    print(header)
    print(separator)

    # Transition probabilities
    for i, from_state in enumerate(STATE_NAMES):
        for j, to_state in enumerate(STATE_NAMES):
            metric_name = f"Prob( {from_state} → {to_state} )"
            ag_val = f"{ag_grid_p[i, j]:.2%}" if ag_grid_p is not None else "N/A"
            ts_val = f"{tanstack_p[i, j]:.2%}" if tanstack_p is not None else "N/A"
            print(f"| {metric_name:<27} | → {to_state:<10} | {ag_val:<16} | {ts_val:<16} |")
        if i < len(STATE_NAMES) - 1:
            print(separator)

    # Steady-state probabilities
    print(separator)
    for i, state_name in enumerate(STATE_NAMES):
        metric_name = "Long-Term Probability"
        ag_val = f"{ag_grid_ss[i]:.2%}" if ag_grid_ss is not None else "N/A"
        ts_val = f"{tanstack_ss[i]:.2%}" if tanstack_ss is not None else "N/A"
        # Highlight the risky state
        if state_name == "High Churn":
            ag_val = f"**{ag_val}**"
            ts_val = f"**{ts_val}**"
        print(f"| {metric_name:<27} | {state_name:<12} | {ag_val:<16} | {ts_val:<16} |")

if __name__ == "__main__":
    ag_grid_stats = parse_git_log('ag_grid_history.txt')
    tanstack_stats = parse_git_log('tanstack_history.txt')

    if ag_grid_stats and tanstack_stats:
        ag_grid_results = analyze_history(ag_grid_stats)
        tanstack_results = analyze_history(tanstack_stats)
        print_results_table(ag_grid_results, tanstack_results)
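
# Usage sketch (assumption: the two history files sit next to this script, and the
# script filename below is hypothetical):
#
#   python component_stability.py > stability_report.md
#
# The script prints a Markdown report comparing the two repositories' daily state
# transition probabilities and long-term state distribution.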