mishudark · June 1, 2025 09:32
diff --git a/markov_git.py b/markov_git.py
 import numpy as np
 from datetime import date, timedelta
 from collections import defaultdict

 # --- Configuration ---
 # Lines of code (added + deleted) changed in a single day above which
 # get_state() classifies that day as "High Churn".
 # Adjust this threshold based on what you consider a "large" change for your project.
 LOC_THRESHOLD = 100
 # Display names of the states, indexed by the integer get_state() returns:
 # 0 = "Stable", 1 = "Active Dev", 2 = "High Churn".
 STATE_NAMES = ["Stable", "Active Dev", "High Churn"]

 def parse_git_log(filepath):
    """Parse a ``git log --numstat`` dump into per-day activity totals.

    Two kinds of lines are recognised; everything else is ignored:

    * a bare ``YYYY-MM-DD`` line, which counts as one commit on that day and
      becomes the "current" day;
    * a tab-separated numstat line ``<added> <deleted> <path>``, whose two
      counts are added to the current day's ``loc_changed``.

    Numstat lines seen before any date line are dropped, and numstat entries
    whose counts are not integers (they do not start with a digit) are
    skipped.

    Args:
        filepath: Path of the text file holding the git log output.

    Returns:
        A ``defaultdict`` mapping ISO date strings to
        ``{'commits': int, 'loc_changed': int}``, or ``None`` (after printing
        an error message) when the file does not exist.
    """
    daily_stats = defaultdict(lambda: {'commits': 0, 'loc_changed': 0})
    current_date_str = None
    try:
        # Decode explicitly and leniently: commit messages and paths may hold
        # bytes that are invalid in the platform default encoding, and only
        # the ASCII date / numstat fields matter for the statistics.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue

                # Candidate date line (YYYY-MM-DD format).
                if len(line) == 10 and line[4] == '-' and line[7] == '-':
                    try:
                        current_date_str = date.fromisoformat(line).isoformat()
                    except ValueError:
                        # Date-shaped text, likely part of a commit message.
                        continue
                    daily_stats[current_date_str]['commits'] += 1
                    continue

                # Candidate numstat line (e.g. "10<TAB>5<TAB>file.js").
                if current_date_str is None or not line[0].isdigit():
                    continue
                parts = line.split('\t')
                if len(parts) != 3:
                    continue
                try:
                    added, deleted = int(parts[0]), int(parts[1])
                except ValueError:
                    # Not a valid numstat line.
                    continue
                daily_stats[current_date_str]['loc_changed'] += added + deleted
    except FileNotFoundError:
        print(f"Error: File not found at '{filepath}'. Please generate it first.")
        return None
    return daily_stats

 def get_state(stats, loc_threshold=None):
    """Classify one day's activity as a state index.

    Args:
        stats: Mapping with ``'commits'`` and ``'loc_changed'`` entries for
            the day, or a falsy value (``None`` / empty) for a day without
            recorded activity.
        loc_threshold: Number of changed lines above which the day counts
            as high churn. When omitted, the module-level ``LOC_THRESHOLD``
            is used, so existing one-argument calls behave as before.

    Returns:
        ``0`` (Stable) when there are no commits, ``2`` (High Churn) when
        ``loc_changed`` is strictly greater than the threshold, and ``1``
        (Active Development) otherwise.
    """
    if not stats or stats['commits'] == 0:
        return 0  # Stable
    if loc_threshold is None:
        # Resolved at call time so changes to the module constant are honoured.
        loc_threshold = LOC_THRESHOLD
    if stats['loc_changed'] > loc_threshold:
        return 2  # High Churn
    return 1  # Active Development

 def analyze_history(daily_stats):
    """Build the day-to-day transition matrix and its long-run distribution.

    Every calendar day between the earliest and latest key of
    ``daily_stats`` is classified with ``get_state`` (days missing from the
    mapping are classified via ``get_state(None)``). Consecutive days are
    then counted as transitions.

    Args:
        daily_stats: Mapping of ISO date strings to per-day stats, as
            produced by ``parse_git_log``.

    Returns:
        ``(transition_probabilities, steady_state)``. The first is a square
        array whose row ``i`` holds the observed probabilities of moving
        from state ``i`` to each state; rows for states with no observed
        outgoing transition stay all zero. The second is a vector that sums
        to 1. Returns ``(None, None)`` when ``daily_stats`` is empty.
    """
    if not daily_stats:
        return None, None

    # ISO date strings order lexicographically, so min/max bound the timeline.
    start_date = date.fromisoformat(min(daily_stats))
    end_date = date.fromisoformat(max(daily_stats))

    # Create a complete sequence of daily states, one per calendar day.
    span_days = (end_date - start_date).days + 1
    daily_states = [
        get_state(daily_stats.get((start_date + timedelta(days=offset)).isoformat()))
        for offset in range(span_days)
    ]

    # Count transitions between consecutive days.
    num_states = len(STATE_NAMES)
    transition_counts = np.zeros((num_states, num_states))
    for from_state, to_state in zip(daily_states, daily_states[1:]):
        transition_counts[from_state, to_state] += 1

    # Normalize each visited row to probabilities; unvisited rows stay zero
    # to avoid dividing by zero.
    transition_probabilities = np.zeros_like(transition_counts, dtype=float)
    row_sums = transition_counts.sum(axis=1)
    non_zero_rows = row_sums > 0
    transition_probabilities[non_zero_rows] = (
        transition_counts[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
    )

    # Power iteration towards the eigenvector for eigenvalue 1. Zero rows make
    # the matrix sub-stochastic, so probability mass leaks out on every step;
    # renormalizing keeps the vector a proper distribution.
    p_matrix = transition_probabilities.T
    state_vector = np.full(num_states, 1.0 / num_states)
    for _ in range(100):  # upper bound; usually converges much earlier
        next_vector = np.dot(p_matrix, state_vector)
        total = next_vector.sum()
        if total <= 0:
            # All mass reached states without outgoing transitions (e.g. a
            # one-day history): fall back to the observed state frequencies.
            state_vector = (
                np.bincount(daily_states, minlength=num_states) / len(daily_states)
            )
            break
        next_vector = next_vector / total
        converged = np.allclose(next_vector, state_vector, rtol=0.0, atol=1e-12)
        state_vector = next_vector
        if converged:
            break

    return transition_probabilities, state_vector

 def print_results_table(ag_grid_results, tanstack_results):
    """Print the comparison of both analyses as a Markdown table.

    Each argument is a ``(transition_probabilities, steady_state)`` pair.
    When an element of a pair is ``None`` the corresponding cells show
    ``N/A``. The output is a heading, a note with the churn threshold, one
    row per state-to-state transition, and one long-term probability row
    per state, with the "High Churn" values in bold.
    """

    def as_percent(values, index):
        # Missing analysis results are rendered as a placeholder.
        return "N/A" if values is None else f"{values[index]:.2%}"

    ag_matrix, ag_steady = ag_grid_results
    ts_matrix, ts_steady = tanstack_results

    divider = "|-----------------------------|--------------|------------------|------------------|"

    print("## Component Stability Analysis: ag-grid vs. TanStack Table\n")
    print(f"Based on a 'High Churn' threshold of **{LOC_THRESHOLD} lines of code** changed per day.\n")
    print("| Metric                      | State        | ag-grid          | TanStack Table   |")
    print(divider)

    # Transition rows, grouped by source state with a divider between groups.
    for row, source in enumerate(STATE_NAMES):
        if row:
            print(divider)
        for col, target in enumerate(STATE_NAMES):
            label = f"Prob( {source} → {target} )"
            ag_cell = as_percent(ag_matrix, (row, col))
            ts_cell = as_percent(ts_matrix, (row, col))
            print(f"| {label:<27} | → {target:<10} | {ag_cell:<16} | {ts_cell:<16} |")

    # Long-term (steady-state) rows.
    print(divider)
    for idx, state_name in enumerate(STATE_NAMES):
        label = "Long-Term Probability"
        ag_cell = as_percent(ag_steady, idx)
        ts_cell = as_percent(ts_steady, idx)

        # Emphasise the risky state.
        if state_name == "High Churn":
            ag_cell = f"**{ag_cell}**"
            ts_cell = f"**{ts_cell}**"

        print(f"| {label:<27} | {state_name:<12} | {ag_cell:<16} | {ts_cell:<16} |")

 if __name__ == "__main__":
    # Parse both logs up front so a missing file is reported for each one.
    parsed_logs = [
        parse_git_log(log_path)
        for log_path in ('ag_grid_history.txt', 'tanstack_history.txt')
    ]

    # Compare only when both histories were read and contain data.
    if all(parsed_logs):
        ag_grid_results, tanstack_results = map(analyze_history, parsed_logs)
        print_results_table(ag_grid_results, tanstack_results)
	import numpy as np
	from datetime import date, timedelta
	from collections import defaultdict

	# --- Configuration ---
	# Lines of code (added + deleted) changed in a single day above which
	# get_state() classifies that day as "High Churn".
	# Adjust this threshold based on what you consider a "large" change for your project.
	LOC_THRESHOLD = 100
	# Display names of the states, indexed by the integer get_state() returns:
	# 0 = "Stable", 1 = "Active Dev", 2 = "High Churn".
	STATE_NAMES = ["Stable", "Active Dev", "High Churn"]

	def parse_git_log(filepath):
	    """Parse a ``git log --numstat`` dump into per-day activity totals.

	    Two kinds of lines are recognised; everything else is ignored:

	    * a bare ``YYYY-MM-DD`` line, which counts as one commit on that day
	      and becomes the "current" day;
	    * a tab-separated numstat line ``<added> <deleted> <path>``, whose two
	      counts are added to the current day's ``loc_changed``.

	    Numstat lines seen before any date line are dropped, and numstat
	    entries whose counts are not integers (they do not start with a digit)
	    are skipped.

	    Args:
	        filepath: Path of the text file holding the git log output.

	    Returns:
	        A ``defaultdict`` mapping ISO date strings to
	        ``{'commits': int, 'loc_changed': int}``, or ``None`` (after
	        printing an error message) when the file does not exist.
	    """
	    daily_stats = defaultdict(lambda: {'commits': 0, 'loc_changed': 0})
	    current_date_str = None
	    try:
	        # Decode explicitly and leniently: commit messages and paths may
	        # hold bytes that are invalid in the platform default encoding, and
	        # only the ASCII date / numstat fields matter for the statistics.
	        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
	            for raw_line in f:
	                line = raw_line.strip()
	                if not line:
	                    continue

	                # Candidate date line (YYYY-MM-DD format).
	                if len(line) == 10 and line[4] == '-' and line[7] == '-':
	                    try:
	                        current_date_str = date.fromisoformat(line).isoformat()
	                    except ValueError:
	                        # Date-shaped text, likely part of a commit message.
	                        continue
	                    daily_stats[current_date_str]['commits'] += 1
	                    continue

	                # Candidate numstat line (e.g. "10<TAB>5<TAB>file.js").
	                if current_date_str is None or not line[0].isdigit():
	                    continue
	                parts = line.split('\t')
	                if len(parts) != 3:
	                    continue
	                try:
	                    added, deleted = int(parts[0]), int(parts[1])
	                except ValueError:
	                    # Not a valid numstat line.
	                    continue
	                daily_stats[current_date_str]['loc_changed'] += added + deleted
	    except FileNotFoundError:
	        print(f"Error: File not found at '{filepath}'. Please generate it first.")
	        return None
	    return daily_stats

	def get_state(stats, loc_threshold=None):
	    """Classify one day's activity as a state index.

	    Args:
	        stats: Mapping with ``'commits'`` and ``'loc_changed'`` entries for
	            the day, or a falsy value (``None`` / empty) for a day without
	            recorded activity.
	        loc_threshold: Number of changed lines above which the day counts
	            as high churn. When omitted, the module-level
	            ``LOC_THRESHOLD`` is used, so one-argument calls still work.

	    Returns:
	        ``0`` (Stable) when there are no commits, ``2`` (High Churn) when
	        ``loc_changed`` is strictly greater than the threshold, and ``1``
	        (Active Development) otherwise.
	    """
	    if not stats or stats['commits'] == 0:
	        return 0  # Stable
	    if loc_threshold is None:
	        # Resolved at call time so changes to the module constant are honoured.
	        loc_threshold = LOC_THRESHOLD
	    if stats['loc_changed'] > loc_threshold:
	        return 2  # High Churn
	    return 1  # Active Development

	def analyze_history(daily_stats):
	    """Build the day-to-day transition matrix and its long-run distribution.

	    Every calendar day between the earliest and latest key of
	    ``daily_stats`` is classified with ``get_state`` (days missing from the
	    mapping are classified via ``get_state(None)``). Consecutive days are
	    then counted as transitions.

	    Args:
	        daily_stats: Mapping of ISO date strings to per-day stats, as
	            produced by ``parse_git_log``.

	    Returns:
	        ``(transition_probabilities, steady_state)``. The first is a square
	        array whose row ``i`` holds the observed probabilities of moving
	        from state ``i`` to each state; rows for states with no observed
	        outgoing transition stay all zero. The second is a vector that
	        sums to 1. Returns ``(None, None)`` when ``daily_stats`` is empty.
	    """
	    if not daily_stats:
	        return None, None

	    # ISO date strings order lexicographically, so min/max bound the timeline.
	    start_date = date.fromisoformat(min(daily_stats))
	    end_date = date.fromisoformat(max(daily_stats))

	    # Create a complete sequence of daily states, one per calendar day.
	    span_days = (end_date - start_date).days + 1
	    daily_states = [
	        get_state(daily_stats.get((start_date + timedelta(days=offset)).isoformat()))
	        for offset in range(span_days)
	    ]

	    # Count transitions between consecutive days.
	    num_states = len(STATE_NAMES)
	    transition_counts = np.zeros((num_states, num_states))
	    for from_state, to_state in zip(daily_states, daily_states[1:]):
	        transition_counts[from_state, to_state] += 1

	    # Normalize each visited row to probabilities; unvisited rows stay zero
	    # to avoid dividing by zero.
	    transition_probabilities = np.zeros_like(transition_counts, dtype=float)
	    row_sums = transition_counts.sum(axis=1)
	    non_zero_rows = row_sums > 0
	    transition_probabilities[non_zero_rows] = (
	        transition_counts[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
	    )

	    # Power iteration towards the eigenvector for eigenvalue 1. Zero rows
	    # make the matrix sub-stochastic, so probability mass leaks out on every
	    # step; renormalizing keeps the vector a proper distribution.
	    p_matrix = transition_probabilities.T
	    state_vector = np.full(num_states, 1.0 / num_states)
	    for _ in range(100):  # upper bound; usually converges much earlier
	        next_vector = np.dot(p_matrix, state_vector)
	        total = next_vector.sum()
	        if total <= 0:
	            # All mass reached states without outgoing transitions (e.g. a
	            # one-day history): fall back to the observed state frequencies.
	            state_vector = (
	                np.bincount(daily_states, minlength=num_states) / len(daily_states)
	            )
	            break
	        next_vector = next_vector / total
	        converged = np.allclose(next_vector, state_vector, rtol=0.0, atol=1e-12)
	        state_vector = next_vector
	        if converged:
	            break

	    return transition_probabilities, state_vector

	def print_results_table(ag_grid_results, tanstack_results):
	    """Print the comparison of both analyses as a Markdown table.

	    Each argument is a ``(transition_probabilities, steady_state)`` pair.
	    When an element of a pair is ``None`` the corresponding cells show
	    ``N/A``. The output is a heading, a note with the churn threshold, one
	    row per state-to-state transition, and one long-term probability row
	    per state, with the "High Churn" values in bold.
	    """

	    ag_grid_p, ag_grid_ss = ag_grid_results
	    tanstack_p, tanstack_ss = tanstack_results

	    print("## Component Stability Analysis: ag-grid vs. TanStack Table\n")
	    print(f"Based on a 'High Churn' threshold of **{LOC_THRESHOLD} lines of code** changed per day.\n")

	    # Plain pipes: a backslash-escaped pipe inside a Python string literal
	    # would be printed verbatim and break the Markdown table.
	    header = "| Metric                      | State        | ag-grid          | TanStack Table   |"
	    separator = "|-----------------------------|--------------|------------------|------------------|"

	    print(header)
	    print(separator)

	    # Transition Probabilities
	    for i, from_state in enumerate(STATE_NAMES):
	        for j, to_state in enumerate(STATE_NAMES):
	            metric_name = f"Prob( {from_state} → {to_state} )"
	            ag_val = f"{ag_grid_p[i, j]:.2%}" if ag_grid_p is not None else "N/A"
	            ts_val = f"{tanstack_p[i, j]:.2%}" if tanstack_p is not None else "N/A"
	            print(f"| {metric_name:<27} | → {to_state:<10} | {ag_val:<16} | {ts_val:<16} |")
	        if i < len(STATE_NAMES) - 1:
	            print(separator)

	    # Steady State Probabilities
	    print(separator)
	    for i, state_name in enumerate(STATE_NAMES):
	        metric_name = "Long-Term Probability"
	        ag_val = f"{ag_grid_ss[i]:.2%}" if ag_grid_ss is not None else "N/A"
	        ts_val = f"{tanstack_ss[i]:.2%}" if tanstack_ss is not None else "N/A"

	        # Highlight the risky state with Markdown bold.
	        if state_name == "High Churn":
	            ag_val = f"**{ag_val}**"
	            ts_val = f"**{ts_val}**"

	        print(f"| {metric_name:<27} | {state_name:<12} | {ag_val:<16} | {ts_val:<16} |")

	if __name__ == "__main__":
	    # Parse both logs up front so a missing file is reported for each one.
	    ag_grid_stats = parse_git_log('ag_grid_history.txt')
	    tanstack_stats = parse_git_log('tanstack_history.txt')

	    # Compare only when both histories were read and contain data.
	    if ag_grid_stats and tanstack_stats:
	        ag_grid_results = analyze_history(ag_grid_stats)
	        tanstack_results = analyze_history(tanstack_stats)

	        print_results_table(ag_grid_results, tanstack_results)
No results found