Created
March 27, 2026 17:01
-
-
Save robintux/65b4f6f4de0222427e80c44b009b339c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Empirical demonstration of the bias-variance decomposition.

A Monte Carlo simulation over many independently drawn training sets is
used to estimate each component (bias², variance, noise) of the expected
prediction error for models of varying complexity.
"""
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

# Seed the global NumPy RNG so any unseeded draws are reproducible.
np.random.seed(42)
| # ============================================================================ | |
| # 1. FUNCIÓN VERDADERA Y GENERACIÓN DE DATOS | |
| # ============================================================================ | |
def true_function(x):
    """Noise-free target signal evaluated at ``x`` (scalar or ndarray)."""
    low_frequency_term = np.sin(2 * x)
    high_frequency_term = 0.5 * np.cos(5 * x)
    return low_frequency_term + high_frequency_term
def generate_dataset(n_samples, noise_std, random_state=None):
    """Draw a noisy training set from the true function.

    Args:
        n_samples: number of points to draw, uniform on [-2, 2].
        noise_std: standard deviation of the additive Gaussian noise.
        random_state: optional int seed.  When given, a *local* legacy
            ``RandomState`` is used, which yields the exact same values as
            the old ``np.random.seed(seed)`` approach but without mutating
            the global RNG state (the previous implementation clobbered the
            global seed on every call).

    Returns:
        tuple: (X, y) with X of shape (n_samples, 1) and y of shape (n_samples,).
    """
    # np.random.RandomState(seed) produces the same stream as seeding the
    # global legacy RNG, so seeded outputs are bit-for-bit unchanged.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    x = rng.uniform(-2, 2, n_samples)
    y = true_function(x) + noise_std * rng.randn(n_samples)
    return x.reshape(-1, 1), y
# Monte Carlo simulation parameters.
N_DATASETS = 100   # number of independent training sets
N_SAMPLES = 50     # size of each training set
NOISE_STD = 0.3    # standard deviation of the additive noise

# Fixed evaluation grid (slightly wider than the training range) and the
# noise-free target evaluated on it.
X_TEST = np.linspace(-2.5, 2.5, 500).reshape(-1, 1)
Y_TRUE = true_function(X_TEST.flatten())
| # ============================================================================ | |
| # 2. MODELOS CON DIFERENTE COMPLEJIDAD | |
| # ============================================================================ | |
def create_models():
    """Build the model zoo spanning several complexity levels.

    Returns:
        dict mapping a display name to an unfitted scikit-learn estimator;
        insertion order runs roughly from least to most flexible.
    """
    def _poly(degree):
        # Polynomial regression = feature expansion followed by OLS.
        return make_pipeline(PolynomialFeatures(degree), LinearRegression())

    models = {}
    models['Lineal'] = LinearRegression()
    models['Polinomio grado 2'] = _poly(2)
    models['Polinomio grado 5'] = _poly(5)
    models['Polinomio grado 15'] = _poly(15)
    models['Árbol (prof=2)'] = DecisionTreeRegressor(max_depth=2, random_state=42)
    models['Árbol (prof=10)'] = DecisionTreeRegressor(max_depth=10, random_state=42)
    models['Random Forest'] = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
    models['Ridge (α=100)'] = Ridge(alpha=100)
    return models
| # ============================================================================ | |
| # 3. ESTIMACIÓN DE SESGO, VARIANZA Y ERROR POR MONTE CARLO | |
| # ============================================================================ | |
def estimate_bias_variance(model, x_train_list, y_train_list, x_test, y_true, noise_std):
    """Estimate bias², variance, noise and total error via Monte Carlo.

    Args:
        model: scikit-learn estimator (cloned and refitted per training set).
        x_train_list: list of training design matrices.
        y_train_list: list of matching training target vectors.
        x_test: evaluation grid, shape (n_test, 1).
        y_true: noise-free target values on ``x_test``.
        noise_std: known standard deviation of the additive Gaussian noise.

    Returns:
        dict with averaged 'bias_squared', 'variance', 'noise', 'mse',
        'decomposition' (= bias² + variance + noise), their mean absolute
        'reconstruction_error', the mean predictor 'f_bar' and the raw
        'predictions' matrix of shape (n_datasets, n_test).
    """
    from sklearn.base import clone

    n_datasets = len(x_train_list)
    n_test = len(x_test)

    # Predictions of each independently fitted clone on the grid.
    predictions = np.zeros((n_datasets, n_test))
    for i in range(n_datasets):
        model_i = clone(model)
        model_i.fit(x_train_list[i], y_train_list[i])
        predictions[i, :] = model_i.predict(x_test)

    # E[f̂(x)]: average predictor over training sets.
    f_bar = np.mean(predictions, axis=0)

    # Bias²(x) = (E[f̂(x)] - f*(x))²
    bias_squared = (f_bar - y_true) ** 2

    # Var(x) = E[(f̂(x) - E[f̂(x)])²]
    variance = np.mean((predictions - f_bar) ** 2, axis=0)

    # Irreducible noise σ² (known by construction of the data).
    noise = np.full(n_test, noise_std ** 2)

    # BUG FIX: the total error must be measured against NOISY test
    # observations y = f*(x) + ε.  Comparing predictions to the noise-free
    # f*(x) yields exactly bias² + variance, so the claimed identity
    # MSE ≈ bias² + variance + noise could never be verified (the old
    # reconstruction error was always ≈ σ²).  A fixed local RandomState
    # keeps the estimate deterministic without touching the global RNG.
    rng = np.random.RandomState(12345)
    noisy_targets = y_true + noise_std * rng.randn(n_datasets, n_test)
    mse = np.mean((predictions - noisy_targets) ** 2, axis=0)

    # Empirical check of the decomposition identity.
    decomposition = bias_squared + variance + noise
    reconstruction_error = np.mean(np.abs(mse - decomposition))

    return {
        'bias_squared': np.mean(bias_squared),
        'variance': np.mean(variance),
        'noise': np.mean(noise),
        'mse': np.mean(mse),
        'decomposition': np.mean(decomposition),
        'reconstruction_error': reconstruction_error,
        'f_bar': f_bar,
        'predictions': predictions
    }
# ============================================================================
# 4. GENERATE MULTIPLE TRAINING SETS
# ============================================================================
print("Generando conjuntos de datos para simulación Monte Carlo...")

# One independently seeded training set per Monte Carlo replicate.
_datasets = [generate_dataset(N_SAMPLES, NOISE_STD, random_state=seed)
             for seed in range(N_DATASETS)]
x_train_list = [features for features, _ in _datasets]
y_train_list = [targets for _, targets in _datasets]

print(f" {N_DATASETS} conjuntos generados con {N_SAMPLES} muestras cada uno")
# ============================================================================
# 5. RUN THE BIAS-VARIANCE ANALYSIS FOR EACH MODEL
# ============================================================================
print("\nEjecutando análisis sesgo-varianza para cada modelo...")
models = create_models()
results = {}
for model_name, estimator in models.items():
    print(f" * Evaluando: {model_name}")
    results[model_name] = estimate_bias_variance(
        estimator, x_train_list, y_train_list, X_TEST, Y_TRUE, NOISE_STD
    )
# ============================================================================
# 6. VISUALIZATION OF RESULTS
# ============================================================================
print("\nGenerando visualizaciones...")

# Figure 1: stacked decomposition (bias² + variance + noise) per model,
# with the total MSE overlaid as a line for visual comparison.
fig, ax = plt.subplots(figsize=(14, 7))

model_names = list(results.keys())
x_pos = np.arange(len(model_names))

bias_values = np.array([results[m]['bias_squared'] for m in model_names])
var_values = np.array([results[m]['variance'] for m in model_names])
noise_values = np.array([results[m]['noise'] for m in model_names])
mse_values = np.array([results[m]['mse'] for m in model_names])

# Stacked bars: each component sits on top of the previous ones.
ax.bar(x_pos, bias_values, label='Sesgo²', color='#2E86AB', alpha=0.8)
ax.bar(x_pos, var_values, bottom=bias_values, label='Varianza', color='#A23B72', alpha=0.8)
ax.bar(x_pos, noise_values, bottom=bias_values + var_values,
       label='Ruido', color='#F18F01', alpha=0.8)

# Total error drawn over the stack.
ax.plot(x_pos, mse_values, 'ko-', linewidth=2, markersize=8, label='MSE Total')

ax.set_xlabel('Modelo', fontsize=12)
ax.set_ylabel('Error', fontsize=12)
ax.set_title('Descomposición Sesgo-Varianza por Modelo', fontsize=14, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation=45, ha='right', fontsize=9)
ax.legend(loc='upper right', fontsize=10)
ax.grid(alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
# Figure 2: average prediction vs. the true function for four selected
# models, with a ±1 std band showing the spread across training sets.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Ajuste de Modelos: Promedio sobre 100 Conjuntos de Entrenamiento',
             fontsize=14, fontweight='bold')

selected_models = ['Lineal', 'Polinomio grado 5', 'Polinomio grado 15', 'Árbol (prof=10)']
colors = ['#2E86AB', '#A23B72', '#F18F01', '#06A77D']

for ax, name, color in zip(axes.flatten(), selected_models, colors):
    res = results[name]
    mean_pred = res['f_bar']
    spread = res['predictions'].std(axis=0)
    grid = X_TEST.flatten()

    # One sample training set, for context.
    ax.scatter(x_train_list[0], y_train_list[0], s=15, alpha=0.3, color='gray', label='Datos')
    # Noise-free target.
    ax.plot(X_TEST, Y_TRUE, 'k-', linewidth=2, label='Verdadera f*(x)')
    # Mean prediction across the Monte Carlo fits.
    ax.plot(X_TEST, mean_pred, '--', color=color, linewidth=2.5,
            label=f'Promedio {name}')
    # ±1 standard deviation band (pointwise spread of the predictor).
    ax.fill_between(grid,
                    mean_pred - spread,
                    mean_pred + spread,
                    alpha=0.2, color=color, label='±1 Std (Varianza)')

    ax.set_xlabel('x', fontsize=11)
    ax.set_ylabel('y', fontsize=11)
    ax.set_title(f'{name}\nSesgo²={res["bias_squared"]:.4f}, Var={res["variance"]:.4f}',
                 fontsize=11)
    ax.legend(fontsize=8, loc='lower right')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()
# ============================================================================
# 7. QUANTITATIVE SUMMARY
# ============================================================================
separator = "=" * 80
print("\n" + separator)
print("RESUMEN CUANTITATIVO DE DESCOMPOSICIÓN SESGO-VARIANZA")
print(separator)
print(f"{'Modelo':<25} {'Sesgo²':<12} {'Varianza':<12} {'Ruido':<10} {'MSE Total':<12}")
print("-" * 80)
for summary_name in model_names:
    stats = results[summary_name]
    print(f"{summary_name:<25} {stats['bias_squared']:<12.6f} {stats['variance']:<12.6f} "
          f"{stats['noise']:<10.6f} {stats['mse']:<12.6f}")

# Fixed interpretation notes, printed verbatim.
print("\n\tOBSERVACIONES CLAVE:")
for observation in (
    " 1. Modelos simples (Lineal): Sesgo alto, Varianza baja",
    " 2. Modelos complejos (Polinomio 15, Árbol prof=10): Sesgo bajo, Varianza alta",
    " 3. Modelos balanceados (Polinomio 5, Random Forest): Mejor MSE total",
    " 4. El ruido es constante para todos los modelos (propiedad de los datos)",
    " 5. La descomposición se verifica: MSE ≈ Sesgo² + Varianza + Ruido",
):
    print(observation)

print("\n\tIMPLICACIONES PRÁCTICAS:")
for implication in (
    " * Para reducir sesgo: aumentar complejidad, añadir features, reducir regularización",
    " * Para reducir varianza: más datos, regularización, ensembling, simplificar modelo",
    " * El objetivo es minimizar MSE total, no sesgo o varianza individualmente",
):
    print(implication)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment