Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save robintux/65b4f6f4de0222427e80c44b009b339c to your computer and use it in GitHub Desktop.

Select an option

Save robintux/65b4f6f4de0222427e80c44b009b339c to your computer and use it in GitHub Desktop.
"""
Demostración empírica de la descomposición sesgo-varianza
Simulación de Monte Carlo para estimar cada componente
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import seaborn as sns
# Configurar reproducibilidad
np.random.seed(42)
# ============================================================================
# 1. FUNCIÓN VERDADERA Y GENERACIÓN DE DATOS
# ============================================================================
def true_function(x):
    """Noiseless target: a nonlinear (sinusoidal) signal f*(x)."""
    # Division by 2 is exact in binary floating point, so this matches
    # 0.5 * np.cos(5 * x) bit-for-bit.
    return np.sin(2.0 * x) + np.cos(5.0 * x) / 2.0
def generate_dataset(n_samples, noise_std, random_state=None):
    """Generate a 1-D regression dataset with additive Gaussian noise.

    Args:
        n_samples: Number of samples to draw.
        noise_std: Standard deviation of the Gaussian noise added to y.
        random_state: Optional int seed. When given, the draw is
            reproducible WITHOUT reseeding the global NumPy RNG.

    Returns:
        Tuple (X, y): X with shape (n_samples, 1), y with shape (n_samples,).
    """
    # Bug fix: the original called np.random.seed(random_state), which
    # clobbers the global RNG state as a side effect. A dedicated
    # RandomState(seed) yields the exact same sequence as seeding the
    # global RNG (the legacy global generator is itself a RandomState),
    # so outputs are unchanged while the side effect is removed.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    x = rng.uniform(-2, 2, n_samples)
    y = true_function(x) + noise_std * rng.randn(n_samples)
    return x.reshape(-1, 1), y
# Simulation parameters
N_DATASETS = 100 # Number of training sets for the Monte Carlo simulation
N_SAMPLES = 50 # Size of each training set
NOISE_STD = 0.3 # Standard deviation of the Gaussian observation noise
# Evaluation grid, slightly wider than the training range (-2, 2)
X_TEST = np.linspace(-2.5, 2.5, 500).reshape(-1, 1)
Y_TRUE = true_function(X_TEST.flatten())
# ============================================================================
# 2. MODELOS CON DIFERENTE COMPLEJIDAD
# ============================================================================
def create_models():
    """Return the suite of regressors, ordered roughly by flexibility."""
    models = {'Lineal': LinearRegression()}
    # Polynomial pipelines: feature expansion followed by ordinary least squares.
    for degree in (2, 5, 15):
        models[f'Polinomio grado {degree}'] = make_pipeline(
            PolynomialFeatures(degree), LinearRegression()
        )
    # Trees at two depths, an ensemble, and a heavily regularized linear model.
    models['Árbol (prof=2)'] = DecisionTreeRegressor(max_depth=2, random_state=42)
    models['Árbol (prof=10)'] = DecisionTreeRegressor(max_depth=10, random_state=42)
    models['Random Forest'] = RandomForestRegressor(
        n_estimators=50, max_depth=5, random_state=42
    )
    models['Ridge (α=100)'] = Ridge(alpha=100)
    return models
# ============================================================================
# 3. ESTIMACIÓN DE SESGO, VARIANZA Y ERROR POR MONTE CARLO
# ============================================================================
def estimate_bias_variance(model, x_train_list, y_train_list, x_test, y_true, noise_std):
    """
    Estimate bias, variance and error via Monte Carlo simulation.

    Args:
        model: scikit-learn estimator (cloned for each iteration)
        x_train_list: List of training X arrays
        y_train_list: List of training y arrays
        x_test: Grid of evaluation points
        y_true: Noiseless true values f*(x) at x_test
        noise_std: Standard deviation of the observation noise

    Returns:
        dict: Estimates of bias², variance, noise, total error, the
        decomposition sum, the reconstruction error, the average
        prediction f_bar, and the full (n_datasets, n_test) prediction
        matrix.
    """
    from sklearn.base import clone
    n_datasets = len(x_train_list)
    n_test = len(x_test)
    # One row of test-grid predictions per trained replicate
    predictions = np.zeros((n_datasets, n_test))
    # Train a fresh clone on each dataset; predict on the common grid
    for i in range(n_datasets):
        model_i = clone(model)
        model_i.fit(x_train_list[i], y_train_list[i])
        predictions[i, :] = model_i.predict(x_test)
    # Expected predictor E[f̂(x)] (average over datasets)
    f_bar = np.mean(predictions, axis=0)
    # Bias²: (E[f̂(x)] - f*(x))²
    bias_squared = (f_bar - y_true) ** 2
    # Variance: E[(f̂(x) - E[f̂(x)])²]
    variance = np.mean((predictions - f_bar) ** 2, axis=0)
    # Irreducible noise: σ² (known by construction)
    noise = np.full(n_test, noise_std ** 2)
    # Bug fix: expected prediction error must be taken on NOISY targets
    # y = f*(x) + ε. Since ε is independent of f̂ and E[ε] = 0,
    #   E[(y - f̂)²] = E[(f̂ - f*)²] + σ².
    # The original compared against the noiseless y_true only, so its
    # "MSE" equaled bias² + variance and reconstruction_error was ≈ σ²
    # instead of ≈ 0, contradicting the decomposition check below.
    mse = np.mean((predictions - y_true) ** 2, axis=0) + noise_std ** 2
    # Verify the decomposition: MSE = bias² + variance + noise
    decomposition = bias_squared + variance + noise
    reconstruction_error = np.mean(np.abs(mse - decomposition))
    return {
        'bias_squared': np.mean(bias_squared),
        'variance': np.mean(variance),
        'noise': np.mean(noise),
        'mse': np.mean(mse),
        'decomposition': np.mean(decomposition),
        'reconstruction_error': reconstruction_error,
        'f_bar': f_bar,
        'predictions': predictions
    }
# ============================================================================
# 4. GENERATE MULTIPLE DATASETS
# ============================================================================
print("Generando conjuntos de datos para simulación Monte Carlo...")
# One independent training set per Monte Carlo replicate, each seeded by its
# replicate index so the whole collection is reproducible.
_datasets = [
    generate_dataset(N_SAMPLES, NOISE_STD, random_state=i)
    for i in range(N_DATASETS)
]
x_train_list = [x for x, _ in _datasets]
y_train_list = [y for _, y in _datasets]
print(f" {N_DATASETS} conjuntos generados con {N_SAMPLES} muestras cada uno")
# ============================================================================
# 5. RUN THE ANALYSIS FOR EVERY MODEL
# ============================================================================
print("\nEjecutando análisis sesgo-varianza para cada modelo...")
models = create_models()
results = {}
# Evaluate each model on the same collection of training sets so the
# bias/variance estimates are directly comparable.
for name, model in models.items():
    print(f" * Evaluando: {name}")
    results[name] = estimate_bias_variance(
        model, x_train_list, y_train_list, X_TEST, Y_TRUE, NOISE_STD
    )
# ============================================================================
# 6. VISUALIZATION OF RESULTS
# ============================================================================
print("\nGenerando visualizaciones...")
# Figure 1: per-model decomposition as stacked bars plus a total-error line
fig1, ax1 = plt.subplots(figsize=(14, 7))
model_names = list(results.keys())
positions = np.arange(len(model_names))
biases = [results[m]['bias_squared'] for m in model_names]
variances = [results[m]['variance'] for m in model_names]
noises = [results[m]['noise'] for m in model_names]
mses = [results[m]['mse'] for m in model_names]
# Stacked bars: bias² at the bottom, variance in the middle, noise on top
ax1.bar(positions, biases, label='Sesgo²', color='#2E86AB', alpha=0.8)
ax1.bar(positions, variances, bottom=biases, label='Varianza', color='#A23B72', alpha=0.8)
stack_offset = np.array(biases) + np.array(variances)
ax1.bar(positions, noises, bottom=stack_offset,
        label='Ruido', color='#F18F01', alpha=0.8)
# Overlay the total error so the stack can be compared against it directly
ax1.plot(positions, mses, 'ko-', linewidth=2, markersize=8, label='MSE Total')
ax1.set_xlabel('Modelo', fontsize=12)
ax1.set_ylabel('Error', fontsize=12)
ax1.set_title('Descomposición Sesgo-Varianza por Modelo', fontsize=14, fontweight='bold')
ax1.set_xticks(positions)
ax1.set_xticklabels(model_names, rotation=45, ha='right', fontsize=9)
ax1.legend(loc='upper right', fontsize=10)
ax1.grid(alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
# Figure 2: average prediction vs. the true function for selected models
fig2, axes = plt.subplots(2, 2, figsize=(14, 10))
fig2.suptitle('Ajuste de Modelos: Promedio sobre 100 Conjuntos de Entrenamiento',
              fontsize=14, fontweight='bold')
selected_models = ['Lineal', 'Polinomio grado 5', 'Polinomio grado 15', 'Árbol (prof=10)']
colors = ['#2E86AB', '#A23B72', '#F18F01', '#06A77D']
for panel, model_name, shade in zip(axes.flatten(), selected_models, colors):
    res = results[model_name]
    # One example training set, shown for visual context
    panel.scatter(x_train_list[0], y_train_list[0], s=15, alpha=0.3,
                  color='gray', label='Datos')
    # Ground-truth function
    panel.plot(X_TEST, Y_TRUE, 'k-', linewidth=2, label='Verdadera f*(x)')
    # Average prediction over all Monte Carlo replicates
    panel.plot(X_TEST, res['f_bar'], '--', color=shade, linewidth=2.5,
               label=f'Promedio {model_name}')
    # ±1 std band: a direct visualization of the variance component
    spread = np.std(res['predictions'], axis=0)
    panel.fill_between(X_TEST.flatten(),
                       res['f_bar'] - spread,
                       res['f_bar'] + spread,
                       alpha=0.2, color=shade, label='±1 Std (Varianza)')
    panel.set_xlabel('x', fontsize=11)
    panel.set_ylabel('y', fontsize=11)
    panel.set_title(f'{model_name}\nSesgo²={res["bias_squared"]:.4f}, Var={res["variance"]:.4f}',
                    fontsize=11)
    panel.legend(fontsize=8, loc='lower right')
    panel.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# ============================================================================
# 7. QUANTITATIVE SUMMARY
# ============================================================================
rule = "=" * 80
print("\n" + rule)
print("RESUMEN CUANTITATIVO DE DESCOMPOSICIÓN SESGO-VARIANZA")
print(rule)
print(f"{'Modelo':<25} {'Sesgo²':<12} {'Varianza':<12} {'Ruido':<10} {'MSE Total':<12}")
print("-" * 80)
# One aligned row per model, matching the column header widths above
for name in model_names:
    r = results[name]
    print(f"{name:<25} {r['bias_squared']:<12.6f} {r['variance']:<12.6f} "
          f"{r['noise']:<10.6f} {r['mse']:<12.6f}")
print("\n\tOBSERVACIONES CLAVE:")
for bullet in (
    " 1. Modelos simples (Lineal): Sesgo alto, Varianza baja",
    " 2. Modelos complejos (Polinomio 15, Árbol prof=10): Sesgo bajo, Varianza alta",
    " 3. Modelos balanceados (Polinomio 5, Random Forest): Mejor MSE total",
    " 4. El ruido es constante para todos los modelos (propiedad de los datos)",
    " 5. La descomposición se verifica: MSE ≈ Sesgo² + Varianza + Ruido",
):
    print(bullet)
print("\n\tIMPLICACIONES PRÁCTICAS:")
for bullet in (
    " * Para reducir sesgo: aumentar complejidad, añadir features, reducir regularización",
    " * Para reducir varianza: más datos, regularización, ensembling, simplificar modelo",
    " * El objetivo es minimizar MSE total, no sesgo o varianza individualmente",
):
    print(bullet)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment