Skip to content

Instantly share code, notes, and snippets.

@TimothyYe
Created July 25, 2024 13:52
Show Gist options
  • Save TimothyYe/b5037516ab7d2f90108cae67dcd67ed2 to your computer and use it in GitHub Desktop.
Save TimothyYe/b5037516ab7d2f90108cae67dcd67ed2 to your computer and use it in GitHub Desktop.
Stacking to Improve Model Performance
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Build a synthetic binary-classification dataset (1000 rows, 20 features).
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
# https://medium.com/@brijesh_soni/stacking-to-improve-model-performance-a-comprehensive-guide-on-ensemble-learning-in-python-9ed53c93ce28
# Hold out 20% of the data for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Level-0 (base) learners: two tree ensembles plus a linear model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(random_state=42)
# Fit every base learner on the same training split.
for base_learner in (rf_model, xgb_model, lr_model):
    base_learner.fit(X_train, y_train)
# Example 1: Stacking
def stacking_example(X_train, y_train, X_test, y_test, base_models, meta_model):
# Generate predictions from base models
base_predictions = np.column_stack([
model.predict_proba(X_train)[:, 1] for model in base_models
])
# Train meta-model
meta_model.fit(base_predictions, y_train)
# Make predictions on test set
test_base_predictions = np.column_stack([
model.predict_proba(X_test)[:, 1] for model in base_models
])
final_predictions = meta_model.predict(test_base_predictions)
return accuracy_score(y_test, final_predictions)
# Use logistic regression as the meta-model
# Stack the three fitted base learners under a logistic-regression meta-learner.
base_models = [rf_model, xgb_model, lr_model]
meta_model = LogisticRegression()
stacking_accuracy = stacking_example(
    X_train, y_train, X_test, y_test, base_models, meta_model
)
print(f"Stacking Accuracy: {stacking_accuracy:.4f}")
# Example 2: Weighted Averaging
def weighted_averaging_example(X_test, y_test, models, weights):
    """Blend model probabilities via a weighted average and return accuracy.

    Parameters
    ----------
    X_test, y_test : evaluation split.
    models : fitted classifiers exposing ``predict_proba``.
    weights : per-model weights, aligned with ``models`` (``np.average``
        normalizes them, so they need not sum to 1).

    Returns
    -------
    float
        Accuracy of the blended predictions on ``X_test``.
    """
    # Positive-class probability from every model: shape (n_models, n_samples).
    predictions = np.array([model.predict_proba(X_test)[:, 1] for model in models])
    # Weighted mean across the model axis.
    weighted_predictions = np.average(predictions, axis=0, weights=weights)
    # Threshold at 0.5 to turn blended probabilities into hard labels.
    final_predictions = (weighted_predictions > 0.5).astype(int)
    return accuracy_score(y_test, final_predictions)
# Weight each base model in proportion to its standalone test accuracy.
individual_accuracies = [
    accuracy_score(y_test, m.predict(X_test)) for m in base_models
]
weights = np.array(individual_accuracies) / np.sum(individual_accuracies)
weighted_avg_accuracy = weighted_averaging_example(X_test, y_test, base_models, weights)
print(f"Weighted Averaging Accuracy: {weighted_avg_accuracy:.4f}")
# Report each base model's standalone accuracy for comparison with the ensembles.
model_names = ["Random Forest", "XGBoost", "Logistic Regression"]
for model, acc in zip(model_names, individual_accuracies):
    print(f"{model} Accuracy: {acc:.4f}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment