@tomokishii
Last active August 21, 2018 05:26
Modern Gradient Boosting models and Scikit-learn GridSearchCV

Modern Gradient Boosting models - how to use GridSearchCV

My current environment is...

  • Python 3.5.2
  • Scikit-learn 0.18.1
  • XGBoost (python-package) 0.6
  • LightGBM (python-package) v2.0
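
To confirm that your setup matches, a quick version check (a minimal sketch, assuming the three packages are importable) looks like this:

# check_versions.py -- print the versions of the packages used below
import sklearn
import xgboost
import lightgbm

print('scikit-learn:', sklearn.__version__)
print('XGBoost:     ', xgboost.__version__)
print('LightGBM:    ', lightgbm.__version__)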
# -*- coding: utf-8 -*-
#
# digits_xgb.py
# date. 1/2/2017, 5/29
#
# I'm going to learn how to tune an XGBoost model.
# 1. K-fold cross validation, 2. GridSearch
#
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
# from sklearn.linear_model import LogisticRegression
def load_data():
    digits = load_digits()
    y = digits.target
    n_samples = len(digits.images)
    # flatten the 8x8 digit images into 64-dim feature vectors
    X = digits.images.reshape((n_samples, -1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)
    return X_train, X_test, y_train, y_test
def xgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Base analysis process by XGBoost (Grid Search)
    '''
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    xgbclf = xgb.XGBClassifier()
    # Run Grid Search process.
    # Note: passing fit_params to the GridSearchCV constructor works in
    # scikit-learn 0.18; later versions expect them as keyword
    # arguments to fit() instead.
    fit_params = {'eval_metric': 'mlogloss',
                  'verbose': False,
                  'early_stopping_rounds': 10,
                  'eval_set': [(X_test, y_test)]}
    gs_clf = GridSearchCV(xgbclf, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='accuracy')
    gs_clf.fit(X_train, y_train)
    # grid_scores_ is deprecated as of scikit-learn 0.18;
    # best_params_ / best_score_ give the same information
    best_parameters = gs_clf.best_params_
    print('score:', gs_clf.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))
    # re-fit a fresh classifier with the best parameters
    xgbclf_best = xgb.XGBClassifier(**best_parameters)
    xgbclf_best.fit(X_train, y_train)
    y_pred_train = xgbclf_best.predict_proba(X_train)
    y_pred_test = xgbclf_best.predict_proba(X_test)
    return y_pred_train, y_pred_test
#
if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    print('XGBoost process:')
    y_pred_tr, y_pred_te = xgb_gridsearch(X_train, X_test, y_train, y_test)
    # convert predicted class probabilities into class labels
    y_pred_te = np.argmax(y_pred_te, axis=1)
    # Evaluate the result
    accu = accuracy_score(y_test, y_pred_te)
    print('\nBest model:')
    print('accuracy = {:>.4f}'.format(accu))
    confmat = confusion_matrix(y_test, y_pred_te)
    print('\nconfusion matrix:')
    print(confmat)
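
A side note before the LightGBM example: since GridSearchCV defaults to refit=True, the winning configuration is already retrained on the full training set, so the manual re-fit inside xgb_gridsearch() could be replaced by gs_clf.best_estimator_. On scikit-learn 0.19 and later, where the fit_params constructor argument is deprecated, the same search would be sketched like this (reusing param_grid and the data split from digits_xgb.py above; untested against those versions, so treat it as an assumption):

# Sketch: the same grid search on scikit-learn >= 0.19, where fit
# parameters go to fit() instead of the GridSearchCV constructor.
gs_clf = GridSearchCV(xgb.XGBClassifier(), param_grid,
                      n_jobs=1, cv=5, scoring='accuracy')
gs_clf.fit(X_train, y_train,
           eval_metric='mlogloss',
           eval_set=[(X_test, y_test)],
           early_stopping_rounds=10,
           verbose=False)
# refit=True (the default) means best_estimator_ is already trained
# on the full training set with the best parameters
y_pred_te = gs_clf.best_estimator_.predict(X_test)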
#
# forest_fires_gs_lgb.py
# date. 5/29/2017
#
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
def load_data():
    fn = '../../Data/ForestFires/forestfires.csv'
    forestfires = pd.read_csv(fn)
    feats = forestfires.columns
    X = forestfires.iloc[:, :-1].values
    y = forestfires.iloc[:, -1].values
    # extract subset: drop the spatial coordinates (X, Y) and the
    # categorical month/day columns, keeping the numeric weather features
    X = X[:, 4:].astype(np.float64)
    feats = feats[4:-1]
    return X, y, feats
def lgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    LightGBM grid search cv
    '''
    param_grid = {
        'objective': ['regression'],
        'num_leaves': [15, 23, 31],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    fit_params = {
        'eval_metric': 'l2',
        'eval_set': [(X_test, y_test)],
        'verbose': False,
        'early_stopping_rounds': 10}
    # define regressor
    lgb_reg = lgb.LGBMRegressor()
    gs_reg = GridSearchCV(lgb_reg, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='neg_mean_squared_error')  # important for regression
    gs_reg.fit(X_train, y_train)
    # report mean/std CV score for every parameter combination
    means = gs_reg.cv_results_['mean_test_score']
    stds = gs_reg.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_reg.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    best_parameters = gs_reg.best_params_
    print('\nbest parameters:')
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))
    # return an *unfitted* regressor configured with the best parameters
    reg_best = lgb.LGBMRegressor(**best_parameters)
    return reg_best
if __name__ == '__main__':
    X, y, feats = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2017)
    lgb_reg = lgb_gridsearch(X_train, X_test, y_train, y_test)
    # Re-fit with the best parameters, again with early stopping
    lgb_reg.fit(X_train, y_train,
                eval_metric='l2',
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=10)
    # best iteration found by early stopping (newer LightGBM versions
    # expose this attribute as best_iteration_)
    y_pred = lgb_reg.predict(X_test, num_iteration=lgb_reg.best_iteration)
    mse = mean_squared_error(y_test, y_pred)
    print('\nrmse = ', mse ** 0.5)
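
One loose end: load_data() returns feats, but the script never uses it. A small follow-up sketch (assuming lgb_reg and feats from the script above, and the feature_importances_ attribute of the LightGBM scikit-learn wrapper) pairs the feature names with the fitted model's importances:

# Sketch: per-feature importances of the fitted LightGBM model,
# matched with the column names kept by load_data()
for name, imp in sorted(zip(feats, lgb_reg.feature_importances_),
                        key=lambda t: -t[1]):
    print('%8s: %d' % (name, imp))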