My current environment is...
- Python 3.5.2
- Scikit-learn 0.18.1
- XGBoost (python-package) 0.6
- LightGBM (python-package) v2.0
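Each of these packages exposes a __version__ string, so you can quickly confirm that your own environment matches (a generic check, not part of the scripts below):

import sklearn
import xgboost
import lightgbm

print('scikit-learn :', sklearn.__version__)
print('XGBoost      :', xgboost.__version__)
print('LightGBM     :', lightgbm.__version__)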
# -*- coding: utf-8 -*-
#
# digits_xgb.py
# date. 1/2/2017, 5/29
#
# I'm going to learn how to tune an XGBoost model.
# 1. K-fold cross validation, 2. Grid Search
#
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
# from sklearn.linear_model import LogisticRegression

def load_data():
    digits = load_digits()
    y = digits.target
    n_samples = len(digits.images)
    # flatten each 8x8 image into a 64-dim feature vector
    X = digits.images.reshape((n_samples, -1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)

    return X_train, X_test, y_train, y_test

def xgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Base analysis process by XGBoost (Grid Search)
    '''
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    xgbclf = xgb.XGBClassifier()

    # Run Grid Search process.
    # In scikit-learn 0.18, fit_params is passed to the GridSearchCV
    # constructor; newer versions pass these arguments to fit() instead.
    fit_params = {'eval_metric': 'mlogloss',
                  'verbose': False,
                  'early_stopping_rounds': 10,
                  'eval_set': [(X_test, y_test)]}
    gs_clf = GridSearchCV(xgbclf, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='accuracy')
    gs_clf.fit(X_train, y_train)

    # best_params_/best_score_ replace the deprecated grid_scores_
    best_parameters = gs_clf.best_params_
    print('score:', gs_clf.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))

    # re-fit on the full training set with the best parameters
    xgbclf_best = xgb.XGBClassifier(**best_parameters)
    xgbclf_best.fit(X_train, y_train)
    y_pred_train = xgbclf_best.predict_proba(X_train)
    y_pred_test = xgbclf_best.predict_proba(X_test)

    return y_pred_train, y_pred_test

if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()

    print('XGBoost process:')
    y_pred_tr, y_pred_test = xgb_gridsearch(X_train, X_test, y_train, y_test)
    # predict_proba returns class probabilities; argmax gives the label
    y_pred_test = np.argmax(y_pred_test, axis=1)

    # Evaluate the result
    accu = accuracy_score(y_test, y_pred_test)
    print('\nBest model:')
    print('accuracy = {:>.4f}'.format(accu))
    confmat = confusion_matrix(y_test, y_pred_test)
    print('\nconfusion matrix:')
    print(confmat)
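The header comment lists K-fold cross validation as the first topic, and KFold is imported above but never called: GridSearchCV does the fold splitting internally via cv=n_folds. If you additionally want to average predictions over models trained on different folds, a minimal sketch follows; xgb_kfold_average is my own illustration of that idea, not part of the original script.

def xgb_kfold_average(X_train, X_test, y_train, params, n_folds=5):
    '''
    Train one XGBClassifier per fold and average predict_proba on X_test
    (illustration only, not in the original script).
    '''
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    n_classes = len(np.unique(y_train))
    proba_sum = np.zeros((len(X_test), n_classes))
    for train_idx, _ in kf.split(X_train):
        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train[train_idx], y_train[train_idx])
        proba_sum += clf.predict_proba(X_test)

    # averaged class probabilities over the n_folds models
    return proba_sum / n_folds

Called with the best parameters found by the grid search, np.argmax(probs, axis=1) turns the averaged probabilities into labels, just as in the main block above.

The second script applies the same grid-search pattern to regression: LightGBM on the UCI Forest Fires dataset.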
#
# forest_fires_gs_lgb.py
# date. 5/29/2017
#
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

def load_data():
    fn = '../../Data/ForestFires/forestfires.csv'
    forestfires = pd.read_csv(fn)
    feats = forestfires.columns
    X = forestfires.iloc[:, :-1].values
    y = forestfires.iloc[:, -1].values
    # extract subset: drop the first four columns (X, Y, month, day),
    # keeping the numeric weather features; the target is 'area'
    X = X[:, 4:]
    feats = feats[4:-1]

    return X, y, feats

def lgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    LightGBM grid search cv
    '''
    param_grid = {
        'objective': ['regression'],
        'num_leaves': [15, 23, 31],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    fit_params = {
        'eval_metric': 'l2',
        'eval_set': [(X_test, y_test)],
        'verbose': False,
        'early_stopping_rounds': 10}

    # define regressor
    lgb_reg = lgb.LGBMRegressor()
    # GridSearchCV maximizes its score, so the MSE must be negated for
    # regression - hence 'neg_mean_squared_error'
    gs_reg = GridSearchCV(lgb_reg, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='neg_mean_squared_error')
    gs_reg.fit(X_train, y_train)

    means = gs_reg.cv_results_['mean_test_score']
    stds = gs_reg.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_reg.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

    best_parameters = gs_reg.best_params_
    print('\nbest parameters:')
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))

    # return an unfitted regressor configured with the best parameters
    reg_best = lgb.LGBMRegressor(**best_parameters)

    return reg_best

if __name__ == '__main__':
    X, y, feats = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2017)
    lgb_reg = lgb_gridsearch(X_train, X_test, y_train, y_test)

    # Re-fit on the training set, with early stopping on the held-out set
    lgb_reg.fit(X_train, y_train,
                eval_metric='l2',
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=10)

    # predict with the best iteration found by early stopping
    # (newer LightGBM versions expose this attribute as best_iteration_)
    y_pred = lgb_reg.predict(X_test, num_iteration=lgb_reg.best_iteration)
    mse = mean_squared_error(y_test, y_pred)
    print('\nrmse = ', mse ** 0.5)
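load_data() also returns the feature names in feats, which the main block never uses. A natural follow-up, continuing in the script's namespace (my addition, assuming the fitted lgb_reg and feats from above), is to rank the features by the fitted model's split importances:

# rank features by importance from the fitted booster
# (assumes lgb_reg and feats from the script above)
for imp, name in sorted(zip(lgb_reg.feature_importances_, feats),
                        reverse=True):
    print('{:>6} : {}'.format(imp, name))

LGBMRegressor exposes feature_importances_ through its scikit-learn interface, so this needs no extra imports.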