# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Our example dataset
from sklearn.datasets import load_boston

# Classifiers
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Hyperparameter distributions
from scipy.stats import randint
from scipy.stats import uniform

# Model selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize  # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args  # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper  # Stop the optimization before running out of a fixed budget of time
from skopt.callbacks import VerboseCallback  # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper  # Stop the optimization if the distance between the last two sampled points is less than delta
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performance of different optimizers

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds, candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (time() - start,
                                     len(optimizer.cv_results_['params']),
                                     best_score,
                                     best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Converting average precision score into a scorer suitable for model selection
avg_prec = make_scorer(average_precision_score, greater_is_better=True, needs_proba=True)
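# NOTE: X and y_bin are never defined in this gist. A plausible, hypothetical setup is
# sketched below: the Boston housing data (matching the load_boston import above) with
# the target binarized at its 75th percentile, so that the binary objectives, average
# precision scoring and StratifiedKFold all make sense. The threshold is an assumption;
# substitute your own feature matrix X and binary target y_bin. Also note that
# load_boston was removed in scikit-learn 1.2, so this requires an older scikit-learn
# or an equivalent tabular dataset.
X, y = load_boston(return_X_y=True)
y_bin = (y > np.percentile(y, 75)).astype(int)  # assumed binarization threshold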
# LightGBM
clf = lgb.LGBMClassifier(boosting_type='gbdt',
                         class_weight='balanced',
                         objective='binary',
                         n_jobs=1,
                         verbose=0)

search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'num_leaves': Integer(2, 500),
    'max_depth': Integer(0, 500),
    'min_child_samples': Integer(0, 200),  # minimal number of data points in one leaf
    'max_bin': Integer(100, 100000),  # max number of bins that feature values will be bucketed into
    'subsample': Real(0.01, 1.0, 'uniform'),  # bagging fraction
    'subsample_freq': Integer(0, 10),  # frequency for bagging (0 disables it)
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # fraction of features sampled per tree
    'min_child_weight': Integer(0, 10),  # minimal sum of hessian in one leaf
    'subsample_for_bin': Integer(100000, 500000),  # number of samples used to construct histogram bins
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
    'n_estimators': Integer(10, 10000)
}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)

best_params = report_perf(opt, X, y_bin, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
# XGBoost
clf = xgb.XGBClassifier(
    n_jobs=1,
    objective='binary:logistic',
    verbosity=0,  # 'silent' was deprecated in favor of 'verbosity'
    tree_method='approx')

search_spaces = {'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'min_child_weight': Integer(0, 10),
                 'max_depth': Integer(0, 50),
                 'max_delta_step': Integer(0, 20),  # maximum delta step allowed for each leaf output
                 'subsample': Real(0.01, 1.0, 'uniform'),
                 'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per tree
                 'colsample_bylevel': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per level
                 'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
                 'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
                 'gamma': Real(1e-9, 0.5, 'log-uniform'),  # minimum loss reduction required for a partition
                 'n_estimators': Integer(50, 100),
                 'scale_pos_weight': Real(1e-6, 500, 'log-uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)

best_params = report_perf(opt, X, y_bin, 'XGBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
# CatBoost
clf = CatBoostClassifier(loss_function='Logloss',
                         verbose=False)

search_spaces = {'iterations': Integer(10, 100),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),  # randomness added when scoring splits
                 'bagging_temperature': Real(0.0, 1.0),  # intensity of the Bayesian bootstrap
                 'border_count': Integer(1, 255),  # number of splits for numerical features
                 'l2_leaf_reg': Integer(2, 30),  # L2 regularization
                 'scale_pos_weight': Real(0.01, 10.0, 'uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation faults
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)

best_params = report_perf(opt, X, y_bin, 'CatBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
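# Since refit=True, the fitted BayesSearchCV already exposes the tuned model as
# opt.best_estimator_. The lines below are an optional, hypothetical follow-up (not in
# the original gist) showing how the returned best_params could be used to rebuild and
# persist a final CatBoost model trained on the full training data.
final_clf = CatBoostClassifier(loss_function='Logloss', verbose=False, **best_params)
final_clf.fit(X, y_bin)
joblib.dump(final_clf, 'catboost_tuned.joblib')  # filename is illustrative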
# Source: Competitive GBDT Specification and Optimization Workshop