@thomasjungblut
XGBoost hyperparameter optimization using bayes_opt
import numpy as np
import xgboost as xgb
from bayes_opt import BayesianOptimization
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives in model_selection
from sklearn.model_selection import KFold
def xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample):
    # prepare xgb parameters
    params = {
        "objective": "reg:squarederror",  # renamed from "reg:linear" in newer XGBoost releases
        "booster": "gbtree",
        "eval_metric": "mae",
        "tree_method": "auto",
        "verbosity": 0,  # replaces the deprecated "silent": 1
        "eta": eta,
        "max_depth": int(maxDepth),
        "min_child_weight": minChildWeight,
        "subsample": subsample,
        "colsample_bytree": colSample,
        "gamma": gamma
    }

    cvScore = kFoldValidation(train, features, params, int(numRounds), nFolds=3)
    print('CV score: {:.6f}'.format(cvScore))

    return -1.0 * cvScore  # invert the cv score to let bayes_opt maximize
def bayesOpt(train, features):
    ranges = {
        'numRounds': (1000, 5000),
        'eta': (0.001, 0.3),
        'gamma': (0, 25),
        'maxDepth': (1, 10),
        'minChildWeight': (0, 10),
        'subsample': (0, 1),
        'colSample': (0, 1)
    }

    # proxy through a lambda to be able to pass train and features
    optFunc = lambda numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample: \
        xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample)

    bo = BayesianOptimization(optFunc, ranges)
    # older bayes_opt versions also accepted acq="ei", kappa=2, xi=0.0 here;
    # bayes_opt >= 2.0 configures the acquisition function separately instead
    bo.maximize(init_points=50, n_iter=5)

    # bo.res['max'] in old bayes_opt versions; bo.max in current ones
    bestMAE = round(-1.0 * bo.max['target'], 6)
    print("\n Best MAE found: %f" % bestMAE)
    print("\n Parameters: %s" % bo.max['params'])
def kFoldValidation(train, features, xgbParams, numRounds, nFolds, target='loss'):
    kf = KFold(n_splits=nFolds, shuffle=True)
    # materialize the matrices once; DataFrame.as_matrix() was removed from pandas
    X = train[features].values
    y = train[target].values
    fold_score = []

    for train_index, cv_index in kf.split(X):
        # split train/validation
        X_train, X_valid = X[train_index], X[cv_index]
        y_train, y_valid = y[train_index], y[cv_index]
        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_valid, y_valid)

        # early stopping on the held-out fold; best_score is the best eval-set MAE
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(xgbParams, dtrain, numRounds, evals=watchlist, early_stopping_rounds=100)
        fold_score.append(gbm.best_score)

    return np.mean(fold_score)
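
For completeness, a minimal usage sketch. The CSV file name and the id/loss column names are assumptions about the data set, not part of the gist; the feature columns must be numeric for xgb.DMatrix:

import pandas as pd

# hypothetical input: a CSV with numeric feature columns and a 'loss' target
train = pd.read_csv("train.csv")  # assumed file name
features = [c for c in train.columns if c not in ("id", "loss")]

bayesOpt(train, features)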
@Jason2Brownlee

Excellent use of k-fold cross-validation with early stopping.

Enumerating the CV folds for XGBoost manually is the only way I've found of correctly setting the eval set required for early stopping.
