mohit-sinha · June 29, 2018 11:54
diff --git a/xgb_bo.py b/xgb_bo.py
 from bayes_opt import BayesianOptimization
 from sklearn.cross_validation import KFold
 import xgboost as xgb
 import numpy

 def xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample):
    # prepare xgb parameters 
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "tree_method": 'auto',
        "silent": 1,
        "eta": eta,
        "max_depth": int(maxDepth),
        "min_child_weight" : minChildWeight,
        "subsample": subsample,
        "colsample_bytree": colSample,
        "gamma": gamma
    }
    cvScore = kFoldValidation(train, features, params, int(numRounds), nFolds = 3)
    print('CV score: {:.6f}'.format(cvScore))
    return -1.0 * cvScore   # invert the cv score to let bayopt maximize
   
 def bayesOpt(train, features):
    ranges = {
        'numRounds': (1000, 5000),
        'eta': (0.001, 0.3),
        'gamma': (0, 25),
        'maxDepth': (1, 10),
        'minChildWeight': (0, 10),
        'subsample': (0, 1),
        'colSample': (0, 1)
    }
    # proxy through a lambda to be able to pass train and features
    optFunc = lambda numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample: xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample)
    bo = BayesianOptimization(optFunc, ranges)
    bo.maximize(init_points = 50, n_iter = 5, kappa = 2, acq = "ei", xi = 0.0)
    
    bestAUC = round((-1.0 * bo.res['max']['max_val']), 6)
    print("\n Best AUC found: %f" % bestAUC)
    print("\n Parameters: %s" % bo.res['max']['max_params'])
    

 def kFoldValidation(train, features, xgbParams, numRounds, nFolds, target='is_pass'):
    kf = KFold(len(train), n_folds = nFolds, shuffle = True)
    fold_score=[]
    
    for train_index, cv_index in kf:
        # split train/validation
        X_train, X_valid = train[features].as_matrix()[train_index], train[features].as_matrix()[cv_index]
        y_train, y_valid = train[target].as_matrix()[train_index], train[target].as_matrix()[cv_index]
        dtrain = xgb.DMatrix(X_train, y_train) 
        dvalid = xgb.DMatrix(X_valid, y_valid)
        
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(xgbParams, dtrain, numRounds, evals = watchlist, early_stopping_rounds = 100)
        
        score = gbm.best_score
        fold_score.append(score)
    
    return numpy.mean(fold_score)
	from bayes_opt import BayesianOptimization
	from sklearn.cross_validation import KFold
	import xgboost as xgb
	import numpy

	def xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample):
	# prepare xgb parameters
	params = {
	"objective": "binary:logistic",
	"booster" : "gbtree",
	"eval_metric": "auc",
	"tree_method": 'auto',
	"silent": 1,
	"eta": eta,
	"max_depth": int(maxDepth),
	"min_child_weight" : minChildWeight,
	"subsample": subsample,
	"colsample_bytree": colSample,
	"gamma": gamma
	}
	cvScore = kFoldValidation(train, features, params, int(numRounds), nFolds = 3)
	print('CV score: {:.6f}'.format(cvScore))
	return -1.0 * cvScore # invert the cv score to let bayopt maximize

	def bayesOpt(train, features):
	ranges = {
	'numRounds': (1000, 5000),
	'eta': (0.001, 0.3),
	'gamma': (0, 25),
	'maxDepth': (1, 10),
	'minChildWeight': (0, 10),
	'subsample': (0, 1),
	'colSample': (0, 1)
	}
	# proxy through a lambda to be able to pass train and features
	optFunc = lambda numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample: xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample)
	bo = BayesianOptimization(optFunc, ranges)
	bo.maximize(init_points = 50, n_iter = 5, kappa = 2, acq = "ei", xi = 0.0)

	bestAUC = round((-1.0 * bo.res['max']['max_val']), 6)
	print("\n Best AUC found: %f" % bestAUC)
	print("\n Parameters: %s" % bo.res['max']['max_params'])


	def kFoldValidation(train, features, xgbParams, numRounds, nFolds, target='is_pass'):
	kf = KFold(len(train), n_folds = nFolds, shuffle = True)
	fold_score=[]

	for train_index, cv_index in kf:
	# split train/validation
	X_train, X_valid = train[features].as_matrix()[train_index], train[features].as_matrix()[cv_index]
	y_train, y_valid = train[target].as_matrix()[train_index], train[target].as_matrix()[cv_index]
	dtrain = xgb.DMatrix(X_train, y_train)
	dvalid = xgb.DMatrix(X_valid, y_valid)

	watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
	gbm = xgb.train(xgbParams, dtrain, numRounds, evals = watchlist, early_stopping_rounds = 100)

	score = gbm.best_score
	fold_score.append(score)

	return numpy.mean(fold_score)