Skip to content

Instantly share code, notes, and snippets.

@roberttreichler
Last active March 22, 2017 02:14
Show Gist options
  • Save roberttreichler/114acf6947dfdb473f2b5159894e9115 to your computer and use it in GitHub Desktop.
Save roberttreichler/114acf6947dfdb473f2b5159894e9115 to your computer and use it in GitHub Desktop.
xgboost early stop
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
# ---------------------------------------------------------------------------
# K-fold cross-validated XGBoost training with early stopping, followed by a
# fold-averaged prediction on the test set that is written to a CSV file.
#
# `train`, `y`, `test` and `test_id` must already be defined when this runs;
# they are not created anywhere in this script.
# NOTE(review): `.iloc` is used on `train` and `y`, so both are presumably
# pandas objects -- confirm where they are loaded.
# ---------------------------------------------------------------------------
n_folds = 5
cv_sum = 0
# Passed to xgb.train as `early_stopping_rounds`. This was declared as 100 but
# never used while the train call hard-coded 50; the effective value (50) is
# kept here so training behaviour is unchanged and the setting lives in one
# place.
early_stopping = 50
# Upper bound on boosting rounds; in practice early stopping ends training.
max_boost_rounds = 100000
xgb_rounds = []

# Booster parameters are identical for every fold, so build them once.
params = {
    'seed': 0,
    'nthread': -1,
    'colsample_bytree': 0.8,
    'silent': 0,
    'subsample': 0.8,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'scale_pos_weight': 1,
    'gamma': 0,
    'max_depth': 11,
    'min_child_weight': 3,
    'booster': 'gbtree',
    'eval_metric': 'logloss',
}

# Not used below; presumably kept for a later refit on all data with
# `n_rounds` -- TODO confirm or drop.
d_train_full = xgb.DMatrix(train, label=y)
d_test = xgb.DMatrix(test)

# NOTE(review): this is the legacy `sklearn.cross_validation.KFold` signature
# (sample count + `n_folds`, iterated directly), matching the file's import.
# That module was removed in scikit-learn 0.20; migrating means importing from
# `sklearn.model_selection`, using `KFold(n_splits=n_folds)` and iterating
# `kf.split(train)`.
kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i + 1))
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    clf = xgb.train(
        dict(params),  # pass a copy so every fold starts from the same dict
        d_train,
        max_boost_rounds,
        watchlist,
        early_stopping_rounds=early_stopping,
    )
    xgb_rounds.append(clf.best_iteration)

    # Score the held-out fold using only the trees up to the best iteration.
    # NOTE(review): newer XGBoost releases replace `ntree_limit` /
    # `best_ntree_limit` with `iteration_range`; kept as-is to match the
    # library versions the rest of the file targets.
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = log_loss(y_val, scores_val)
    print('log loss: %.6f' % cv_score)
    cv_sum = cv_sum + cv_score

    # Accumulate test-set predictions across folds; averaged after the loop.
    y_pred = clf.predict(d_test, ntree_limit=clf.best_ntree_limit)
    pred = y_pred if i == 0 else pred + y_pred

mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-log loss: %.6f' % score)
# Mean best iteration over the folds; not used further in this script.
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
# Build the frame positionally: assigning the `test_id` column as a Series
# would align on its index and silently yield NaN ids if that index is not
# 0..n-1. The explicit `columns` keeps the output column order stable.
result = pd.DataFrame(
    {
        'test_id': np.asarray(test_id['test_id']),
        'is_duplicate': mpred,
    },
    columns=['test_id', 'is_duplicate'],
)

print("%d-fold average prediction:" % n_folds)
now = datetime.now()
score = str(round(score, 6))
sub_file = ('submission_5fold-average-xgb_fairobj_' + score + '_'
            + now.strftime("%Y-%m-%d-%H-%M") + '.csv')
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment