-
-
Save roberttreichler/114acf6947dfdb473f2b5159894e9115 to your computer and use it in GitHub Desktop.
XGBoost k-fold cross-validation with early stopping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
# K-fold cross-validated XGBoost training with early stopping, followed by
# writing a fold-averaged submission file.
#
# Relies on `train`, `y`, `test` and `test_id` being defined earlier in the
# file (not visible in this section). `train` and `y` are indexed with
# `.iloc`, so they are presumably pandas objects -- confirm against the
# loading code.
n_folds = 5
cv_sum = 0
# Early-stopping patience in boosting rounds. The training call used to
# hard-code 50 and ignore this variable (which was set to 100); the value
# that was actually in effect is kept so results do not change.
early_stopping = 50
# Upper bound on boosting rounds; early stopping is expected to end training
# well before this is reached.
max_rounds = 100000
fpred = []
xgb_rounds = []
# Not read anywhere in this section; kept in case later code uses it.
rand_state = 2016

d_train_full = xgb.DMatrix(train, label=y)
d_test = xgb.DMatrix(test)

# Booster parameters are identical for every fold, so build them once.
params = {
    'seed': 0,
    'nthread': -1,
    'colsample_bytree': 0.8,
    'silent': 0,
    'subsample': 0.8,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'scale_pos_weight': 1,
    'gamma': 0,
    'max_depth': 11,
    'min_child_weight': 3,
    'booster': 'gbtree',
    'eval_metric': 'logloss',
}

# NOTE(review): this is the legacy `sklearn.cross_validation.KFold(n, n_folds=)`
# signature, matching the import at the top of the file. Newer scikit-learn
# exposes KFold from `sklearn.model_selection` with a different API.
kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i + 1))
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    # The last watchlist entry ('eval') is the one early stopping monitors.
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    clf = xgb.train(
        params,
        d_train,
        max_rounds,
        watchlist,
        early_stopping_rounds=early_stopping,
    )
    xgb_rounds.append(clf.best_iteration)

    # Score the held-out fold using only the trees up to the best iteration.
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = log_loss(y_val, scores_val)
    print('log loss: %.6f' % cv_score)
    cv_sum = cv_sum + cv_score

    # Keep a running sum of the per-fold test predictions; it is divided by
    # the number of folds after the loop to get the average.
    y_pred = clf.predict(d_test, ntree_limit=clf.best_ntree_limit)
    pred = y_pred if i == 0 else pred + y_pred
    fpred = pred

mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-log loss: %.6f' % score)
# Mean best iteration across folds; not used further in this section.
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(columns=['test_id', 'is_duplicate'])
result['is_duplicate'] = mpred
result["test_id"] = test_id['test_id']

print("%d-fold average prediction:" % n_folds)
now = datetime.now()
# `score` is rebound to its rounded string form for use in the file name.
score = str(round((cv_sum / n_folds), 6))
sub_file = ('submission_5fold-average-xgb_fairobj_' + score + '_'
            + now.strftime("%Y-%m-%d-%H-%M") + '.csv')
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment