Last active
January 7, 2021 19:26
-
-
Save mavillan/0b28b8a8e4d0095eef8fa2a026a9469e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plain 5-fold cross-validation with early stopping.
# Each fold early-stops on its validation split and then reports the RMSE
# measured on that very same split, so the stopping point is tuned on the
# data used for scoring.
#
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of `lgb.train` only in LightGBM < 4.0; newer releases expect the
# `lgb.early_stopping(...)` / `lgb.log_evaluation(...)` callbacks instead.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
cv_scores = []
best_iterations = []

# Slice features/target once; the folds below only pick rows.
X_all = train.loc[:, input_cols]
y_all = train.loc[:, target]

for train_idx, valid_idx in kf.split(X_all):
    # KFold.split yields *positional* indices, so rows are taken with .iloc.
    # Label-based .loc only gives the same rows when `train` carries a default
    # RangeIndex (it breaks after filtering, sorting or concatenation).
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    # cast_to_lgb_dset is a project helper defined elsewhere
    # (presumably wraps the data into an lgb.Dataset -- confirm).
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)

    _model = lgb.train(
        train_set=train_dset,
        valid_sets=[valid_dset],
        num_boost_round=10000,
        early_stopping_rounds=50,
        params=model_params,
        verbose_eval=25,
    )

    # Score and iteration count recorded at the early-stopping optimum.
    cv_scores.append(_model.best_score["valid_0"]["rmse"])
    best_iterations.append(_model.best_iteration)

print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 250.98540
print(f"Best iterations by fold: {best_iterations}")
# output: Best iterations by fold: [653, 157, 75, 62, 2751]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nested cross-validation.
# The inner K-fold (run only on the outer training rows) picks the number of
# boosting rounds via early stopping; the outer K-fold then scores a model
# trained with that fixed number of rounds on rows it has never seen.
#
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of `lgb.train` only in LightGBM < 4.0; newer releases expect the
# `lgb.early_stopping(...)` / `lgb.log_evaluation(...)` callbacks instead.
kf_outer = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
# Built once: with an integer random_state the splitter is reproducible across
# split() calls, so re-creating it inside the outer loop was redundant.
kf_inner = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)
cv_scores = []

# Slice features/target once; the folds below only pick rows.
X_all = train.loc[:, input_cols]
y_all = train.loc[:, target]

for train_idx, valid_idx in kf_outer.split(X_all):
    # KFold.split yields *positional* indices, so rows are taken with .iloc
    # (as the inner loop already does). Label-based .loc only gives the same
    # rows when `train` carries a default RangeIndex.
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    best_iterations = []
    for train_idx_inner, valid_idx_inner in kf_inner.split(X_train, y_train):
        X_train_inner = X_train.iloc[train_idx_inner, :]
        y_train_inner = y_train.iloc[train_idx_inner]
        X_valid_inner = X_train.iloc[valid_idx_inner, :]
        y_valid_inner = y_train.iloc[valid_idx_inner]

        # cast_to_lgb_dset is a project helper defined elsewhere
        # (presumably wraps the data into an lgb.Dataset -- confirm).
        train_dset = cast_to_lgb_dset(X_train_inner, y_train_inner, categoric_cols)
        valid_dset = cast_to_lgb_dset(X_valid_inner, y_valid_inner, categoric_cols)

        _model = lgb.train(
            train_set=train_dset,
            valid_sets=[valid_dset],
            num_boost_round=10000,
            early_stopping_rounds=50,
            params=model_params,
            verbose_eval=25,
        )
        best_iterations.append(_model.best_iteration)

    # Median of the inner folds' stopping points; with 5 inner folds this is
    # always one of the observed values.
    best_iteration = np.median(best_iterations)

    # trains a model over outer split using best iteration obtained from inner loop
    # (without using early stopping to avoid leakage)
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    model = lgb.train(
        train_set=train_dset,
        num_boost_round=int(best_iteration),
        params=model_params,
    )

    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    cv_scores.append(rmse)

print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 284.08901
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Two independent K-fold passes over the same data:
#   1. kf1 finds a good number of boosting rounds via early stopping;
#   2. kf2 (different seed) estimates performance with that number fixed.
#
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of `lgb.train` only in LightGBM < 4.0; newer releases expect the
# `lgb.early_stopping(...)` / `lgb.log_evaluation(...)` callbacks instead.

# Slice features/target once; the folds below only pick rows.
X_all = train.loc[:, input_cols]
y_all = train.loc[:, target]

# first kfold splitting object (with a different seed!)
# the first kfold splitting is for finding a good value of number of iterations
kf1 = model_selection.KFold(n_splits=5, shuffle=True, random_state=123)
best_iterations = []

for train_idx, valid_idx in kf1.split(X_all):
    # KFold.split yields *positional* indices, so rows are taken with .iloc.
    # Label-based .loc only gives the same rows when `train` carries a default
    # RangeIndex.
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    # cast_to_lgb_dset is a project helper defined elsewhere
    # (presumably wraps the data into an lgb.Dataset -- confirm).
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)

    _model = lgb.train(
        train_set=train_dset,
        valid_sets=[valid_dset],
        num_boost_round=10000,
        early_stopping_rounds=50,
        params=model_params,
        verbose_eval=25,
    )
    best_iterations.append(_model.best_iteration)

# Median of the per-fold stopping points; with 5 folds this is always one of
# the observed values.
best_iteration = np.median(best_iterations)

# the second kfold splitting for estimating model performance
kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
cv_scores = []

for train_idx, valid_idx in kf2.split(X_all):
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    # No validation set and no early stopping here: the number of rounds is
    # fixed to the value chosen by the first pass.
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    model = lgb.train(
        train_set=train_dset,
        num_boost_round=int(best_iteration),
        params=model_params,
    )

    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    cv_scores.append(rmse)

print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 270.39060
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
date | store_nbr | day_of_week | week_of_year | temperature | ... | sales |
---|---|---|---|---|---|---|
2021-01-01 | 123 | 5 | 1 | 23 | ... | 30213 |
2021-01-01 | 521 | 5 | 1 | 07 | ... | 61823 |
2021-01-01 | 623 | 5 | 1 | 13 | ... | 19019 |
2021-01-01 | 215 | 5 | 1 | 27 | ... | 51391 |
2021-01-01 | 765 | 5 | 1 | 18 | ... | 31222 |
... | ... | ... | ... | ... | ... | ... |
2021-01-01 | 341 | 5 | 1 | 31 | ... | 89081 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn import model_selection, preprocessing

# Synthetic dataset: 1000 rows with 5 feature columns and a single target column.
X_full = np.random.random(size=(1000, 5))
y_full = np.random.random(size=(1000, 1))

# Leaky scaling: the scaler is fitted on ALL rows before splitting, so the
# rows that later end up in the test set contribute to the scaling statistics.
scaler1 = preprocessing.StandardScaler().fit(X_full)
X_full_scaled = scaler1.transform(X_full)
split = model_selection.train_test_split(X_full_scaled, y_full, test_size=0.25)
X_train, X_test, y_train, y_test = split

# Correct scaling: split first, fit the scaler on the training rows only,
# then apply that same fitted scaler to both train and test rows.
split = model_selection.train_test_split(X_full, y_full, test_size=0.25)
X_train, X_test, y_train, y_test = split
scaler2 = preprocessing.StandardScaler().fit(X_train)
X_train = scaler2.transform(X_train)
X_test = scaler2.transform(X_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment