Skip to content

Instantly share code, notes, and snippets.

@mavillan
Last active January 7, 2021 19:26
Show Gist options
  • Save mavillan/0b28b8a8e4d0095eef8fa2a026a9469e to your computer and use it in GitHub Desktop.
# Plain 5-fold CV with early stopping on each fold: estimates RMSE and
# records the per-fold best number of boosting rounds.
# NOTE(review): early stopping peeks at the same fold used for scoring,
# which is the optimistic-estimate issue the later snippets address.
# NOTE(review): KFold.split yields *positional* indices, so the .loc
# lookups below are only correct if `train` has a default RangeIndex
# (0..n-1); switch to .iloc otherwise — TODO confirm.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
cv_scores = list()
best_iterations = list()
for train_idx, valid_idx in kf.split(train.loc[:, input_cols]):
    X_train, y_train = train.loc[train_idx, input_cols], train.loc[train_idx, target]
    X_valid, y_valid = train.loc[valid_idx, input_cols], train.loc[valid_idx, target]
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    _model = lgb.train(
        train_set=train_dset,
        valid_sets=[valid_dset, ],
        num_boost_round=10000,      # upper bound; early stopping picks the real size
        early_stopping_rounds=50,
        params=model_params,
        verbose_eval=25,
    )
    cv_scores.append(_model.best_score["valid_0"]["rmse"])
    best_iterations.append(_model.best_iteration)
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 250.98540
print(f"Best iterations by fold: {best_iterations}")
# output: Best iterations by fold: [653, 157, 75, 62, 2751]
# Nested cross-validation: the inner 5-fold loop (with early stopping)
# selects the number of boosting rounds, and the outer 5-fold loop
# estimates generalization RMSE with a model retrained at that fixed
# size — so the early-stopping signal never sees the outer valid fold.
# NOTE(review): the outer .loc lookups assume `train` has a default
# RangeIndex (KFold yields positional indices) — TODO confirm; the
# inner loop correctly uses .iloc on the already-sliced X_train.
kf_outer = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
cv_scores = list()
for train_idx, valid_idx in kf_outer.split(train.loc[:, input_cols]):
    X_train, y_train = train.loc[train_idx, input_cols], train.loc[train_idx, target]
    X_valid, y_valid = train.loc[valid_idx, input_cols], train.loc[valid_idx, target]
    best_iterations = list()
    # different seed than the outer split so inner folds don't mirror outer ones
    kf_inner = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)
    for train_idx_inner, valid_idx_inner in kf_inner.split(X_train, y_train):
        X_train_inner, y_train_inner = X_train.iloc[train_idx_inner, :], y_train.iloc[train_idx_inner]
        X_valid_inner, y_valid_inner = X_train.iloc[valid_idx_inner, :], y_train.iloc[valid_idx_inner]
        train_dset = cast_to_lgb_dset(X_train_inner, y_train_inner, categoric_cols)
        valid_dset = cast_to_lgb_dset(X_valid_inner, y_valid_inner, categoric_cols)
        _model = lgb.train(
            train_set=train_dset,
            valid_sets=[valid_dset, ],
            num_boost_round=10000,
            early_stopping_rounds=50,
            params=model_params,
            verbose_eval=25
        )
        best_iterations.append(_model.best_iteration)
    # robust aggregate of the five inner choices
    best_iteration = np.median(best_iterations)
    # trains a model over outer split using best iteration obtained from inner loop
    # (without using early stopping to avoid leakage)
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    model = lgb.train(
        train_set=train_dset,
        num_boost_round=int(best_iteration),
        params=model_params
    )
    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    cv_scores.append(rmse)
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 284.08901
# Two-stage alternative to nested CV: one CV pass (different seed) picks
# the number of boosting rounds via early stopping; a second, independent
# CV pass retrains at that fixed size to estimate performance — cheaper
# than nested CV, and the evaluation folds never drive early stopping.
# NOTE(review): the .loc lookups assume `train` has a default RangeIndex,
# since KFold yields positional indices — TODO confirm.
# first kfold splitting object (with a different seed!)
# the first kfold splitting is for finding a good value of number of iterations
kf1 = model_selection.KFold(n_splits=5, shuffle=True, random_state=123)
best_iterations = list()
for train_idx, valid_idx in kf1.split(train.loc[:, input_cols]):
    X_train, y_train = train.loc[train_idx, input_cols], train.loc[train_idx, target]
    X_valid, y_valid = train.loc[valid_idx, input_cols], train.loc[valid_idx, target]
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    _model = lgb.train(
        train_set=train_dset,
        valid_sets=[valid_dset, ],
        num_boost_round=10000,
        early_stopping_rounds=50,
        params=model_params,
        verbose_eval=25,
    )
    best_iterations.append(_model.best_iteration)
# robust aggregate of the per-fold choices
best_iteration = np.median(best_iterations)
# the second kfold splitting for estimating model performance
kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)
cv_scores = list()
for train_idx, valid_idx in kf2.split(train.loc[:, input_cols]):
    X_train, y_train = train.loc[train_idx, input_cols], train.loc[train_idx, target]
    X_valid, y_valid = train.loc[valid_idx, input_cols], train.loc[valid_idx, target]
    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    model = lgb.train(
        train_set=train_dset,
        num_boost_round=int(best_iteration),  # fixed size, no early stopping
        params=model_params
    )
    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    cv_scores.append(rmse)
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
# output: CV RMSE: 270.39060
date store_nbr day_of_week week_of_year temperature ... sales
2021-01-01 123 5 1 23 ... 30213
2021-01-01 521 5 1 07 ... 61823
2021-01-01 623 5 1 13 ... 19019
2021-01-01 215 5 1 27 ... 51391
2021-01-01 765 5 1 18 ... 31222
... ... ... ... ... ... ...
2021-01-01 341 5 1 31 ... 89081
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing

# Synthetic dataset: 1000 rows, 5 features, 1 target column.
X_full = np.random.random((1000, 5))
y_full = np.random.random((1000, 1))

# Leaky scaling: the scaler is fit on ALL rows before splitting, so the
# test rows influence the mean/std used to transform the training rows.
scaler1 = preprocessing.StandardScaler()
X_full_scaled = scaler1.fit_transform(X_full)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_full_scaled, y_full, test_size=0.25
)

# Correct scaling: split first, fit the scaler on the training rows only,
# then apply the frozen transform to the held-out rows.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_full, y_full, test_size=0.25
)
scaler2 = preprocessing.StandardScaler()
X_train = scaler2.fit_transform(X_train)
X_test = scaler2.transform(X_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment