Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Nikolay-Lysenko/37169317108d1426de6f0ccfacb8a961 to your computer and use it in GitHub Desktop.
Save Nikolay-Lysenko/37169317108d1426de6f0ccfacb8a961 to your computer and use it in GitHub Desktop.
Why no one should shuffle data endowed with temporal structure
"""
This script demonstrates why train set and
test set must not be shuffled if a dataset has
temporal structure. In case of temporal
structure, shuffling leads to leakage of some
information about a test set to a training set
and this results in too optimistic scores.
The point of the example considered here is that
a particular flaw of tree-based ensembles
cannot be revealed by evaluation with shuffling.
Namely, this flaw is that decision trees can not
make predictions that are beyond range of
target values on a train set. Hence, tree-based
ensembles are an inappropriate tool for modeling
time series with a strong trend (at least until
such series are detrended). However,
wrong evaluation leads to scores that are high
enough and allow drawing a deceptive conclusion
that there is no problem and the model is perfect.
@author: Nikolay Lysenko
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
def generate_trivial_time_series_with_trend(series_length: int) -> np.ndarray:
    """
    Generate a time series whose dependent variable equals the time
    step itself (y = t, where y is the target and t is time).

    Such a series is trivially forecast by linear regression, but not
    by tree-based methods.

    :param series_length: number of time steps to generate
    :return: array of shape `(series_length, 2)` where the first
        column is the time variable and the second is the target
    """
    time_steps = np.arange(series_length, dtype=np.float64).reshape(-1, 1)
    # Target is an exact copy of the time variable: perfect linear trend.
    return np.hstack((time_steps, time_steps.copy()))
def report_performance(X_train: np.ndarray, X_test: np.ndarray,
                       y_train: np.ndarray, y_test: np.ndarray,
                       rgr: BaseEstimator, name_of_run: str) -> None:
    """
    Fit `rgr` on the training data, print R^2 and MSE on the test
    data, and visualize predictions against the true values.

    :param X_train: training features (here, shape `(n_train, 1)`)
    :param X_test: test features (here, shape `(n_test, 1)`)
    :param y_train: training targets
    :param y_test: test targets
    :param rgr: scikit-learn regressor; fitted in place as a side effect
    :param name_of_run: label used in the printed report and plot title
    :return: None; prints the report and shows a matplotlib figure
    """
    # NOTE: fixed the return annotation — `type(None)` was a runtime
    # call; PEP 484 spells this annotation simply as `None`.
    rgr.fit(X_train, y_train)
    y_hat = rgr.predict(X_test)
    report = '{name}: R^2 is {r_sq:.3f}, MSE is {mse:.3f}'.format(
        name=name_of_run,
        r_sq=r2_score(y_test, y_hat),
        mse=mean_squared_error(y_test, y_hat)
    )
    print(report)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Faint blue: train points; orange: true test values; red x: predictions.
    ax.scatter(X_train, y_train, color='blue', alpha=0.1)
    ax.scatter(X_test, y_test, color='orange', alpha=0.5)
    ax.scatter(X_test, y_hat, color='red', marker='x')
    ax.set_title(report)
    plt.show()
def main():
    """Contrast shuffled vs. chronological evaluation on a trending series."""
    series_length = 100
    random_state = 361
    dataset = generate_trivial_time_series_with_trend(series_length)
    rgr = RandomForestRegressor(n_estimators=25, random_state=random_state)

    # Wrong way: random shuffling leaks future information into training.
    shuffled_split = train_test_split(dataset[:, :-1], dataset[:, -1],
                                      random_state=random_state)
    wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test = shuffled_split
    report_performance(
        wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test, rgr,
        'Wrong train/test split')

    # Fair way: chronological split, first 75% of steps for training.
    threshold = int(round(0.75 * series_length, 0))
    report_performance(
        dataset[:threshold, :-1], dataset[threshold:, :-1],
        dataset[:threshold, -1], dataset[threshold:, -1],
        rgr, 'Fair train/test split')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment