Last active
August 29, 2017 19:11
-
-
Save Nikolay-Lysenko/37169317108d1426de6f0ccfacb8a961 to your computer and use it in GitHub Desktop.
Why should no one shuffle data endowed with temporal structure?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script demonstrates why train set and | |
test set must not be shuffled if a dataset has | |
temporal structure. In case of temporal | |
structure, shuffling leads to leakage of some | |
information about a test set to a training set | |
and this results in too optimistic scores. | |
The point of the considered here example is that | |
a particular flaw of tree-based ensembles | |
can not be revealed by evaluation with shuffling. | |
Namely, this flaw is that decision trees can not | |
make predictions that are beyond range of | |
target values on a train set. Hence, tree-based | |
ensembles are inappropriate tool for modeling of | |
time series with strong trend (at least until | |
such series are not detrended). However, | |
wrong evaluation leads to scores that are high | |
enough and allow drawing a deceptive conclusion | |
that there is no problem and the model is perfect. | |
@author: Nikolay Lysenko | |
""" | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import r2_score, mean_squared_error | |
from sklearn.base import BaseEstimator | |
from sklearn.ensemble import RandomForestRegressor | |
def generate_trivial_time_series_with_trend(series_length: int) -> np.ndarray:
    """
    Build a two-column array describing the series y = t.

    The first column is the time step t = 0, 1, ..., series_length - 1
    and the second column is the target y, which is equal to t.
    Both columns are stored as float64.

    Such a series is trivially forecasted by linear regression,
    but not by tree-based methods.
    """
    steps = np.arange(series_length, dtype=np.float64)
    # Feature and target are the same values placed side by side.
    return np.column_stack((steps, steps))
def report_performance(X_train: np.ndarray, X_test: np.ndarray,
                       y_train: np.ndarray, y_test: np.ndarray,
                       rgr: BaseEstimator, name_of_run: str) -> None:
    """
    Fit `rgr` to `X_train` and `y_train`,
    report R^2 and MSE on `X_test` and `y_test`,
    and after that visualize results.

    :param X_train: features used for fitting
    :param X_test: features used for evaluation
    :param y_train: target values used for fitting
    :param y_test: target values used for evaluation
    :param rgr: regressor exposing `fit` and `predict`;
        it is fitted in place by this function
    :param name_of_run: label that prefixes the printed report
        and the plot title
    :return: None

    The report line is printed and also used as the plot title.
    The plot shows train points in blue, actual test points in orange,
    and predictions on the test set as red crosses.
    NOTE(review): the scatter plots put `X_*` on the horizontal axis,
    so they presumably expect a single feature column.
    """
    rgr.fit(X_train, y_train)
    y_hat = rgr.predict(X_test)

    report_template = '{name}: R^2 is {r_sq:.3f}, MSE is {mse:.3f}'
    filling = {'name': name_of_run,
               'r_sq': r2_score(y_test, y_hat),
               'mse': mean_squared_error(y_test, y_hat)}
    report = report_template.format(**filling)
    print(report)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_train, y_train, color='blue', alpha=0.1)
    ax.scatter(X_test, y_test, color='orange', alpha=0.5)
    ax.scatter(X_test, y_hat, color='red', marker='x')
    ax.set_title(report)
    plt.show()
def main() -> None:
    """
    Evaluate the same random forest on the trivial trend series twice:
    first with a shuffled train/test split, then with a chronological
    split where the test set is the tail of the series.

    Each evaluation prints its scores and shows a plot
    via `report_performance`.
    """
    series_length = 100
    random_state = 361
    # Single source of truth for the hold-out share, so that both splits
    # put the same proportion of observations into the test set instead of
    # relying on the implicit default of `train_test_split` in one place
    # and on a hard-coded 0.75 in another.
    test_fraction = 0.25

    dataset = generate_trivial_time_series_with_trend(series_length)
    # The last column is the target, everything before it is features.
    X, y = dataset[:, :-1], dataset[:, -1]
    # The same estimator object is passed to both runs;
    # `report_performance` calls `fit` on it each time.
    rgr = RandomForestRegressor(n_estimators=25, random_state=random_state)

    # Shuffled split: test observations are scattered among train ones.
    wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test = \
        train_test_split(X, y, test_size=test_fraction,
                         random_state=random_state)
    report_performance(
        wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test, rgr,
        'Wrong train/test split')

    # Chronological split: everything after the threshold is the test set.
    train_test_threshold = int(round((1 - test_fraction) * series_length))
    fair_X_train, fair_X_test = (X[:train_test_threshold],
                                 X[train_test_threshold:])
    fair_y_train, fair_y_test = (y[:train_test_threshold],
                                 y[train_test_threshold:])
    report_performance(
        fair_X_train, fair_X_test, fair_y_train, fair_y_test, rgr,
        'Fair train/test split')
# Run the demonstration only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment