Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Nikolay-Lysenko/37169317108d1426de6f0ccfacb8a961 to your computer and use it in GitHub Desktop.
Save Nikolay-Lysenko/37169317108d1426de6f0ccfacb8a961 to your computer and use it in GitHub Desktop.
Why no one should shuffle data endowed with temporal structure
"""
This script demonstrates why train set and
test set must not be shuffled if a dataset has
temporal structure. In case of temporal
structure, shuffling leads to leakage of some
information about a test set to a training set
and this results in too optimistic scores.
The point of the example considered here is that
a particular flaw of tree-based ensembles
cannot be revealed by evaluation with shuffling.
Namely, this flaw is that decision trees can not
make predictions that are beyond range of
target values on a train set. Hence, tree-based
ensembles are an inappropriate tool for modeling
time series with a strong trend (at least until
such series are detrended). However,
wrong evaluation leads to scores that are high
enough and allow drawing a deceptive conclusion
that there is no problem and the model is perfect.
@author: Nikolay Lysenko
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
def generate_trivial_time_series_with_trend(series_length: int) -> np.ndarray:
    """
    Generate a time series whose dependent variable equals the time
    step itself (y = t, where y is the target and t is time).

    Such a series is trivially forecast by linear regression, but not
    by tree-based methods.

    :param series_length: number of time steps to generate
    :return: array of shape `(series_length, 2)` where the first
        column is the time variable and the second is the target
    """
    time_steps = np.arange(series_length, dtype=np.float64).reshape(-1, 1)
    # Target is an exact copy of the time variable: perfect linear trend.
    return np.hstack((time_steps, time_steps.copy()))
def report_performance(X_train: np.ndarray, X_test: np.ndarray,
                       y_train: np.ndarray, y_test: np.ndarray,
                       rgr: BaseEstimator, name_of_run: str) -> None:
    """
    Fit `rgr` on the training data, print R^2 and MSE on the test
    data, and visualize predictions against the true values.

    :param X_train: training features (here, shape `(n_train, 1)`)
    :param X_test: test features (here, shape `(n_test, 1)`)
    :param y_train: training targets
    :param y_test: test targets
    :param rgr: scikit-learn regressor; fitted in place as a side effect
    :param name_of_run: label used in the printed report and plot title
    :return: None; prints the report and shows a matplotlib figure
    """
    # NOTE: fixed the return annotation — `type(None)` was a runtime
    # call; PEP 484 spells this annotation simply as `None`.
    rgr.fit(X_train, y_train)
    y_hat = rgr.predict(X_test)
    report = '{name}: R^2 is {r_sq:.3f}, MSE is {mse:.3f}'.format(
        name=name_of_run,
        r_sq=r2_score(y_test, y_hat),
        mse=mean_squared_error(y_test, y_hat)
    )
    print(report)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Faint blue: train points; orange: true test values; red x: predictions.
    ax.scatter(X_train, y_train, color='blue', alpha=0.1)
    ax.scatter(X_test, y_test, color='orange', alpha=0.5)
    ax.scatter(X_test, y_hat, color='red', marker='x')
    ax.set_title(report)
    plt.show()
def main():
    """Contrast shuffled vs. chronological evaluation on a trending series."""
    series_length = 100
    random_state = 361
    dataset = generate_trivial_time_series_with_trend(series_length)
    rgr = RandomForestRegressor(n_estimators=25, random_state=random_state)

    # Wrong way: random shuffling leaks future information into training.
    shuffled_split = train_test_split(dataset[:, :-1], dataset[:, -1],
                                      random_state=random_state)
    wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test = shuffled_split
    report_performance(
        wrong_X_train, wrong_X_test, wrong_y_train, wrong_y_test, rgr,
        'Wrong train/test split')

    # Fair way: chronological split, first 75% of steps for training.
    threshold = int(round(0.75 * series_length, 0))
    report_performance(
        dataset[:threshold, :-1], dataset[threshold:, :-1],
        dataset[:threshold, -1], dataset[threshold:, -1],
        rgr, 'Fair train/test split')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment