Skip to content

Instantly share code, notes, and snippets.

@rtkilian
Created July 11, 2022 22:44
Show Gist options
  • Save rtkilian/284a37bf11bd96c53504e7c4cd1ed8df to your computer and use it in GitHub Desktop.
Save rtkilian/284a37bf11bd96c53504e7c4cd1ed8df to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from sktime.forecasting.model_selection import temporal_train_test_split, SingleWindowSplitter, ForecastingRandomizedSearchCV
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sktime.utils.plotting import plot_series
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
from xgboost import XGBRegressor
# Forecast the series `y` with an XGBoost regressor wrapped as a recursive
# sktime forecaster, tuning hyper-parameters by randomised search.
# NOTE(review): `y` is not defined in this snippet; it is expected to exist
# already (e.g. from an earlier notebook cell) with an index exposing `.month`.

# Create an exogenous dataframe indicating the month.
# The month number is cast to str so that get_dummies one-hot encodes it;
# drop_first=True drops one level so the dummy columns are not redundant.
X = pd.DataFrame({'month': y.index.month}, index=y.index)
X = pd.get_dummies(X.astype(str), drop_first=True)

# Split data: the last `test_size` observations are held out for testing.
test_size = 26
y_train, y_test = temporal_train_test_split(y, test_size=test_size)  # Predict from 1st July 2019
X_train, X_test = temporal_train_test_split(X, test_size=test_size)

# Forecasting horizon, same as test data (absolute index values).
fh = ForecastingHorizon(y_test.index, is_relative=False)

# Window for cross-validation.
# The splitter is applied to y_train inside gscv.fit, so the window must be
# sized from y_train (not the full series y); otherwise window + horizon is
# longer than the data being split. The horizon lists every step 1..N so that
# the search scores the whole validation span, mirroring the multi-step test
# forecast, instead of only the single N-th step ahead.
validation_size = 26
cv = SingleWindowSplitter(
    window_length=len(y_train) - validation_size,
    fh=np.arange(1, validation_size + 1),
)

# XGBoost forecaster with grid of parameters.
# The `estimator__` prefix routes each parameter to the regressor wrapped by
# the reduction forecaster.
param_grid = {
    'estimator__max_depth': [3, 5, 6, 10, 15, 20],
    'estimator__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'estimator__subsample': np.arange(0.5, 1.0, 0.1),
    'estimator__colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'estimator__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
    'estimator__n_estimators': [100, 500, 1000],
}
regressor = XGBRegressor(objective='reg:squarederror', random_state=42)
# Each prediction uses the previous 52 observations as lag features.
forecaster = make_reduction(regressor, window_length=52, strategy="recursive")

# Randomised search: 100 sampled parameter combinations, fixed seed for
# reproducibility.
gscv = ForecastingRandomizedSearchCV(
    forecaster,
    cv=cv,
    param_distributions=param_grid,
    n_iter=100,
    random_state=42,
)

# Fit and predict
gscv.fit(y=y_train, X=X_train)
y_pred = gscv.predict(fh=fh, X=X_test)

# Plot predictions with training and test data (training shown from
# 2018-07-01 onwards only, to keep the chart readable).
plot_series(
    y_train['2018-07-01':],
    y_test,
    y_pred,
    labels=["y_train", "y_test", "y_pred"],
    x_label='Date',
    y_label='Count pedestrians',
)

# Evaluate
mape = mean_absolute_percentage_error(y_test, y_pred, symmetric=False)
print(f'MAPE: {mape:.4f}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment