## -----------------------------------------------------------------------------
# Simple demonstration of stacking using three models with the Ames housing data
## -----------------------------------------------------------------------------

library(tidymodels)
library(rules)
library(doMC)
library(AmesHousing)
library(ggforce)

registerDoMC(cores = 10)
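# Note: doMC parallelizes via forking, so it is macOS/Linux only. On Windows,
# an analogous setup (an assumption, not part of the original script) would be:
#   library(doParallel)
#   registerDoParallel(cores = 10)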
## -----------------------------------------------------------------------------

ames <-
  make_ames() %>%
  # Remove quality-related predictors
  dplyr::select(-matches("Qu")) %>%
  mutate(Sale_Price = log10(Sale_Price))

set.seed(4595)
data_split <- initial_split(ames, strata = "Sale_Price")

ames_train <- training(data_split)
ames_test  <- testing(data_split)

set.seed(8051)
rs <- vfold_cv(ames_train, strata = Sale_Price)
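# One structural point worth calling out: every model below is tuned on these
# same `rs` folds. The stacking step lines up the out-of-sample predictions by
# row, so reusing a single resampling object across all ensemble members is
# required.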
# ------------------------------------------------------------------------------
# We'll use different pre-processing for different models, but we can start
# with this:

basic_rec <-
  recipe(Sale_Price ~ ., data = ames_train) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors())

# Stacking will always need the out-of-sample predictions
ctrl <- control_grid(save_pred = TRUE)

metric <- metric_set(rmse)
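# To make the `save_pred = TRUE` point concrete: after tuning, each result
# object carries the held-out predictions, retrievable with
# collect_predictions(). Roughly (column names beyond .pred/.row depend on
# each model's tuning parameters):
#
#   collect_predictions(cubist_res, summarize = TRUE)
#   # a tibble with .pred, .row, the tuning parameter columns, and Sale_Price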
# ------------------------------------------------------------------------------
# Fit a Cubist rule-based model:

cubist_grid <- expand.grid(committees = 10 * (1:5), neighbors = 0:9)

set.seed(7966)
cubist_res <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine("Cubist") %>%
  tune_grid(
    Sale_Price ~ .,
    resamples = rs,
    grid = cubist_grid,
    control = ctrl,
    metrics = metric
  )

autoplot(cubist_res)
# ------------------------------------------------------------------------------
# lasso model with supplemental nonlinear terms for a few key predictors:

glmnet_rec <-
  basic_rec %>%
  step_ns(Longitude, deg_free = tune("longitude_df")) %>%
  step_ns(Latitude, deg_free = tune("latitude_df")) %>%
  step_ns(Gr_Liv_Area, deg_free = tune("live_area_df")) %>%
  step_normalize(all_predictors())

glmnet_mod <-
  linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

glmnet_wflow <-
  workflow() %>%
  add_recipe(glmnet_rec) %>%
  add_model(glmnet_mod)

glmnet_param <-
  glmnet_wflow %>%
  parameters() %>%
  update(
    longitude_df = spline_degree(c(3, 50)),
    latitude_df  = spline_degree(c(3, 50)),
    live_area_df = spline_degree(c(3, 50))
  )

set.seed(419)
glmnet_res <-
  glmnet_wflow %>%
  tune_grid(
    resamples = rs,
    param_info = glmnet_param, # use the widened spline ranges defined above
    grid = 25,
    control = ctrl,
    metrics = metric
  )

autoplot(glmnet_res)
# ------------------------------------------------------------------------------
# MARS models

mars_grid <- expand.grid(num_terms = seq(2, 50, by = 2), prod_degree = 1:2)

set.seed(175)
mars_res <-
  mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  tune_grid(
    basic_rec,
    resamples = rs,
    grid = mars_grid,
    control = ctrl,
    metrics = metric
  )

autoplot(mars_res)
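# A quick side-by-side of the best resampled RMSE per member model before
# stacking (a convenience check added here, not in the original script):
purrr::map_dfr(
  list(cubist = cubist_res, glmnet = glmnet_res, mars = mars_res),
  ~ show_best(.x, metric = "rmse", n = 1),
  .id = "model"
) %>%
  dplyr::select(model, .metric, mean, std_err)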
# ------------------------------------------------------------------------------
# Stacking requires two things for each model:
#
#  - A fitted workflow object for each model in the ensemble
#  - The resampling results from `fit_resamples()`, `tune_grid()`, or `tune_bayes()`
#
# This function will collate the sets of out-of-sample predictions from the
# resampling results. This will need to be extended in a few ways, as well as
# adding some convenience functions to assess the diversity of the ensemble
# results.
#
# Let's assume that we will select the sub-model with the smallest RMSE. We
# should generalize that, along with the ability to use all/some sub-models in
# the tuning results.

get_best_pred <- function(x, pred_name = stop("need a name")) {
  nm_list <- list(".pred")
  names(nm_list) <- pred_name
  res <- select_best(x, metric = "rmse")
  pred <-
    collect_predictions(x, parameters = res, summarize = TRUE) %>%
    ungroup()
  pred %>% dplyr::select(!!!nm_list, .row)
}
stack_data <-
  ames_train %>%
  select(Sale_Price) %>%
  add_rowindex() %>%
  full_join(get_best_pred(cubist_res, "cubist"), by = ".row") %>%
  full_join(get_best_pred(glmnet_res, "glmnet"), by = ".row") %>%
  full_join(get_best_pred(mars_res, "mars"), by = ".row") %>%
  dplyr::select(-.row)

model_correlations <- cor(stack_data)
round(model_correlations, 2)

# A quick look at how redundant the members are: PCA on the between-model
# correlation matrix (dropping the outcome's row/column)
prcomp(model_correlations[-1, -1])

ggplot(stack_data, aes(x = .panel_x, y = .panel_y)) +
  geom_point(alpha = .3, size = 1) +
  geom_autodensity(alpha = .25) +
  facet_matrix(vars(everything()), layer.diag = 2) +
  theme_bw()
## -----------------------------------------------------------------------------
# Create the stacking loadings/coefficients. Given the correlations between
# model results, regularization should be used along with a non-negative
# coefficient constraint. glmnet is a good option, but Bayesian methods might
# be a good choice too.

stack_model_spec <-
  linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet", lower.limits = 0)

set.seed(1872)
stack_model_res <-
  stack_model_spec %>%
  tune_grid(
    Sale_Price ~ .,
    resamples = bootstraps(stack_data), # any resampling method would work here
    grid = tibble(penalty = 10^(-6:-1)),
    metrics = metric
  )

autoplot(stack_model_res)

stack_model_fit <-
  stack_model_spec %>%
  finalize_model(tibble(penalty = 10^-3)) %>%
  fit(Sale_Price ~ ., data = stack_data)

# What are the actual coefficients for each model?
coef(stack_model_fit$fit, s = 10^-3) %>%
  as.matrix() %>%
  t() %>%
  as_tibble()
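# Sanity check (a small addition to the original): `lower.limits = 0`
# constrains the member coefficients (not the intercept) to be non-negative.
all(as.matrix(coef(stack_model_fit$fit, s = 10^-3))[-1, ] >= 0)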
## -----------------------------------------------------------------------------
# At this point, we would want to have fitted workflows for each model with
# non-zero coefficients. These would need to be bundled into an object so that
# we have a single point of prediction; a hand-rolled sketch of that idea
# follows, and the next section mocks up some interface thoughts.
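# A minimal hand-rolled sketch of "single point of prediction" (not a proposed
# API): given fitted workflows and their stacking coefficients, blend the
# member predictions. The fitted workflow objects named below are hypothetical
# and the coefficients are made up for illustration.
blend_predict <- function(fits, coefs, intercept, new_data) {
  preds <- purrr::map(fits, ~ predict(.x, new_data)$.pred)
  Reduce(`+`, purrr::map2(preds, coefs, `*`)) + intercept
}
# blended <- blend_predict(
#   fits = list(cubist_wflow_fit, glmnet_wflow_fit, mars_wflow_fit),
#   coefs = c(0.55, 0.25, 0.20),
#   intercept = 0.1,
#   new_data = ames_test
# )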
## -----------------------------------------------------------------------------
# API thoughts... here is my first thought about how the process might work.
# There's a lot of hand-waving here until we figure out some details.

if (FALSE) {
  # This collates the resampling results and catalogs a few things about the
  # models.
  res <-
    ens_stacked() %>%
    add_member(fit = cubist_fit, results = cubist_res) %>%
    add_member(fit = glmnet_fit, results = glmnet_res) %>%
    add_member(fit = mars_fit, results = mars_res)

  # Notes:
  #
  # 1. Rather than using a tibble as the primary data object for 'res', we
  #    would use a list, I think.
  #
  # 2. We'd need to hope that the fit is matched to the grid results. I don't
  #    know of a way to do that. Potentially, the `fit` wouldn't be needed
  #    until the end, so users could leave that empty and we do the fit at
  #    the end.
  #
  # 3. Instead of `fit`, maybe we should ask for the workflow and enforce a
  #    re-fitting.
  #
  # 4. Use some verb to execute the process of estimating the stacking
  #    coefficients. `evaluate`? `components` would be a good non-verb choice.
  #    Or... we could use `fit` to calculate the coefficients and `refit` to
  #    make the object that has the ensemble members within.

  # This would add the coefficients to the object.
  res <- evaluate(res, model = "glmnet", parameters, resampling)

  # More notes:
  #
  # 5. This object should be able to autoplot() the results of the tuning and
  #    allow users to set the amount of regularization and finalize. We might
  #    need a different object type here to do that part.
  #
  # 6. When we save the final coefficients, it should be a simple R expression
  #    that can be evaluated in a `dplyr::mutate()`. This will make the object
  #    smaller and also avoid having to keep model objects for
  #    configurations/models that have coefficients of zero.

  res <- fit(res, training = ames_train)

  predict(res, new_data = ames_test) # `type` argument for classification models
}
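# To make note 6 above concrete: the saved blending expression might look
# roughly like this (hypothetical coefficients, evaluated against a tibble of
# member predictions):
#
#   member_preds %>%
#     mutate(.pred = 0.1 + 0.55 * cubist + 0.25 * glmnet + 0.20 * mars)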
## -----------------------------------------------------------------------------
# Also, a lot of people would add _all_ of the sub-model results to the
# ensemble and let the regularized model select what is best for the ensemble.
# That's basically what h2o.ai does.

get_all_pred <- function(x, pred_name = stop("need a name")) {
  params <- attributes(x)$parameters$id
  pred <-
    collect_predictions(x, summarize = TRUE) %>%
    ungroup()
  # Julia has a PR in progress to automatically assign a configuration ID to
  # each parameter combination. We could use that and swap the prefix to
  # pred_name: https://github.com/tidymodels/tune/pull/227
  configs <-
    pred %>%
    dplyr::select(!!!params) %>%
    distinct()
  configs$.config <- recipes::names0(nrow(configs), pred_name)
  pred <-
    pred %>%
    full_join(configs, by = params) %>%
    dplyr::select(.row, .pred, .config) %>%
    tidyr::pivot_wider(
      id_cols = ".row",
      names_from = ".config",
      values_from = ".pred"
    )
  # we will also have to save the mapping from the parameters to .config
  pred
}
# This seems slower than it should be
stack_submodels <-
  ames_train %>%
  select(Sale_Price) %>%
  add_rowindex() %>%
  full_join(get_all_pred(cubist_res, "cubist"), by = ".row") %>%
  full_join(get_all_pred(glmnet_res, "glmnet"), by = ".row") %>%
  full_join(get_all_pred(mars_res, "mars"), by = ".row") %>%
  dplyr::select(-.row)

submodel_correlations <- cor(stack_submodels)
round(submodel_correlations, 2)

prcomp(submodel_correlations[-1, -1])

ggplot(stack_submodels, aes(x = .panel_x, y = .panel_y)) +
  geom_point(alpha = .3, size = 1) +
  geom_autodensity(alpha = .25) +
  facet_matrix(vars(everything()), layer.diag = 2) +
  theme_bw()
set.seed(1872)
stack_submodel_res <-
  stack_model_spec %>%
  tune_grid(
    Sale_Price ~ .,
    resamples = bootstraps(stack_submodels),
    grid = tibble(penalty = 10^(-6:-1)),
    metrics = metric
  )

autoplot(stack_submodel_res)

stack_submodel_fit <-
  stack_model_spec %>%
  finalize_model(tibble(penalty = 10^-3)) %>%
  fit(Sale_Price ~ ., data = stack_submodels)

coef(stack_submodel_fit$fit, s = 10^-3) %>%
  as.matrix() %>%
  as.data.frame() %>%
  setNames("coefficient") %>%
  rownames_to_column("term") %>%
  as_tibble() %>%
  filter(coefficient > 0) %>%
  arrange(desc(coefficient))
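# How sparse is the sub-model ensemble? A quick count (an addition to the
# original) of members kept, i.e., non-zero coefficients at the chosen
# penalty, excluding the intercept:
sum(as.matrix(coef(stack_submodel_fit$fit, s = 10^-3))[-1, ] > 0)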