Last active
July 19, 2019 17:27
-
-
Save topepo/e428a24e7bce3fd6f65bf524519c700b to your computer and use it in GitHub Desktop.
prototype for processing grids in terms of sub-models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# _Prototype_ code to find the minimum grid that should be fit for models. This | |
# exploits the fact that some models can evaluate extra sub-models from the same | |
# object. | |
# devtools::install_github("tidymodels/parsnip") | |
# devtools::install_github("tidymodels/dials") | |
library(tidymodels) | |
#> ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.0.2 ── | |
#> ✔ broom 0.5.1 ✔ purrr 0.3.2 | |
#> ✔ dials 0.0.2.9000 ✔ recipes 0.1.6 | |
#> ✔ dplyr 0.8.3 ✔ rsample 0.0.5 | |
#> ✔ ggplot2 3.2.0 ✔ tibble 2.1.3 | |
#> ✔ infer 0.4.0 ✔ yardstick 0.0.3.9000 | |
#> ✔ parsnip 0.0.2.9000 | |
#> ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ── | |
#> ✖ purrr::discard() masks scales::discard() | |
#> ✖ dplyr::filter() masks stats::filter() | |
#> ✖ dplyr::lag() masks stats::lag() | |
#> ✖ ggplot2::margin() masks dials::margin() | |
#> ✖ recipes::step() masks stats::step() | |
min_grid <- function(x, grid, ...) { | |
# x is a `model_spec` object from parsnip | |
# grid is a tibble of tuning parameer values with names | |
# matching the parameter names. | |
UseMethod("min_grid") | |
} | |
# As an example, if we fit a boosted tree model and tune over | |
# trees = 1:20 and min_n = c(20, 30) | |
# we should only have to fit two models: | |
# | |
# trees = 20 & min_n = 20 | |
# trees = 20 & min_n = 30 | |
# | |
# The logic related to how this "mini grid" gets made is model-specific. | |
# | |
# To get the full set of predictions, we need to know, for each of these two | |
# models, what values of num_terms to give to the multi_predict() function. | |
# | |
# The current idea is to have a list column of the extra models for prediction. | |
# For the example above: | |
# | |
# # A tibble: 2 x 3 | |
# trees min_n .submodels | |
# <dbl> <dbl> <list> | |
# 1 20 20 <named list [1]> | |
# 2 20 30 <named list [1]> | |
# | |
# and the .submodels would both be | |
# | |
# list(trees = 1:19) | |
# | |
# There are a lot of other things to consider in future versions like grids | |
# where there are multiple columns with the same name (maybe the results of | |
# a recipe) and so on. | |
# helper functions | |
# Template for model results that do no have the sub-model feature | |
blank_submodels <- function(grid) { | |
grid %>% | |
dplyr::mutate(.submodels = map(1:nrow(grid), ~ list())) | |
} | |
get_fixed_args <- function(info) { | |
# Get non-sub-model columns to iterate over | |
fixed_args <- info$name[!info$has_submodel] | |
} | |
get_submodel_info <- function(spec, grid) { | |
param_info <- | |
get_from_env(paste0(class(spec)[1], "_args")) %>% | |
dplyr::filter(engine == spec$engine) %>% | |
dplyr::select(name = parsnip, has_submodel) | |
# In case a recipe or other activity has grid parameter columns, | |
# add those to the results | |
grid_names <- names(grid) | |
is_mod_param <- grid_names %in% param_info$name | |
if (any(!is_mod_param)) { | |
param_info <- | |
param_info %>% | |
bind_rows( | |
tibble(name = grid_names[!is_mod_param], | |
has_submodel = FALSE) | |
) | |
} | |
param_info %>% dplyr::filter(name %in% grid_names) | |
} | |
min_grid.boost_tree <- function(x, grid, ...) { | |
grid_names <- names(grid) | |
param_info <- get_submodel_info(x, grid) | |
# No ability to do submodels? Finish here: | |
if (!any(param_info$has_submodel)) { | |
return(blank_submodels(grid)) | |
} | |
fixed_args <- get_fixed_args(param_info) | |
# For boosted trees, fit the model with the most trees (conditional on the | |
# other parameters) so that you can do predictions on the smaller models. | |
fit_only <- | |
grid %>% | |
dplyr::group_by(!!!syms(fixed_args)) %>% | |
dplyr::summarize(trees = max(trees, na.rm = TRUE)) %>% | |
dplyr::ungroup() | |
# Add a column .submodels that is a list with what should be predicted | |
# by `multi_predict()` (assuming `predict()` has already been executed | |
# on the original value of 'trees') | |
min_grid_df <- | |
dplyr::full_join(fit_only %>% rename(max_tree = trees), grid, by = fixed_args) %>% | |
dplyr::filter(trees != max_tree) %>% | |
dplyr::group_by(!!!syms(fixed_args)) %>% | |
dplyr::summarize(.submodels = list(list(trees = trees))) %>% | |
dplyr::ungroup() %>% | |
dplyr::full_join(fit_only, grid, by = fixed_args) | |
min_grid_df %>% dplyr::select(one_of(grid_names), .submodels) | |
} | |
# Examples | |
# simple grids | |
boosting_spec <- boost_tree() %>% set_engine("xgboost") | |
basic_grid <- grid_regular(trees(), min_n(c(10, 20)), tree_depth(), levels = 3) | |
smaller_grid <- min_grid(boosting_spec, basic_grid) | |
smaller_grid | |
#> # A tibble: 9 x 4 | |
#> trees min_n tree_depth .submodels | |
#> <int> <int> <int> <list> | |
#> 1 2000 10 2 <named list [1]> | |
#> 2 2000 15 2 <named list [1]> | |
#> 3 2000 20 2 <named list [1]> | |
#> 4 2000 10 8 <named list [1]> | |
#> 5 2000 15 8 <named list [1]> | |
#> 6 2000 20 8 <named list [1]> | |
#> 7 2000 10 15 <named list [1]> | |
#> 8 2000 15 15 <named list [1]> | |
#> 9 2000 20 15 <named list [1]> | |
smaller_grid$.submodels[[1]] | |
#> $trees | |
#> [1] 1 1000 | |
# non-regular grids | |
set.seed(35) | |
filler_grid <- grid_max_entropy(trees(), min_n(c(10, 20)), tree_depth(), size = 5) | |
min_grid(boosting_spec, filler_grid) | |
#> # A tibble: 4 x 4 | |
#> trees min_n tree_depth .submodels | |
#> <int> <int> <int> <list> | |
#> 1 1259 12 3 <named list [1]> | |
#> 2 330 11 11 <NULL> | |
#> 3 1831 19 12 <NULL> | |
#> 4 1342 11 14 <NULL> | |
# an engine that does not support submodels: | |
spark_spec <- boost_tree() %>% set_engine("spark") | |
min_grid(spark_spec, basic_grid) | |
#> # A tibble: 27 x 4 | |
#> trees min_n tree_depth .submodels | |
#> * <int> <int> <int> <list> | |
#> 1 1 10 2 <list [0]> | |
#> 2 1000 10 2 <list [0]> | |
#> 3 2000 10 2 <list [0]> | |
#> 4 1 15 2 <list [0]> | |
#> 5 1000 15 2 <list [0]> | |
#> 6 2000 15 2 <list [0]> | |
#> 7 1 20 2 <list [0]> | |
#> 8 1000 20 2 <list [0]> | |
#> 9 2000 20 2 <list [0]> | |
#> 10 1 10 8 <list [0]> | |
#> # … with 17 more rows | |
min_grid.nearest_neighbor_kknn <- function(x, grid, ...) { | |
arg_info <- | |
get_from_env(paste0(class(x)[1], "_args")) %>% | |
dplyr::filter(engine == x$engine & parsnip %in% names(grid)) | |
all_param <- arg_info$parsnip | |
# If no submodels, just give the grid back with an extra column | |
if (!any(arg_info$has_submodel)) { | |
return(blank_submodels(grid)) | |
} | |
# Get non-sub-model columns to iterate over | |
fixed_args <- arg_info$parsnip[!arg_info$has_submodel] | |
fixed_syms <- rlang::syms(fixed_args) | |
fit_only <- | |
grid %>% | |
dplyr::group_by(!!!fixed_syms) %>% | |
# could do any value of k but we'll go with the max | |
dplyr::summarize(neighbors = max(neighbors, na.rm = TRUE)) %>% | |
dplyr::ungroup() | |
min_grid_df <- | |
dplyr::full_join(fit_only %>% rename(max_neighbors = neighbors), grid, by = fixed_args) %>% | |
dplyr::filter(neighbors != max_neighbors) %>% | |
dplyr::group_by(!!!fixed_syms) %>% | |
dplyr::summarize(.submodels = list(list(neighbors = neighbors))) %>% | |
dplyr::ungroup() %>% | |
dplyr::full_join(fit_only, grid, by = fixed_args) | |
min_grid_df %>% dplyr::select(!!!all_param, .submodels) | |
} | |
knn_grid <- | |
grid_regular(neighbors(c(1, 10)), dist_power(), weight_func(), levels = 3) %>% | |
# just for kicks: | |
slice(-4) | |
min_grid.nearest_neighbor_kknn(nearest_neighbor() %>% set_engine("kknn"), knn_grid) | |
#> # A tibble: 9 x 4 | |
#> neighbors weight_func dist_power .submodels | |
#> <int> <chr> <dbl> <list> | |
#> 1 10 epanechnikov 1 <named list [1]> | |
#> 2 10 epanechnikov 1.5 <named list [1]> | |
#> 3 10 epanechnikov 2 <named list [1]> | |
#> 4 10 rectangular 1 <named list [1]> | |
#> 5 10 rectangular 1.5 <named list [1]> | |
#> 6 10 rectangular 2 <named list [1]> | |
#> 7 10 triangular 1 <named list [1]> | |
#> 8 10 triangular 1.5 <named list [1]> | |
#> 9 10 triangular 2 <named list [1]> | |
knn_grid2 <- grid_max_entropy(neighbors(c(1, 10)), dist_power(), weight_func(), size = 10) | |
min_grid.nearest_neighbor_kknn(nearest_neighbor() %>% set_engine("kknn"), knn_grid2) | |
#> # A tibble: 10 x 4 | |
#> neighbors weight_func dist_power .submodels | |
#> <int> <chr> <dbl> <list> | |
#> 1 2 biweight 1.04 <NULL> | |
#> 2 9 epanechnikov 1.10 <NULL> | |
#> 3 9 gaussian 1.06 <NULL> | |
#> 4 9 gaussian 1.78 <NULL> | |
#> 5 3 rank 1.12 <NULL> | |
#> 6 4 rank 1.81 <NULL> | |
#> 7 9 rectangular 1.51 <NULL> | |
#> 8 1 rectangular 1.89 <NULL> | |
#> 9 9 rectangular 1.95 <NULL> | |
#> 10 6 triweight 1.44 <NULL> | |
min_grid.mars_earth <- function(x, grid, ...) { | |
arg_info <- | |
get_from_env(paste0(class(x)[1], "_args")) %>% | |
dplyr::filter(engine == x$engine & parsnip %in% names(grid)) | |
all_param <- arg_info$parsnip | |
# If no submodels, just give the grid back with an extra column | |
if (!any(arg_info$has_submodel)) { | |
return(blank_submodels(grid)) | |
} | |
# Get non-sub-model columns to iterate over | |
fixed_args <- arg_info$parsnip[!arg_info$has_submodel] | |
fixed_syms <- rlang::syms(fixed_args) | |
fit_only <- | |
grid %>% | |
dplyr::group_by(!!!fixed_syms) %>% | |
dplyr::summarize(num_terms = max(num_terms, na.rm = TRUE)) %>% | |
dplyr::ungroup() | |
min_grid_df <- | |
dplyr::full_join(fit_only %>% rename(max_terms = num_terms), grid, by = fixed_args) %>% | |
dplyr::filter(num_terms != max_terms) %>% | |
dplyr::group_by(!!!fixed_syms) %>% | |
dplyr::summarize(.submodels = list(list(num_terms = num_terms))) %>% | |
dplyr::ungroup() %>% | |
dplyr::full_join(fit_only, grid, by = fixed_args) | |
min_grid_df %>% dplyr::select(!!!all_param, .submodels) | |
} | |
min_grid.linear_reg_glmnet <- function(x, grid, ...) { | |
arg_info <- | |
get_from_env(paste0(class(x)[1], "_args")) %>% | |
dplyr::filter(engine == x$engine & parsnip %in% names(grid)) | |
all_param <- arg_info$parsnip | |
# If no submodels, just give the grid back with an extra column | |
if (!any(arg_info$has_submodel)) { | |
return(blank_submodels(grid)) | |
} | |
fit_only <- | |
grid %>% | |
distinct(mixture) | |
# To get entire regularization path, we will set lambda = NULL. Since you | |
# can't have a column of NULLS in a data frame, we set to NA. | |
min_grid_df <- | |
grid %>% | |
group_by(mixture) %>% | |
nest(-mixture) %>% | |
ungroup() %>% | |
mutate( | |
penalty = NA_real_, # should really be set to NULL; fix in translate function | |
.submodels = map(data, pull, 1) | |
) %>% | |
dplyr::select(-data) | |
min_grid_df %>% dplyr::select(!!!all_param, .submodels) | |
} | |
min_grid.logistic_reg_glmnet <- min_grid.linear_reg_glmnet | |
xgb_full_grid <- | |
crossing(tree_depth = 10:11, min_n = 5:7, trees = 1:3) | |
xgb_full_exp <- | |
crossing(tree_depth = 10:11, min_n = 5:7, trees = 3) %>% | |
mutate(.submodels = map(trees, ~ 1:2)) %>% | |
dplyr::select(tree_depth, trees, min_n, .submodels) | |
xgb_full_res <- | |
min_grid.boost_tree_xgboost( | |
boost_tree() %>% set_engine("xgboost"), | |
xgb_full_grid | |
) | |
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("xgboost"), : could not find function "min_grid.boost_tree_xgboost" | |
# Need to use vctrs: | |
# Error: Can't join on '.submodels' x '.submodels' because of incompatible types (list / list) | |
# all_equal(xgb_full_res, xgb_full_exp) | |
# re-use test_by_col() here | |
xgb_smol_grid <- | |
crossing(tree_depth = 10:11, min_n = 5:7) | |
xgb_full_exp <- | |
crossing(tree_depth = 10:11, min_n = 5:7) %>% | |
mutate(.submodels = map(tree_depth, ~ list())) | |
min_grid.boost_tree_xgboost( | |
boost_tree() %>% set_engine("xgboost"), | |
xgb_smol_grid | |
) | |
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("xgboost"), : could not find function "min_grid.boost_tree_xgboost" | |
c5_full_grid <- | |
crossing(min_n = 5:7, trees = 1:3) | |
c5_full_exp <- | |
crossing(min_n = 5:7, trees = 3) %>% | |
mutate(.submodels = map(trees, ~ 1:2)) %>% | |
dplyr::select(trees, min_n, .submodels) | |
c5_full_res <- | |
min_grid.boost_tree_xgboost( | |
boost_tree() %>% set_engine("C5.0"), | |
c5_full_grid | |
) | |
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("C5.0"), : could not find function "min_grid.boost_tree_xgboost" | |
c5_smol_grid <- | |
crossing(min_n = 5:7) | |
c5_full_exp <- | |
crossing(min_n = 5:7) %>% | |
mutate(.submodels = map(min_n, ~ list())) | |
min_grid.boost_tree_xgboost( | |
boost_tree() %>% set_engine("C5.0"), | |
c5_smol_grid | |
) | |
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("C5.0"), : could not find function "min_grid.boost_tree_xgboost" | |
earth_full_grid <- | |
crossing(num_terms = 10:11, prod_degree = 1:2, prune_method = c("none", "backward")) | |
earth_full_exp <- | |
crossing(num_terms = 11, prod_degree = 1:2, prune_method = c("none", "backward")) %>% | |
mutate(.submodels = map(prod_degree, ~ 10)) %>% | |
dplyr::select(num_terms, prod_degree, prune_method, .submodels) | |
earth_full_res <- | |
min_grid.mars_earth( | |
mars() %>% set_engine("earth"), | |
earth_full_grid | |
) | |
glmn_full_grid <- | |
crossing(mixture = (0:3)/3, penalty = c(0, .1, 1)) | |
glmn_full_exp <- | |
crossing(penalty = NA_real_, mixture = (0:3)/3) %>% | |
mutate(.submodels = map(mixture, ~ c(0, .1, 1))) | |
glmn_full_res <- | |
min_grid.linear_reg_glmnet( | |
linear_reg() %>% set_engine("glmnet"), | |
glmn_full_grid | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment