Skip to content

Instantly share code, notes, and snippets.

@stufield
Last active June 4, 2019 21:17
Show Gist options
  • Select an option

  • Save stufield/1864c45db9e2f396d89a42f85d64153c to your computer and use it in GitHub Desktop.

Select an option

Save stufield/1864c45db9e2f396d89a42f85d64153c to your computer and use it in GitHub Desktop.

Reprex output recipes missing variables


If variables are present in the test set

that were absent from the training/recipe set,

they are silently dropped from the prepped test set.

This may not suit all use cases.

The reprex output (see *.R file for raw reprex)


# Set the console line width to 100 characters for the printed output
options(width = 100)
# Attach the packages used throughout this reprex
library(dplyr)
library(tibble)
library(recipes)
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#> 
#>     step
set.seed(1)                               # reproducible
# Report the installed recipes version
packageVersion("recipes")
#> [1] '0.1.4'

Setup: create binary class data

# Two-class subset of iris, with a temporary row id used to split the data
data <- as_tibble(iris) %>%
  filter(Species != "versicolor") %>%
  mutate(id = row_number())

# Draw 80% of the rows at random for training
train <- sample_frac(data, size = 0.8)

# The rows whose id is not in train form the test set; id is then dropped
test <- select(anti_join(data, train, by = "id"), -id)

# The id column is no longer needed in train either
train <- train %>% dplyr::select(-id)

Create simple recipe: center, scale

# Build the recipe one step at a time: declare the model formula, add the
# centering and scaling steps for every predictor, then train on `train`
recp <- recipe(Species ~ ., data = train)
recp <- step_center(recp, all_predictors())
recp <- step_scale(recp, all_predictors())
recp <- prep(recp, training = train)

# Apply the trained recipe back to the training data
train <- recp %>% bake(new_data = train)
names(train)
#> [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

A future test set may have additional sample information that we want to track but that was unrelated to the training set

# Attach three extra tracking columns (pid, state, diabetes) to the test set.
# None of them exist in the data the recipe was defined on.
test <- test %>%
  mutate(
    # seq_len() rather than 1:nrow(.): 1:0 would yield c(1, 0) on an empty set
    pid = sample(seq_len(nrow(.))),
    state = sample(c("CO", "WY", "NY", "LA"), nrow(.), replace = TRUE),
    diabetes = sample(c("Y", "N"), nrow(.), replace = TRUE)
  )
head(test, 10)
#> # A tibble: 10 x 8
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species     pid state diabetes
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>     <int> <chr> <chr>   
#>  1          5           3.4          1.5         0.2 setosa        5 LA    N       
#>  2          5.8         4            1.2         0.2 setosa        2 NY    Y       
#>  3          5.2         4.1          1.5         0.1 setosa       12 WY    Y       
#>  4          4.6         3.2          1.4         0.2 setosa       15 WY    N       
#>  5          5.8         2.7          5.1         1.9 virginica    13 CO    N       
#>  6          6.3         2.9          5.6         1.8 virginica    18 CO    N       
#>  7          7.6         3            6.6         2.1 virginica     7 NY    N       
#>  8          6.4         3.2          5.3         2.3 virginica     6 CO    N       
#>  9          7.7         3.8          6.7         2.2 virginica    10 WY    N       
#> 10          5.6         2.8          4.9         2   virginica    14 NY    N

Pre-process the “new” test data set

# Run the trained recipe over the extended test set
test_prep <- recp %>% bake(new_data = test)

pid, state, and diabetes are missing after data prep/baking because they were not present in the original training recipe

# Preview the first 10 baked rows
test_prep %>% head(10)
#> # A tibble: 10 x 5
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
#>  1      -0.720       0.439        -0.824      -0.890 setosa   
#>  2       0.133       1.85         -0.968      -0.890 setosa   
#>  3      -0.506       2.09         -0.824      -1.00  setosa   
#>  4      -1.15       -0.0324       -0.872      -0.890 setosa   
#>  5       0.133      -1.21          0.901       0.998 virginica
#>  6       0.666      -0.740         1.14        0.887 virginica
#>  7       2.05       -0.504         1.62        1.22  virginica
#>  8       0.773      -0.0324        0.997       1.44  virginica
#>  9       2.16        1.38          1.67        1.33  virginica
#> 10      -0.0799     -0.976         0.805       1.11  virginica

Workaround: simply bind the variables back on after baking. This is not ideal: it could be error-prone, and it breaks the ‘tidy’ data frame philosophy

# Re-attach the three tracking columns by position, then preview 10 rows
head(bind_cols(test_prep, select(test, pid, state, diabetes)), 10)
#> # A tibble: 10 x 8
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species     pid state diabetes
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>     <int> <chr> <chr>   
#>  1      -0.720       0.439        -0.824      -0.890 setosa        5 LA    N       
#>  2       0.133       1.85         -0.968      -0.890 setosa        2 NY    Y       
#>  3      -0.506       2.09         -0.824      -1.00  setosa       12 WY    Y       
#>  4      -1.15       -0.0324       -0.872      -0.890 setosa       15 WY    N       
#>  5       0.133      -1.21          0.901       0.998 virginica    13 CO    N       
#>  6       0.666      -0.740         1.14        0.887 virginica    18 CO    N       
#>  7       2.05       -0.504         1.62        1.22  virginica     7 NY    N       
#>  8       0.773      -0.0324        0.997       1.44  virginica     6 CO    N       
#>  9       2.16        1.38          1.67        1.33  virginica    10 WY    N       
#> 10      -0.0799     -0.976         0.805       1.11  virginica    14 NY    N

Created on 2019-06-04 by the reprex package (v0.2.1)

Session info
# Print the R version, platform settings, and package versions for this run
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 3.5.2 (2018-12-20)
#>  os       macOS Mojave 10.14.5        
#>  system   x86_64, darwin15.6.0        
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/Denver              
#>  date     2019-06-04                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────
#>  package     * version    date       lib source             
#>  assertthat    0.2.0      2017-04-11 [1] CRAN (R 3.5.0)     
#>  backports     1.1.3      2018-12-14 [1] CRAN (R 3.5.0)     
#>  callr         3.1.1      2018-12-21 [1] CRAN (R 3.5.0)     
#>  class         7.3-15     2019-01-01 [2] CRAN (R 3.5.2)     
#>  cli           1.1.0      2019-03-19 [1] RSPM (R 3.5.2)     
#>  crayon        1.3.4      2017-09-16 [1] RSPM (R 3.5.2)     
#>  desc          1.2.0      2018-05-01 [1] CRAN (R 3.5.0)     
#>  devtools      2.0.1      2018-10-26 [1] CRAN (R 3.5.1)     
#>  digest        0.6.18     2018-10-10 [1] CRAN (R 3.5.0)     
#>  dplyr       * 0.8.0.1    2019-02-15 [1] RSPM (R 3.5.2)     
#>  evaluate      0.13       2019-02-12 [1] CRAN (R 3.5.2)     
#>  fansi         0.4.0      2018-10-05 [1] CRAN (R 3.5.0)     
#>  fs            1.2.6      2018-08-23 [1] CRAN (R 3.5.0)     
#>  generics      0.0.2      2018-11-29 [1] CRAN (R 3.5.0)     
#>  glue          1.3.1      2019-03-12 [1] CRAN (R 3.5.2)     
#>  gower         0.1.2      2017-02-23 [1] CRAN (R 3.5.0)     
#>  highr         0.7        2018-06-09 [1] CRAN (R 3.5.0)     
#>  htmltools     0.3.6      2017-04-28 [1] CRAN (R 3.5.0)     
#>  ipred         0.9-8      2018-11-05 [1] CRAN (R 3.5.0)     
#>  knitr         1.22       2019-03-08 [1] CRAN (R 3.5.2)     
#>  lattice       0.20-38    2018-11-04 [1] CRAN (R 3.5.0)     
#>  lava          1.6.5      2019-02-12 [1] CRAN (R 3.5.2)     
#>  lubridate     1.7.4      2018-04-11 [1] CRAN (R 3.5.0)     
#>  magrittr      1.5        2014-11-22 [1] RSPM (R 3.5.2)     
#>  MASS          7.3-51.4   2019-03-31 [1] standard (@7.3-51.)
#>  Matrix        1.2-17     2019-03-22 [1] standard (@1.2-17) 
#>  memoise       1.1.0      2017-04-21 [1] CRAN (R 3.5.0)     
#>  nnet          7.3-12     2016-02-02 [2] CRAN (R 3.5.2)     
#>  pillar        1.3.1      2018-12-15 [1] CRAN (R 3.5.0)     
#>  pkgbuild      1.0.2      2018-10-16 [1] CRAN (R 3.5.0)     
#>  pkgconfig     2.0.2      2018-08-16 [1] CRAN (R 3.5.0)     
#>  pkgload       1.0.2      2018-10-29 [1] CRAN (R 3.5.0)     
#>  prettyunits   1.0.2      2015-07-13 [1] CRAN (R 3.5.0)     
#>  processx      3.2.1      2018-12-05 [1] CRAN (R 3.5.0)     
#>  prodlim       2018.04.18 2018-04-18 [1] CRAN (R 3.5.0)     
#>  ps            1.3.0      2018-12-21 [1] CRAN (R 3.5.0)     
#>  purrr         0.3.2      2019-03-15 [1] RSPM (R 3.5.2)     
#>  R6            2.4.0      2019-02-14 [1] CRAN (R 3.5.2)     
#>  Rcpp          1.0.0      2018-11-07 [1] CRAN (R 3.5.0)     
#>  recipes     * 0.1.4      2018-11-19 [1] CRAN (R 3.5.2)     
#>  remotes       2.0.2      2018-10-30 [1] CRAN (R 3.5.0)     
#>  rlang         0.3.4      2019-04-07 [1] RSPM (R 3.5.2)     
#>  rmarkdown     1.11       2018-12-08 [1] CRAN (R 3.5.0)     
#>  rpart         4.1-13     2018-02-23 [2] CRAN (R 3.5.2)     
#>  rprojroot     1.3-2      2018-01-03 [1] CRAN (R 3.5.0)     
#>  sessioninfo   1.1.1      2018-11-05 [1] CRAN (R 3.5.0)     
#>  stringi       1.4.3      2019-03-12 [1] CRAN (R 3.5.2)     
#>  stringr       1.4.0      2019-02-10 [1] RSPM (R 3.5.2)     
#>  survival      2.43-3     2018-11-26 [2] CRAN (R 3.5.2)     
#>  testthat      2.0.0      2017-12-13 [1] CRAN (R 3.5.2)     
#>  tibble      * 2.1.1      2019-03-16 [1] RSPM (R 3.5.2)     
#>  tidyr         0.8.3      2019-03-01 [1] RSPM (R 3.5.2)     
#>  tidyselect    0.2.5      2018-10-11 [1] CRAN (R 3.5.0)     
#>  timeDate      3043.102   2018-02-21 [1] CRAN (R 3.5.0)     
#>  usethis       1.5.0      2019-04-07 [1] CRAN (R 3.5.2)     
#>  utf8          1.1.4      2018-05-24 [1] CRAN (R 3.5.0)     
#>  withr         2.1.2      2018-03-15 [1] CRAN (R 3.5.0)     
#>  xfun          0.5        2019-02-20 [1] CRAN (R 3.5.2)     
#>  yaml          2.2.0      2018-07-25 [1] CRAN (R 3.5.0)     
#> 
#> [1] /Users/sfield/r_libs
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library
# Raw reprex source. Shows that bake() returns only the columns known to the
# recipe, so extra columns added to the test set afterwards are dropped.
# Top-level statements and #' prose lines are kept at column 0 inside the
# braces; only pipeline continuations are indented.
reprex::reprex({
options(width = 100)
library(dplyr)
library(tibble)
library(recipes)
set.seed(1) # reproducible
packageVersion("recipes")
#' Setup: create binary class data
data <- iris %>%
  filter(Species != "versicolor") %>% # binary; rm 1 class
  mutate(id = row_number()) %>% # set up temp variable for the join
  as_tibble()
train <- data %>% sample_frac(size = 0.8) # random selection of rows @ 80%
test <- data %>%
  anti_join(train, by = "id") %>% # use anti_join to get the sample setdiff
  select(-id) # remove id
train <- dplyr::select(train, -id) # rm id
#' Create simple recipe: center, scale
recp <- recipe(Species ~ ., data = train) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  prep(training = train)
train <- bake(recp, new_data = train)
names(train)
#' Future test set may have additional sample information we want to track
#' that were unrelated to the training set
test <- test %>%
  mutate(
    pid = sample(seq_len(nrow(.))), # seq_len() is safe for zero rows; 1:0 is not
    state = sample(c("CO", "WY", "NY", "LA"), nrow(.), replace = TRUE),
    diabetes = sample(c("Y", "N"), nrow(.), replace = TRUE)
  )
head(test, 10)
#' Pre-process the "new" test data set
test_prep <- bake(recp, new_data = test)
#' pid, state, and diabetes missing following data prep/baking
#' because they were not present in the original training recipe
head(test_prep, 10)
#' Workaround; simply bind the variables back on after baking;
#' Not ideal, could be error prone, breaks 'tidy' data frame philosophy
bind_cols(test_prep, select(test, pid, state, diabetes)) %>% head(10)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment