options(width = 100)
library(dplyr)
library(tibble)
library(recipes)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
# Fix the RNG seed so the random train/test split and the randomly sampled
# columns further down are reproducible.
set.seed(1) # reproducible
# Print the installed version of recipes.
packageVersion("recipes")
#> [1] '0.1.4'
# Setup: create binary class data
# Drop one of the three iris species so the outcome is binary, and add a
# temporary row id that the train/test split below uses to find the rows
# that were not sampled.
data <- iris %>%
  filter(Species != "versicolor") %>%
  mutate(id = row_number()) %>%
  as_tibble()

# Randomly draw 80% of the rows for training.
train <- sample_frac(data, size = 0.8)

# Rows that were not drawn form the test set. The id column is only needed
# for the anti-join, so it is removed from both sets afterwards.
test <- select(anti_join(data, train, by = "id"), -id)
train <- select(train, -id)

# Create simple recipe: center, scale
# Build the recipe one step at a time: Species is the outcome and every other
# column is a predictor; predictors are centered, then scaled.
recp <- recipe(Species ~ ., data = train)
recp <- step_center(recp, all_predictors())
recp <- step_scale(recp, all_predictors())

# Prepare the recipe on the training data.
recp <- prep(recp, training = train)

# Replace the raw training set with its baked version and list the columns
# that come out of the recipe.
train <- recp %>% bake(new_data = train)
names(train)
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# A future test set may have additional sample information that we want to
# track but that is unrelated to the training set
# Add three columns to the test set that were never part of the training
# data: a shuffled id (pid), a randomly drawn state code, and a randomly
# drawn diabetes flag.
#
# seq_len(n()) is used instead of 1:nrow(.): it yields an empty sequence for
# a zero-row input (1:0 would give c(1, 0)), and n() does not rely on the
# magrittr `.` placeholder inside mutate(). For non-empty input the random
# draws are identical to the 1:nrow(.) form.
test <- test %>%
  mutate(
    pid = sample(seq_len(n())),
    state = sample(c("CO", "WY", "NY", "LA"), n(), replace = TRUE),
    diabetes = sample(c("Y", "N"), n(), replace = TRUE)
  )

# Preview the first ten rows, including the three new columns.
head(test, 10)
#> # A tibble: 10 x 8
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species pid state diabetes
#> <dbl> <dbl> <dbl> <dbl> <fct> <int> <chr> <chr>
#> 1 5 3.4 1.5 0.2 setosa 5 LA N
#> 2 5.8 4 1.2 0.2 setosa 2 NY Y
#> 3 5.2 4.1 1.5 0.1 setosa 12 WY Y
#> 4 4.6 3.2 1.4 0.2 setosa 15 WY N
#> 5 5.8 2.7 5.1 1.9 virginica 13 CO N
#> 6 6.3 2.9 5.6 1.8 virginica 18 CO N
#> 7 7.6 3 6.6 2.1 virginica 7 NY N
#> 8 6.4 3.2 5.3 2.3 virginica 6 CO N
#> 9 7.7 3.8 6.7 2.2 virginica 10 WY N
#> 10 5.6 2.8 4.9 2 virginica 14 NY N
# Pre-process the “new” test data set
# Apply the prepared recipe to the test set.
test_prep <- bake(recp, new_data = test)

# pid, state, and diabetes are missing following data prep/baking because
# they were not present in the original training recipe.
head(test_prep, 10)
#> # A tibble: 10 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 -0.720 0.439 -0.824 -0.890 setosa
#> 2 0.133 1.85 -0.968 -0.890 setosa
#> 3 -0.506 2.09 -0.824 -1.00 setosa
#> 4 -1.15 -0.0324 -0.872 -0.890 setosa
#> 5 0.133 -1.21 0.901 0.998 virginica
#> 6 0.666 -0.740 1.14 0.887 virginica
#> 7 2.05 -0.504 1.62 1.22 virginica
#> 8 0.773 -0.0324 0.997 1.44 virginica
#> 9 2.16 1.38 1.67 1.33 virginica
#> 10 -0.0799 -0.976 0.805 1.11 virginica
# Workaround: simply bind the variables back on after baking. Not ideal: it
# could be error-prone and breaks the ‘tidy’ data frame philosophy.
# Re-attach pid, state, and diabetes to the baked data by row position, then
# preview the first ten rows.
test_prep %>%
  bind_cols(select(test, pid, state, diabetes)) %>%
  head(10)
#> # A tibble: 10 x 8
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species pid state diabetes
#> <dbl> <dbl> <dbl> <dbl> <fct> <int> <chr> <chr>
#> 1 -0.720 0.439 -0.824 -0.890 setosa 5 LA N
#> 2 0.133 1.85 -0.968 -0.890 setosa 2 NY Y
#> 3 -0.506 2.09 -0.824 -1.00 setosa 12 WY Y
#> 4 -1.15 -0.0324 -0.872 -0.890 setosa 15 WY N
#> 5 0.133 -1.21 0.901 0.998 virginica 13 CO N
#> 6 0.666 -0.740 1.14 0.887 virginica 18 CO N
#> 7 2.05 -0.504 1.62 1.22 virginica 7 NY N
#> 8 0.773 -0.0324 0.997 1.44 virginica 6 CO N
#> 9 2.16 1.38 1.67 1.33 virginica 10 WY N
#> 10 -0.0799 -0.976 0.805 1.11 virginica 14 NY N
# Created on 2019-06-04 by the reprex package (v0.2.1)
Session info
# Print the session details (R version, platform, and package versions).
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────
#> setting value
#> version R version 3.5.2 (2018-12-20)
#> os macOS Mojave 10.14.5
#> system x86_64, darwin15.6.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/Denver
#> date 2019-06-04
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.0 2017-04-11 [1] CRAN (R 3.5.0)
#> backports 1.1.3 2018-12-14 [1] CRAN (R 3.5.0)
#> callr 3.1.1 2018-12-21 [1] CRAN (R 3.5.0)
#> class 7.3-15 2019-01-01 [2] CRAN (R 3.5.2)
#> cli 1.1.0 2019-03-19 [1] RSPM (R 3.5.2)
#> crayon 1.3.4 2017-09-16 [1] RSPM (R 3.5.2)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 3.5.0)
#> devtools 2.0.1 2018-10-26 [1] CRAN (R 3.5.1)
#> digest 0.6.18 2018-10-10 [1] CRAN (R 3.5.0)
#> dplyr * 0.8.0.1 2019-02-15 [1] RSPM (R 3.5.2)
#> evaluate 0.13 2019-02-12 [1] CRAN (R 3.5.2)
#> fansi 0.4.0 2018-10-05 [1] CRAN (R 3.5.0)
#> fs 1.2.6 2018-08-23 [1] CRAN (R 3.5.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 3.5.0)
#> glue 1.3.1 2019-03-12 [1] CRAN (R 3.5.2)
#> gower 0.1.2 2017-02-23 [1] CRAN (R 3.5.0)
#> highr 0.7 2018-06-09 [1] CRAN (R 3.5.0)
#> htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.5.0)
#> ipred 0.9-8 2018-11-05 [1] CRAN (R 3.5.0)
#> knitr 1.22 2019-03-08 [1] CRAN (R 3.5.2)
#> lattice 0.20-38 2018-11-04 [1] CRAN (R 3.5.0)
#> lava 1.6.5 2019-02-12 [1] CRAN (R 3.5.2)
#> lubridate 1.7.4 2018-04-11 [1] CRAN (R 3.5.0)
#> magrittr 1.5 2014-11-22 [1] RSPM (R 3.5.2)
#> MASS 7.3-51.4 2019-03-31 [1] standard (@7.3-51.)
#> Matrix 1.2-17 2019-03-22 [1] standard (@1.2-17)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 3.5.0)
#> nnet 7.3-12 2016-02-02 [2] CRAN (R 3.5.2)
#> pillar 1.3.1 2018-12-15 [1] CRAN (R 3.5.0)
#> pkgbuild 1.0.2 2018-10-16 [1] CRAN (R 3.5.0)
#> pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.5.0)
#> pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.5.0)
#> prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.5.0)
#> processx 3.2.1 2018-12-05 [1] CRAN (R 3.5.0)
#> prodlim 2018.04.18 2018-04-18 [1] CRAN (R 3.5.0)
#> ps 1.3.0 2018-12-21 [1] CRAN (R 3.5.0)
#> purrr 0.3.2 2019-03-15 [1] RSPM (R 3.5.2)
#> R6 2.4.0 2019-02-14 [1] CRAN (R 3.5.2)
#> Rcpp 1.0.0 2018-11-07 [1] CRAN (R 3.5.0)
#> recipes * 0.1.4 2018-11-19 [1] CRAN (R 3.5.2)
#> remotes 2.0.2 2018-10-30 [1] CRAN (R 3.5.0)
#> rlang 0.3.4 2019-04-07 [1] RSPM (R 3.5.2)
#> rmarkdown 1.11 2018-12-08 [1] CRAN (R 3.5.0)
#> rpart 4.1-13 2018-02-23 [2] CRAN (R 3.5.2)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.5.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.0)
#> stringi 1.4.3 2019-03-12 [1] CRAN (R 3.5.2)
#> stringr 1.4.0 2019-02-10 [1] RSPM (R 3.5.2)
#> survival 2.43-3 2018-11-26 [2] CRAN (R 3.5.2)
#> testthat 2.0.0 2017-12-13 [1] CRAN (R 3.5.2)
#> tibble * 2.1.1 2019-03-16 [1] RSPM (R 3.5.2)
#> tidyr 0.8.3 2019-03-01 [1] RSPM (R 3.5.2)
#> tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.5.0)
#> timeDate 3043.102 2018-02-21 [1] CRAN (R 3.5.0)
#> usethis 1.5.0 2019-04-07 [1] CRAN (R 3.5.2)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 3.5.0)
#> withr 2.1.2 2018-03-15 [1] CRAN (R 3.5.0)
#> xfun 0.5 2019-02-20 [1] CRAN (R 3.5.2)
#> yaml 2.2.0 2018-07-25 [1] CRAN (R 3.5.0)
#>
#> [1] /Users/sfield/r_libs
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library