Skip to content

Instantly share code, notes, and snippets.

@juliasilge
Created August 26, 2020 02:50
Show Gist options
  • Select an option

  • Save juliasilge/72ec225f115d707cf60e4e721ec272d7 to your computer and use it in GitHub Desktop.

Select an option

Save juliasilge/72ec225f115d707cf60e4e721ec272d7 to your computer and use it in GitHub Desktop.
When do bake() and juice() skip?
library(tidymodels)
# Load the `ames` dataset into the session.
data(ames)

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(833961)

# Train/test split, stratified on the outcome. The proportion argument of
# initial_split() is `prop`; the earlier `prob = 0.80` matched no formal
# argument, so it was absorbed by `...` and the default 3/4 split was used
# (newer rsample releases may reject the unused argument outright).
# `prop = 3 / 4` names the split that actually produced the 2,199 / 731 row
# counts in the recorded output below.
ames_split <- initial_split(ames, prop = 3 / 4, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# Preprocessing recipe built on the training set: Sale_Price is modelled from
# four predictors.
ames_rec <- recipe(
  Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
  data = ames_train
)
# log10-transform the outcome; skip = TRUE marks this step to be skipped by
# bake(), which is the behaviour the calls below demonstrate.
ames_rec <- step_log(ames_rec, Sale_Price, base = 10, skip = TRUE)
# Dummy-encode every nominal column selected by all_nominal().
ames_rec <- step_dummy(ames_rec, all_nominal())


## skipped: bake() on the training data does not apply the skip = TRUE log
## step, so Sale_Price stays on its original scale
ames_rec %>%
  prep() %>%
  bake(new_data = ames_train)
#> # A tibble: 2,199 x 35
#>    Gr_Liv_Area Year_Built Sale_Price Neighborhood_Co… Neighborhood_Ol…
#>          <int>      <int>      <int>            <dbl>            <dbl>
#>  1        1656       1960     215000                0                0
#>  2         896       1961     105000                0                0
#>  3        1329       1958     172000                0                0
#>  4        1604       1998     195500                0                0
#>  5        1338       2001     213500                0                0
#>  6        1280       1992     191500                0                0
#>  7        1616       1995     236500                0                0
#>  8        1804       1999     189000                0                0
#>  9        1655       1993     175900                0                0
#> 10        1187       1992     185000                0                0
#> # … with 2,189 more rows, and 30 more variables: Neighborhood_Edwards <dbl>,
#> #   Neighborhood_Somerset <dbl>, Neighborhood_Northridge_Heights <dbl>,
#> #   Neighborhood_Gilbert <dbl>, Neighborhood_Sawyer <dbl>,
#> #   Neighborhood_Northwest_Ames <dbl>, Neighborhood_Sawyer_West <dbl>,
#> #   Neighborhood_Mitchell <dbl>, Neighborhood_Brookside <dbl>,
#> #   Neighborhood_Crawford <dbl>, Neighborhood_Iowa_DOT_and_Rail_Road <dbl>,
#> #   Neighborhood_Timberland <dbl>, Neighborhood_Northridge <dbl>,
#> #   Neighborhood_Stone_Brook <dbl>,
#> #   Neighborhood_South_and_West_of_Iowa_State_University <dbl>,
#> #   Neighborhood_Clear_Creek <dbl>, Neighborhood_Meadow_Village <dbl>,
#> #   Neighborhood_Briardale <dbl>, Neighborhood_Bloomington_Heights <dbl>,
#> #   Neighborhood_Veenker <dbl>, Neighborhood_Northpark_Villa <dbl>,
#> #   Neighborhood_Blueste <dbl>, Neighborhood_Greens <dbl>,
#> #   Neighborhood_Green_Hills <dbl>, Neighborhood_Landmark <dbl>,
#> #   Neighborhood_Hayden_Lake <dbl>, Bldg_Type_TwoFmCon <dbl>,
#> #   Bldg_Type_Duplex <dbl>, Bldg_Type_Twnhs <dbl>, Bldg_Type_TwnhsE <dbl>

## not skipped: juice() returns the training data as processed by prep(),
## where the skip = TRUE log step was applied, so Sale_Price is on the log10
## scale
ames_rec %>%
  prep() %>%
  juice()
#> # A tibble: 2,199 x 35
#>    Gr_Liv_Area Year_Built Sale_Price Neighborhood_Co… Neighborhood_Ol…
#>          <int>      <int>      <dbl>            <dbl>            <dbl>
#>  1        1656       1960       5.33                0                0
#>  2         896       1961       5.02                0                0
#>  3        1329       1958       5.24                0                0
#>  4        1604       1998       5.29                0                0
#>  5        1338       2001       5.33                0                0
#>  6        1280       1992       5.28                0                0
#>  7        1616       1995       5.37                0                0
#>  8        1804       1999       5.28                0                0
#>  9        1655       1993       5.25                0                0
#> 10        1187       1992       5.27                0                0
#> # … with 2,189 more rows, and 30 more variables: Neighborhood_Edwards <dbl>,
#> #   Neighborhood_Somerset <dbl>, Neighborhood_Northridge_Heights <dbl>,
#> #   Neighborhood_Gilbert <dbl>, Neighborhood_Sawyer <dbl>,
#> #   Neighborhood_Northwest_Ames <dbl>, Neighborhood_Sawyer_West <dbl>,
#> #   Neighborhood_Mitchell <dbl>, Neighborhood_Brookside <dbl>,
#> #   Neighborhood_Crawford <dbl>, Neighborhood_Iowa_DOT_and_Rail_Road <dbl>,
#> #   Neighborhood_Timberland <dbl>, Neighborhood_Northridge <dbl>,
#> #   Neighborhood_Stone_Brook <dbl>,
#> #   Neighborhood_South_and_West_of_Iowa_State_University <dbl>,
#> #   Neighborhood_Clear_Creek <dbl>, Neighborhood_Meadow_Village <dbl>,
#> #   Neighborhood_Briardale <dbl>, Neighborhood_Bloomington_Heights <dbl>,
#> #   Neighborhood_Veenker <dbl>, Neighborhood_Northpark_Villa <dbl>,
#> #   Neighborhood_Blueste <dbl>, Neighborhood_Greens <dbl>,
#> #   Neighborhood_Green_Hills <dbl>, Neighborhood_Landmark <dbl>,
#> #   Neighborhood_Hayden_Lake <dbl>, Bldg_Type_TwoFmCon <dbl>,
#> #   Bldg_Type_Duplex <dbl>, Bldg_Type_Twnhs <dbl>, Bldg_Type_TwnhsE <dbl>

## skipped: bake() on the test data also leaves out the skip = TRUE log step,
## so Sale_Price stays on its original scale
ames_rec %>%
  prep() %>%
  bake(new_data = ames_test)
#> # A tibble: 731 x 35
#>    Gr_Liv_Area Year_Built Sale_Price Neighborhood_Co… Neighborhood_Ol…
#>          <int>      <int>      <int>            <dbl>            <dbl>
#>  1        2110       1968     244000                0                0
#>  2        1629       1997     189900                0                0
#>  3        1341       1990     171500                0                0
#>  4        1856       2010     394432                0                0
#>  5        1844       1977     190000                0                0
#>  6        1173       1974     170000                0                0
#>  7        1056       1968     142000                0                0
#>  8         864       1971     115000                0                0
#>  9        1704       2007     306000                0                0
#> 10        1822       2005     259000                0                0
#> # … with 721 more rows, and 30 more variables: Neighborhood_Edwards <dbl>,
#> #   Neighborhood_Somerset <dbl>, Neighborhood_Northridge_Heights <dbl>,
#> #   Neighborhood_Gilbert <dbl>, Neighborhood_Sawyer <dbl>,
#> #   Neighborhood_Northwest_Ames <dbl>, Neighborhood_Sawyer_West <dbl>,
#> #   Neighborhood_Mitchell <dbl>, Neighborhood_Brookside <dbl>,
#> #   Neighborhood_Crawford <dbl>, Neighborhood_Iowa_DOT_and_Rail_Road <dbl>,
#> #   Neighborhood_Timberland <dbl>, Neighborhood_Northridge <dbl>,
#> #   Neighborhood_Stone_Brook <dbl>,
#> #   Neighborhood_South_and_West_of_Iowa_State_University <dbl>,
#> #   Neighborhood_Clear_Creek <dbl>, Neighborhood_Meadow_Village <dbl>,
#> #   Neighborhood_Briardale <dbl>, Neighborhood_Bloomington_Heights <dbl>,
#> #   Neighborhood_Veenker <dbl>, Neighborhood_Northpark_Villa <dbl>,
#> #   Neighborhood_Blueste <dbl>, Neighborhood_Greens <dbl>,
#> #   Neighborhood_Green_Hills <dbl>, Neighborhood_Landmark <dbl>,
#> #   Neighborhood_Hayden_Lake <dbl>, Bldg_Type_TwoFmCon <dbl>,
#> #   Bldg_Type_Duplex <dbl>, Bldg_Type_Twnhs <dbl>, Bldg_Type_TwnhsE <dbl>

Created on 2020-08-25 by the reprex package (v0.3.0.9001)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment