
@topepo
Created October 24, 2023 17:10
A simple comparison of two models with different feature set approaches
library(tidymodels)
library(doMC)
# ------------------------------------------------------------------------------
tidymodels_prefer()
theme_set(theme_bw())
options(pillar.advice = FALSE, pillar.min_title_chars = Inf)
registerDoMC(cores = parallel::detectCores())
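# Note: doMC uses forking and is unavailable on Windows; there,
# doParallel::registerDoParallel() is the usual substitute.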
# ------------------------------------------------------------------------------
data("Chicago")
n <- nrow(Chicago)
p <- 1 - (14/n)
Chicago <-
  Chicago %>%
  select(ridership, date)
init_split <- initial_time_split(Chicago, prop = p)
chi_train <- training(init_split)
chi_test <- testing(init_split)
# ------------------------------------------------------------------------------
# The effects in the data are almost completely driven by the day-of-the-week
# and holidays. For example:
plot_start <- 1050
chi_train %>%
  mutate(
    day = lubridate::wday(date, label = TRUE),
    day = factor(day, ordered = FALSE)
  ) %>%
  slice(plot_start:(plot_start + 35)) %>%
  ggplot(aes(date, ridership)) +
  geom_point(aes(col = day), cex = 2) +
  geom_line(alpha = .2)
# ------------------------------------------------------------------------------
chi_rs <-
  chi_train %>%
  sliding_period(
    index = date,
    period = "week",
    lookback = 12 * 52,
    assess_stop = 2,
    step = 2
  )
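# A quick, optional look at the resampling scheme: each analysis set should
# span roughly 12 * 52 weeks and each assessment set two weeks.
chi_rs
range(analysis(chi_rs$splits[[1]])$date)
range(assessment(chi_rs$splits[[1]])$date)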
# ------------------------------------------------------------------------------
# Use a handful of date features to get good performance
lm_rec <-
  recipe(ridership ~ date, data = chi_train) %>%
  step_date(date) %>%
  step_holiday(date) %>%
  update_role(date, new_role = "date id")
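# Optionally preview the engineered features; step_date() and step_holiday()
# add day-of-week, month, year, and holiday indicator columns.
lm_rec %>% prep() %>% bake(new_data = NULL) %>% glimpse()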
lm_res <-
  linear_reg() %>%
  fit_resamples(lm_rec, chi_rs)
collect_metrics(lm_res)
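# Per-resample metrics are also available without summarizing:
collect_metrics(lm_res, summarize = FALSE)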
# ------------------------------------------------------------------------------
# Convert data to integers and let the network figure it out.
nnet_rec <-
  recipe(ridership ~ date, data = chi_train) %>%
  step_mutate(day_index = as.numeric(date)) %>%
  step_normalize(day_index) %>%
  update_role(date, new_role = "date id")
nnet_spec <-
  mlp(
    hidden_units = tune(),
    penalty = tune(),
    epochs = tune()
  ) %>%
  set_mode("regression") %>%
  # open up the range of the initial values ('rang') in case that's an issue
  set_engine("nnet", rang = 0.75)
print(translate(nnet_spec))
nnet_param <-
  nnet_spec %>%
  extract_parameter_set_dials() %>%
  update(
    hidden_units = hidden_units(c(2, 100)),
    epochs = epochs(c(150, 10000))
  )
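# tune_grid(grid = 25) creates a space-filling design internally; this is
# roughly what it does, shown here with an explicit Latin hypercube:
grid_latin_hypercube(nnet_param, size = 25)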
set.seed(3828)
nnet_res <-
  nnet_spec %>%
  tune_grid(nnet_rec, chi_rs, grid = 25, param_info = nnet_param)
show_best(nnet_res, metric = "rmse")
autoplot(nnet_res)
nnet_fit <-
  workflow(
    nnet_rec,
    finalize_model(nnet_spec, select_best(nnet_res, metric = "rmse"))
  ) %>%
  fit(chi_train)
augment(nnet_fit, chi_train) %>%
  mutate(
    day = lubridate::wday(date, label = TRUE),
    day = factor(day, ordered = FALSE),
    # augment() returns only the original columns plus .pred, so recreate
    # the recipe's day_index predictor (un-normalized) for the x-axis
    day_index = as.numeric(date)
  ) %>%
  slice(plot_start:(plot_start + 35)) %>%
  ggplot(aes(day_index, ridership)) +
  geom_point(aes(col = day), cex = 2) +
  geom_line(aes(y = .pred))
# ------------------------------------------------------------------------------
# Use the simple features with the network
nnet_feat_rec <-
  recipe(ridership ~ date, data = chi_train) %>%
  step_mutate(day_index = as.numeric(date)) %>%
  step_date(date) %>%
  step_holiday(date) %>%
  update_role(date, new_role = "date id") %>%
  step_range(all_numeric_predictors()) %>%
  step_dummy(all_factor_predictors(), one_hot = TRUE)
set.seed(3828)
nnet_feat_res <-
  nnet_spec %>%
  tune_grid(nnet_feat_rec, chi_rs, grid = 25, param_info = nnet_param)
autoplot(nnet_feat_res)
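# Compare the two feature strategies head to head, using rmse as the
# ranking metric:
show_best(nnet_res, metric = "rmse")
show_best(nnet_feat_res, metric = "rmse")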
@mdancho84

Hey Max,
I played around with lm, nnet, and glmnet to give you some quick insights. I'll summarize the insights below and then give the code to reproduce them.

Insights

My approach differs a bit:

  1. I add a lot of features up front to give the models as much information as possible. Models like LM tend to overfit because of this.
  2. An advantage of machine learning models is the penalty parameter: both NNET and GLMNET have one, which helps with feature selection by penalizing poor features.

What I see is that LM overfits badly with the features I gave it, while NNET and GLMNET do well.

The future forecast looks much better with the GLMNET model than with the NNET model, because I did not tune the NNET. It's worth noting that GLMNET is very simple here: add a little penalty and it gives a good forecast with no tuning, while NNET needs tuning. (A quick coefficient check is sketched after the glmnet fit below.)


Code

library(tidymodels)
library(timetk)
library(modeltime)

chicago_tbl <- Chicago %>%
    select(date, ridership)

chicago_tbl %>%
    plot_time_series(date, ridership)

# TRAIN/TEST SPLIT ----

splits <- time_series_split(chicago_tbl, assess = "1 year", initial = "5 year")

splits %>%
    tk_time_series_cv_plan() %>%
    plot_time_series_cv_plan(date, ridership)

# MODELING ----

rec_lm <- recipe(ridership ~ date, data = training(splits)) %>%
    step_timeseries_signature(date) %>%
    step_rm(date) %>%
    step_zv(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_dummy(all_nominal_predictors(), one_hot = FALSE)
    

rec_lm %>% prep() %>% bake(training(splits)) %>% glimpse()

# * Linear Regression lm ----

wflw_lm <- workflow() %>%
    add_model(linear_reg() %>% set_engine("lm")) %>%
    add_recipe(rec_lm) 

wflw_lm_fit <- wflw_lm %>% fit(training(splits))

# * Neural Net nnet ---- 

wflw_mlp <- workflow() %>%
    add_model(mlp("regression") %>% set_engine("nnet")) %>%
    add_recipe(rec_lm)

wflw_mlp_fit <- wflw_mlp %>% fit(training(splits))

# * Elastic Net glmnet -----

wflw_glmnet <- workflow() %>%
    add_model(linear_reg(penalty = 0.01) %>% set_engine("glmnet")) %>%
    add_recipe(rec_lm) 

wflw_glmnet_fit <- wflw_glmnet %>% fit(training(splits))
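
# Optional sketch: check which coefficients glmnet shrinks toward zero at
# penalty = 0.01 (tidy() on the extracted parsnip fit returns coefficients
# at the fitted penalty)
wflw_glmnet_fit %>%
    extract_fit_parsnip() %>%
    tidy() %>%
    arrange(desc(abs(estimate)))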

# MODEL COMPARISON ----

model_tbl <- modeltime_table(
    wflw_lm_fit,
    wflw_mlp_fit, 
    wflw_glmnet_fit
)

calib_tbl <- model_tbl %>% modeltime_calibrate(testing(splits))

calib_tbl %>% modeltime_accuracy()
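
# Optionally render the accuracy metrics as a formatted table:
calib_tbl %>% modeltime_accuracy() %>% table_modeltime_accuracy()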

test_forecast_tbl <- calib_tbl %>%
    modeltime_forecast(
        new_data = testing(splits),
        actual_data = chicago_tbl
    ) 

test_forecast_tbl %>%
    filter_by_time(.start_date = "2015") %>%
    plot_modeltime_forecast()

# FUTURE FORECAST ----

refit_tbl <- calib_tbl %>%
    modeltime_refit(
        data = chicago_tbl
    )

future_tbl <- chicago_tbl %>%
    future_frame(date, .length_out = 365)
    
future_forecast_tbl <- refit_tbl %>%
    modeltime_forecast(
        new_data = future_tbl, 
        actual_data = chicago_tbl
    )

future_forecast_tbl %>%
    filter_by_time(.start_date = "2015") %>%
    plot_modeltime_forecast()
