jrosell · October 23, 2024 10:35
diff --git a/many-models.R b/many-models.R
 # Original code: https://tim-tiefenbach.de/post/2023-dplyr-many-models/

 # Loading the packages and getting the data ------------------------------------

 if (!rlang::is_installed("pak")) install.packages("pak")
 rlang::check_installed(c(
  "pak",
  "dplyr",
  "tidyr",
  "broom",
  "modelsummary",
  "purrr",
  "dplyover",
  "rlang"
 ))
 library(dplyr)
 library(tidyr)
 library(broom)
 library(rlang)
 library(modelsummary)
 library(purrr)
 library(dplyover)

 csat_named <- csatraw |>
  rename(any_of(set_names(names(csatraw), names(csat)))) |>
  select(
    cust_id, type, product, csat,
    ends_with("rating")
  )


 # Analyzing the subgrups -------------------------------------------------------

 csat_prod_nested <-
  csat_named |>
  nest_by(product) |> 
  print()


 base_formula <- csat ~ postal_rating + phone_rating + email_rating +
  website_rating + shop_rating

 csat_prod_nested_res <- csat_prod_nested |>
  mutate(
    mod     = list(lm(base_formula, data = data)),
    modstat = list(broom::glance(mod)),
    res =     list(broom::tidy(mod))
  ) |>
  print()

 csat_prod_nested_res |>
  select(product, modstat) |>
  unnest(modstat) |>
  select(r.squared, p.value, nobs) |>
  print()

 csat_prod_nested_res |>
  select(product, res) |>
  unnest(res) |>
  filter(term != "(Intercept)") |>
  print()


 # Analyzing the subgrups and all the data --------------------------------------

 csat_all <- csat_named |>
  mutate(product = "All") |>
  bind_rows(csat_named) 

 csat_all |> count(product)

 csat_all_res <- csat_all |>
  nest_by(product) |>
  mutate(
    mod     = list(lm(base_formula, data = data)),
    res     = list(broom::tidy(mod)),
    modstat = list(broom::glance(mod))
  ) |> 
  print()

 csat_all_res |>
  select(product, modstat) |>
  unnest(modstat) |>
  select(r.squared, p.value, nobs) |>
  print()

 csat_all_res |>
  select(product, res) |>
  unnest(res) |>
  filter(term != "(Intercept)") |>
  print()



 # Create additional subgroups that meet specific filter criteria ---------------

 filter_ls <- list(
  All = TRUE,
  no_reactivate = expr(type != "reactivate")
 )

 csat_all_grps <- csat_all |>
  nest_by(product) |>
  expand_grid(filter_ls) |>
  mutate(
    type = names(filter_ls),
    .after = product
  ) |>
  rowwise() |>
  mutate(data = list(
    filter(data, eval(filter_ls))
    ),
    .keep = "unused"
  ) |>
  print()
  
 csat_all_grps_grid <- csat_all_grps |>
  mutate(
    mod = list(lm(base_formula, data = data)),
    res = list(broom::tidy(mod)),
    modstat = list(broom::glance(mod))
  ) |>
  print()

 csat_all_grps_grid |>
  select(product, type, modstat) |>
  unnest(modstat) |>
  select(-c(sigma, statistic, df:df.residual))  |>
  print()

 csat_all_grps_grid |>
  select(product, type, res) |>
  unnest(res) |>
  filter(term == "website_rating")

 csat_all_grps_grid$modstat[4]


 # Dynamically name list elements -----------------------------------------------

 csat_all_grps_grid <- csat_all_grps |>
  rowwise() |>
  mutate(
    mod     = list2("{product}_{type}" := lm(base_formula, data = data)),
    res     = list2("{product}_{type}" := broom::tidy(mod)),
    modstat = list2("{product}_{type}" := broom::glance(mod))
  ) |> 
  print()

 csat_all_grps_grid$modstat

 # Data-less grids: Multiple filters --------------------------------------------


 filter_ls <- list(
  All = TRUE,
  no_reactivate = expr(type != "reactivate")
 )

 product <- c(
  "All", unique(csat_named$product)
 )

 all_grps_grid <- expand_grid(product, filter_ls) |>
  mutate(type = names(filter_ls),
         .after = product)

 all_grps_grid


 all_grps_grid_mod <- all_grps_grid |>
  rowwise() |>
  mutate(mod = list(
    lm(
      base_formula,
      data = filter(
        csat_named,
        .env$product == "All" | .env$product == product,
        eval(filter_ls)
      )
    )
    )
  ) |>
  select(!filter_ls) |> 
  print()


 all_grps_grid_res <- all_grps_grid_mod |>
  mutate(
    res     = list(broom::tidy(mod)),
    modstat = list(broom::glance(mod))
  ) |> 
  print()

 all_grps_grid_res |>
  select(product, type, modstat) |>
  unnest(modstat) |>
  select(-c(sigma, statistic, df:df.residual))|> 
  print()


 # Create a separate model for each independent variable ------------------------

 indep_vars <- c("postal_rating",
                "phone_rating",
                "email_rating",
                "website_rating",
                "shop_rating")

 all_grps_grid_vars <- all_grps_grid |>
   expand_grid(indep_vars) |> 
   print()

 all_grps_grid_vars_mod <- all_grps_grid_vars |>
  rowwise() |>
  mutate(mod = list(
    lm(reformulate(indep_vars, "csat"), # <- this part is new
       data = filter(csat_named,
                     .env$product == "All" | .env$product == product,
                     eval(filter_ls)
       )
    )
  )
  ) |> 
  select(!filter_ls) |> 
  print()


 # Update a baseline model with additional variables ----------------------------

 min_formula <- csat ~ postal_rating + phone_rating + shop_rating

 update_vars <- list(
  base = NULL,
  email = "email_rating",
  website = "website_rating",
  both = c("email_rating", "website_rating"),
  both_plus_interaction = "email_rating*website_rating"
 )

 all_grid_upd_vars <- all_grps_grid |>
  expand_grid(update_vars) |>
  mutate(
    model_spec = names(update_vars),
    .after = type
  ) |> 
  print()


 all_grid_upd_vars_form <- all_grid_upd_vars |>
  rowwise() |>
  mutate(
    form = list(
      update(
        min_formula, # old formula
        reformulate(c(".", update_vars)) # changes to formula
        )
      ),
    mod = list2( "{product}_{type}_{model_spec}" :=
      lm(
        form,
        data = filter(
          csat_named,
          .env$product == "All" | .env$product == product,
          eval(filter_ls)
        )
      )
    )
  ) |> 
  print()

 head(all_grid_upd_vars_form$mod, 2)

 # Save the results -------------------------------------------------------------

 out <- modelsummary(
  models = all_grid_upd_vars_form$mod,
  output = "model_results.xlsx",
  gof_omit = "AIC|BIC|Log.Lik|RMSE",
  coef_omit = "(Intercept)",
  stars = TRUE,
  statistic = NULL
 )

 out <- modelsummary(models = all_grid_upd_vars_form$mod,
  output = "data.frame",
  gof_omit = "AIC|BIC|Log.Lik|RMSE",
  coef_omit = "(Intercept)",
  statistic = "stars") |>
  mutate(statistic = ifelse(statistic == "", "estimate", statistic)) |>
  select(-part) |>
  pivot_wider(
    names_from = statistic,
    values_from = -c(term, statistic),
    values_fn = as.numeric
  )

 openxlsx::write.xlsx(out, "model_results2.xlsx")


 # Compare two different versions of our dependent variable ---------------------


 csat_named_top <- csat_named |>
  mutate(csat_top = ifelse(csat == 5, 1, 0))

 dep_vars <- c("csat", "csat_top")

 all_grps_grid_final <- all_grid_upd_vars |>
  expand_grid(dep_vars) |> 
  print()

 all_grps_grid_final_res <- all_grps_grid_final |>
  rowwise() |>
  mutate(
    form = list2(
      "{product}_{type}_{model_spec}_{dep_vars}" :=  
        update(min_formula, reformulate(c(".", update_vars), dep_vars))
    ),
    mod = list(
      lm(
        form,     
      data = filter(
        csat_named_top,
        .env$product == "All" | .env$product == product,
        eval(filter_ls)
      )
      )
    ),
    res = list(broom::tidy(mod)),
    modstat = list(broom::glance(mod))
  ) |>
  select(product:model_spec, dep_vars, mod:modstat) |> 
  print()


 # Significant at the 10% level by descending adjusted r-squared ----------------

 all_grps_grid_final_res |>
  unnest(modstat) |>
  select(-c(sigma, statistic, df:df.residual)) |>
  filter(p.value < 0.1) |>
  arrange(desc(adj.r.squared))
	# Original code: https://tim-tiefenbach.de/post/2023-dplyr-many-models/

	# Loading the packages and getting the data ------------------------------------

	if (!rlang::is_installed("pak")) install.packages("pak")
	rlang::check_installed(c(
	"pak",
	"dplyr",
	"tidyr",
	"broom",
	"modelsummary",
	"purrr",
	"dplyover",
	"rlang"
	))
	library(dplyr)
	library(tidyr)
	library(broom)
	library(rlang)
	library(modelsummary)
	library(purrr)
	library(dplyover)

	csat_named <- csatraw \|>
	rename(any_of(set_names(names(csatraw), names(csat)))) \|>
	select(
	cust_id, type, product, csat,
	ends_with("rating")
	)


	# Analyzing the subgrups -------------------------------------------------------

	csat_prod_nested <-
	csat_named \|>
	nest_by(product) \|>
	print()


	base_formula <- csat ~ postal_rating + phone_rating + email_rating +
	website_rating + shop_rating

	csat_prod_nested_res <- csat_prod_nested \|>
	mutate(
	mod = list(lm(base_formula, data = data)),
	modstat = list(broom::glance(mod)),
	res = list(broom::tidy(mod))
	) \|>
	print()

	csat_prod_nested_res \|>
	select(product, modstat) \|>
	unnest(modstat) \|>
	select(r.squared, p.value, nobs) \|>
	print()

	csat_prod_nested_res \|>
	select(product, res) \|>
	unnest(res) \|>
	filter(term != "(Intercept)") \|>
	print()


	# Analyzing the subgrups and all the data --------------------------------------

	csat_all <- csat_named \|>
	mutate(product = "All") \|>
	bind_rows(csat_named)

	csat_all \|> count(product)

	csat_all_res <- csat_all \|>
	nest_by(product) \|>
	mutate(
	mod = list(lm(base_formula, data = data)),
	res = list(broom::tidy(mod)),
	modstat = list(broom::glance(mod))
	) \|>
	print()

	csat_all_res \|>
	select(product, modstat) \|>
	unnest(modstat) \|>
	select(r.squared, p.value, nobs) \|>
	print()

	csat_all_res \|>
	select(product, res) \|>
	unnest(res) \|>
	filter(term != "(Intercept)") \|>
	print()



	# Create additional subgroups that meet specific filter criteria ---------------

	filter_ls <- list(
	All = TRUE,
	no_reactivate = expr(type != "reactivate")
	)

	csat_all_grps <- csat_all \|>
	nest_by(product) \|>
	expand_grid(filter_ls) \|>
	mutate(
	type = names(filter_ls),
	.after = product
	) \|>
	rowwise() \|>
	mutate(data = list(
	filter(data, eval(filter_ls))
	),
	.keep = "unused"
	) \|>
	print()

	csat_all_grps_grid <- csat_all_grps \|>
	mutate(
	mod = list(lm(base_formula, data = data)),
	res = list(broom::tidy(mod)),
	modstat = list(broom::glance(mod))
	) \|>
	print()

	csat_all_grps_grid \|>
	select(product, type, modstat) \|>
	unnest(modstat) \|>
	select(-c(sigma, statistic, df:df.residual)) \|>
	print()

	csat_all_grps_grid \|>
	select(product, type, res) \|>
	unnest(res) \|>
	filter(term == "website_rating")

	csat_all_grps_grid$modstat[4]


	# Dynamically name list elements -----------------------------------------------

	csat_all_grps_grid <- csat_all_grps \|>
	rowwise() \|>
	mutate(
	mod = list2("{product}_{type}" := lm(base_formula, data = data)),
	res = list2("{product}_{type}" := broom::tidy(mod)),
	modstat = list2("{product}_{type}" := broom::glance(mod))
	) \|>
	print()

	csat_all_grps_grid$modstat

	# Data-less grids: Multiple filters --------------------------------------------


	filter_ls <- list(
	All = TRUE,
	no_reactivate = expr(type != "reactivate")
	)

	product <- c(
	"All", unique(csat_named$product)
	)

	all_grps_grid <- expand_grid(product, filter_ls) \|>
	mutate(type = names(filter_ls),
	.after = product)

	all_grps_grid


	all_grps_grid_mod <- all_grps_grid \|>
	rowwise() \|>
	mutate(mod = list(
	lm(
	base_formula,
	data = filter(
	csat_named,
	.env$product == "All" \| .env$product == product,
	eval(filter_ls)
	)
	)
	)
	) \|>
	select(!filter_ls) \|>
	print()


	all_grps_grid_res <- all_grps_grid_mod \|>
	mutate(
	res = list(broom::tidy(mod)),
	modstat = list(broom::glance(mod))
	) \|>
	print()

	all_grps_grid_res \|>
	select(product, type, modstat) \|>
	unnest(modstat) \|>
	select(-c(sigma, statistic, df:df.residual))\|>
	print()


	# Create a separate model for each independent variable ------------------------

	indep_vars <- c("postal_rating",
	"phone_rating",
	"email_rating",
	"website_rating",
	"shop_rating")

	all_grps_grid_vars <- all_grps_grid \|>
	expand_grid(indep_vars) \|>
	print()

	all_grps_grid_vars_mod <- all_grps_grid_vars \|>
	rowwise() \|>
	mutate(mod = list(
	lm(reformulate(indep_vars, "csat"), # <- this part is new
	data = filter(csat_named,
	.env$product == "All" \| .env$product == product,
	eval(filter_ls)
	)
	)
	)
	) \|>
	select(!filter_ls) \|>
	print()


	# Update a baseline model with additional variables ----------------------------

	min_formula <- csat ~ postal_rating + phone_rating + shop_rating

	update_vars <- list(
	base = NULL,
	email = "email_rating",
	website = "website_rating",
	both = c("email_rating", "website_rating"),
	both_plus_interaction = "email_rating*website_rating"
	)

	all_grid_upd_vars <- all_grps_grid \|>
	expand_grid(update_vars) \|>
	mutate(
	model_spec = names(update_vars),
	.after = type
	) \|>
	print()


	all_grid_upd_vars_form <- all_grid_upd_vars \|>
	rowwise() \|>
	mutate(
	form = list(
	update(
	min_formula, # old formula
	reformulate(c(".", update_vars)) # changes to formula
	)
	),
	mod = list2( "{product}_{type}_{model_spec}" :=
	lm(
	form,
	data = filter(
	csat_named,
	.env$product == "All" \| .env$product == product,
	eval(filter_ls)
	)
	)
	)
	) \|>
	print()

	head(all_grid_upd_vars_form$mod, 2)

	# Save the results -------------------------------------------------------------

	out <- modelsummary(
	models = all_grid_upd_vars_form$mod,
	output = "model_results.xlsx",
	gof_omit = "AIC\|BIC\|Log.Lik\|RMSE",
	coef_omit = "(Intercept)",
	stars = TRUE,
	statistic = NULL
	)

	out <- modelsummary(models = all_grid_upd_vars_form$mod,
	output = "data.frame",
	gof_omit = "AIC\|BIC\|Log.Lik\|RMSE",
	coef_omit = "(Intercept)",
	statistic = "stars") \|>
	mutate(statistic = ifelse(statistic == "", "estimate", statistic)) \|>
	select(-part) \|>
	pivot_wider(
	names_from = statistic,
	values_from = -c(term, statistic),
	values_fn = as.numeric
	)

	openxlsx::write.xlsx(out, "model_results2.xlsx")


	# Compare two different versions of our dependent variable ---------------------


	csat_named_top <- csat_named \|>
	mutate(csat_top = ifelse(csat == 5, 1, 0))

	dep_vars <- c("csat", "csat_top")

	all_grps_grid_final <- all_grid_upd_vars \|>
	expand_grid(dep_vars) \|>
	print()

	all_grps_grid_final_res <- all_grps_grid_final \|>
	rowwise() \|>
	mutate(
	form = list2(
	"{product}_{type}_{model_spec}_{dep_vars}" :=
	update(min_formula, reformulate(c(".", update_vars), dep_vars))
	),
	mod = list(
	lm(
	form,
	data = filter(
	csat_named_top,
	.env$product == "All" \| .env$product == product,
	eval(filter_ls)
	)
	)
	),
	res = list(broom::tidy(mod)),
	modstat = list(broom::glance(mod))
	) \|>
	select(product:model_spec, dep_vars, mod:modstat) \|>
	print()


	# Significant at the 10% level by descending adjusted r-squared ----------------

	all_grps_grid_final_res \|>
	unnest(modstat) \|>
	select(-c(sigma, statistic, df:df.residual)) \|>
	filter(p.value < 0.1) \|>
	arrange(desc(adj.r.squared))