daob · October 30, 2020 22:15
diff --git a/simple-bagging-simulation.R b/simple-bagging-simulation.R
 # A small simulation to investigate bagging
 # 2020-10-30 DLO

 set.seed(4232)

 # Root mean squared error metric
 rmse <- function(y_pred, y) {
  r <- y_pred - y
  sqrt(mean(r^2))
 }

 # Calculate a metric on a model and test set
 metric_fit <- function(fit, dat_test, metric = rmse) {
  y_pred <- predict(fit, newdata = dat_test)
  metric(y_pred, dat_test$y)
 }

 # Train an (unstable) model
 fit_mod <- function(dat) {
  lm(y ~ .^3, data = dat)
 }

 # Draw one bootstrap sample and fit the model
 boot_fit <- function(i_boot, dat) {
  idx_boot <- sample(1:nrow(dat), size = nrow(dat), replace = TRUE)
  dat_boot <- dat[idx_boot, ]
  fit_mod(dat_boot)
 }

 # Data generating process (DGP), returns a data frame
 dgp <- function(n, beta) {
  J <- length(beta)
  xpop <- matrix(rnorm(n) + rnorm(n * J), n)
  ypop <- xpop %*% beta + rnorm(n)
  
  data.frame(x = xpop, y = ypop)
 }

 # 
 sim <- function(it, n, n_boot, dat_pop, beta, progress_bar) {
  # Update the reassuring progress bar with iteration number
  setTxtProgressBar(progress_bar, it)
  
  # Draw fresh sample from the DGP (=infinite population)
  dat_sam <- dgp(n, beta = beta) 

  # Uncomment below for finite population setup
  # idx_sam <- sample(1:nrow(dat_pop), size = n, replace = FALSE)
  # dat_sam <- dat_pop[idx_sam, ]
  
  # Fit the model without anything special
  fit_sam <- fit_mod(dat_sam)
  # Calculate metric for model without anything special
  rmse_train <- metric_fit(fit = fit_sam, dat_test = dat_pop)
  
  # Draw n_boot bootstrap samples and fit, returning a list of models
  fit_boot <- lapply(1:n_boot, boot_fit, dat = dat_sam)
  # Obtain predictions of the models in the list
  y_pred_boot <- sapply(fit_boot, predict, newdata = dat_pop)
  # Summarize predictions by averaging them over bootstrap samples
  # (NOTE: using median instead of mean appears to improve performance a lot!)
  y_pred_bagged <- apply(y_pred_boot, 1, median)
  # Calculate metric for bagged predictions
  rmse_bagged <- rmse(y_pred_bagged, dat_pop$y)
  
  return(c(rmse_train = rmse_train, 
           rmse_bagged = rmse_bagged))
 }


 # Number of features
 J <- 8
 # True weights on features in linear predictor
 beta <- runif(J, min = -5, max = +5)
 # A "population" to use as test set, generated from DGP
 dat_pop <- dgp(2e3, beta) # data frame

 # Number of simulation replicates
 nsim <- 50

 # Nice-looking progress bar, updated inside sim()
 pb <- txtProgressBar(min = 1, max = nsim, style = 3)

 # Run the simulation
 system.time({ # time result
  res <- t(sapply(1:nsim, # Number of replicates
                  sim, # Call simulation function
                  n = 100,  # Sample size in each replicate
                  n_boot = 60,  # No. bagged bootstrap samples
                  dat_pop = dat_pop,  # Population used as test set
                  beta = beta, # True beta weights
                  progress_bar = pb # Reassuring progress bar
  ))
 })

 # Compare performance of bagging with regular fit:
 summary(res)

 # Bagging is better when:
 #  * Model is overspecified (e.g. .^3)
 #  * Sample size is low
 # When model is very unstable, just a few bootstraps (5) already does the trick

 # See how the two compare in different samples
 limits <- c(min(as.vector(res)), quantile(as.vector(res), 0.9))
 plot(res, pch = 19, xlim = limits, ylim = limits, 
     main = "Bagged vs. Non-bagged over samples", 
     col = "#00000099")
 abline(0, 1, col = "#66000077")
 text(mean(limits), mean(limits)*0.96, labels = "Bagging is better", 
     adj = 0.5, srt = 45, col = "#00000077")
 text(mean(limits), mean(limits)*1.04, labels = "Not bagging is better", 
     adj = 0.5, srt = 45, col = "#00000077")
	# A small simulation to investigate bagging
	# 2020-10-30 DLO

	set.seed(4232)

	# Root mean squared error metric
	rmse <- function(y_pred, y) {
	r <- y_pred - y
	sqrt(mean(r^2))
	}

	# Calculate a metric on a model and test set
	metric_fit <- function(fit, dat_test, metric = rmse) {
	y_pred <- predict(fit, newdata = dat_test)
	metric(y_pred, dat_test$y)
	}

	# Train an (unstable) model
	fit_mod <- function(dat) {
	lm(y ~ .^3, data = dat)
	}

	# Draw one bootstrap sample and fit the model
	boot_fit <- function(i_boot, dat) {
	idx_boot <- sample(1:nrow(dat), size = nrow(dat), replace = TRUE)
	dat_boot <- dat[idx_boot, ]
	fit_mod(dat_boot)
	}

	# Data generating process (DGP), returns a data frame
	dgp <- function(n, beta) {
	J <- length(beta)
	xpop <- matrix(rnorm(n) + rnorm(n * J), n)
	ypop <- xpop %*% beta + rnorm(n)

	data.frame(x = xpop, y = ypop)
	}

	#
	sim <- function(it, n, n_boot, dat_pop, beta, progress_bar) {
	# Update the reassuring progress bar with iteration number
	setTxtProgressBar(progress_bar, it)

	# Draw fresh sample from the DGP (=infinite population)
	dat_sam <- dgp(n, beta = beta)

	# Uncomment below for finite population setup
	# idx_sam <- sample(1:nrow(dat_pop), size = n, replace = FALSE)
	# dat_sam <- dat_pop[idx_sam, ]

	# Fit the model without anything special
	fit_sam <- fit_mod(dat_sam)
	# Calculate metric for model without anything special
	rmse_train <- metric_fit(fit = fit_sam, dat_test = dat_pop)

	# Draw n_boot bootstrap samples and fit, returning a list of models
	fit_boot <- lapply(1:n_boot, boot_fit, dat = dat_sam)
	# Obtain predictions of the models in the list
	y_pred_boot <- sapply(fit_boot, predict, newdata = dat_pop)
	# Summarize predictions by averaging them over bootstrap samples
	# (NOTE: using median instead of mean appears to improve performance a lot!)
	y_pred_bagged <- apply(y_pred_boot, 1, median)
	# Calculate metric for bagged predictions
	rmse_bagged <- rmse(y_pred_bagged, dat_pop$y)

	return(c(rmse_train = rmse_train,
	rmse_bagged = rmse_bagged))
	}


	# Number of features
	J <- 8
	# True weights on features in linear predictor
	beta <- runif(J, min = -5, max = +5)
	# A "population" to use as test set, generated from DGP
	dat_pop <- dgp(2e3, beta) # data frame

	# Number of simulation replicates
	nsim <- 50

	# Nice-looking progress bar, updated inside sim()
	pb <- txtProgressBar(min = 1, max = nsim, style = 3)

	# Run the simulation
	system.time({ # time result
	res <- t(sapply(1:nsim, # Number of replicates
	sim, # Call simulation function
	n = 100, # Sample size in each replicate
	n_boot = 60, # No. bagged bootstrap samples
	dat_pop = dat_pop, # Population used as test set
	beta = beta, # True beta weights
	progress_bar = pb # Reassuring progress bar
	))
	})

	# Compare performance of bagging with regular fit:
	summary(res)

	# Bagging is better when:
	# * Model is overspecified (e.g. .^3)
	# * Sample size is low
	# When model is very unstable, just a few bootstraps (5) already does the trick

	# See how the two compare in different samples
	limits <- c(min(as.vector(res)), quantile(as.vector(res), 0.9))
	plot(res, pch = 19, xlim = limits, ylim = limits,
	main = "Bagged vs. Non-bagged over samples",
	col = "#00000099")
	abline(0, 1, col = "#66000077")
	text(mean(limits), mean(limits)*0.96, labels = "Bagging is better",
	adj = 0.5, srt = 45, col = "#00000077")
	text(mean(limits), mean(limits)*1.04, labels = "Not bagging is better",
	adj = 0.5, srt = 45, col = "#00000077")