hillarysanders · September 13, 2016 20:43
diff --git a/clt_wins.R b/clt_wins.R
 ################################################################################
 ################################################################################
 # author: hillz
 # subject: CLT simulation: bootstrap vs CLT on right tailed distributions with
 #          many samples - bootstrap converges to CLT answer. No reason to use 
 #          bootstrap.
 ################################################################################
 ################################################################################

 ################################################################################
 # utils functions:
 # Estimate the standard error of the sample mean of `x` by the bootstrap.
 #
 # Args:
 #   x: vector of observations to resample.
 #   n.simulations: number of bootstrap resamples to draw (default 1000).
 #
 # Returns:
 #   The standard deviation of the resampled means. This is NA when fewer than
 #   two resamples are drawn, because sd() needs at least two values.
 get.bootstrap.se = function(x, n.simulations=1000){
   n = length(x)
   # Resample by index rather than with sample(x, ...): sample() treats a
   # length-one numeric x as the range 1:x, which would resample the wrong
   # population. seq_len() yields an empty sequence when n.simulations is 0
   # (1:0 would run two simulations), and vapply() always returns a numeric
   # vector.
   sample.means = vapply(seq_len(n.simulations),
                         FUN = function(i) mean(x[sample.int(n, size = n, replace = TRUE)]),
                         FUN.VALUE = numeric(1))
   sd(sample.means)
 }

 # Plot repeated bootstrap SE estimates against the CLT-based SE estimate.
 #
 # Args:
 #   many.boot.SEs: numeric vector of bootstrap standard-error estimates.
 #   clt.se: the CLT-based standard-error estimate to compare against.
 #
 # Draws on the current graphics device: a histogram of `many.boot.SEs`, a
 # thick dark red vertical line at `clt.se`, a thinner light blue vertical line
 # at the mean of `many.boot.SEs`, and a legend describing all three.
 # Returns whatever legend() returns.
 plot.boot.vs.clt.estimates = function(many.boot.SEs, clt.se){
   hist(many.boot.SEs, col='darkblue', main='Standard Error Estimates', breaks=30)
   abline(v=clt.se, lwd=8, col='darkred')
   abline(v=mean(many.boot.SEs), lwd=4, col='lightblue')
   # Legend entries, in order: histogram bars (filled square symbol, no line),
   # then the two vertical lines (line only, no symbol).
   # bty expects a character code: 'o' draws the legend box, 'n' omits it.
   legend('topright', box.lwd=0, bty='o', pch=c(15, NA, NA), pt.cex=c(1.5, NA, NA),
          lwd=c(NA, 8, 4), col=c('darkblue', 'darkred', 'lightblue'),
          legend=c('Bootstrap SE Estimate',
                   'CLT based SE Estimate',
                   'Average Bootstrap SE Estimate'))
 }
 ################################################################################

 # test parameters:
 n.obs = 100000
 raise.power = 10 # for skewness
 # create the fake skewed data (exponential draws raised to a power):
 x = rexp(n = n.obs, rate = 1)^raise.power

 # estimate SE of sample mean by bootstrap:
 bootstrap.se = get.bootstrap.se(x)
 # estimate SE of sample mean by Central Limit Theorem:
 clt.se = sd(x)/sqrt(length(x))

 # report both estimates; the trailing newline keeps later console output on
 # its own line:
 cat(paste0('CLT based SE: ', clt.se))
 cat(paste0('\nBootstrap based SE: ', bootstrap.se, '\n'))

 # get many bootstrap SE estimates to make sure it is always very close to CLT estimate
 # (vapply guarantees a plain numeric vector of length 100):
 many.boot.SEs = vapply(seq_len(100),
                        FUN = function(i) get.bootstrap.se(x, n.simulations=250),
                        FUN.VALUE = numeric(1))
 # plot the results in two stacked panels, remembering the previous layout:
 old.par = par(mfrow=c(2,1))
 # the data is very right tailed:
 hist(x, breaks=1000, main='Very Right Tailed Distribution')
 plot.boot.vs.clt.estimates(many.boot.SEs, clt.se)
 # restore the graphics layout that was in effect before this script ran:
 par(old.par)

 # --> bootstrap just converges to the CLT answer. There's no reason to use bootstraps when
 # estimating the SD of your sample mean, if your data are iid, even if the data are massively
 # skewed.
 # This doesn't prove that CLT or bootstrap estimates are perfect at this combination of
 # skewness and sample size, just that there's no reason to bootstrap it over using CLT.
	################################################################################
	################################################################################
	# author: hillz
	# subject: CLT simulation: bootstrap vs CLT on right tailed distributions with
	# many samples - bootstrap converges to CLT answer. No reason to use
	# bootstrap.
	################################################################################
	################################################################################

	################################################################################
	# utils functions:
	# Estimate the standard error of the sample mean of `x` by the bootstrap.
	#
	# Args:
	#   x: vector of observations to resample.
	#   n.simulations: number of bootstrap resamples to draw (default 1000).
	#
	# Returns:
	#   The standard deviation of the resampled means. This is NA when fewer than
	#   two resamples are drawn, because sd() needs at least two values.
	get.bootstrap.se = function(x, n.simulations=1000){
	  n = length(x)
	  # Resample by index rather than with sample(x, ...): sample() treats a
	  # length-one numeric x as the range 1:x, which would resample the wrong
	  # population. seq_len() yields an empty sequence when n.simulations is 0
	  # (1:0 would run two simulations), and vapply() always returns a numeric
	  # vector.
	  sample.means = vapply(seq_len(n.simulations),
	                        FUN = function(i) mean(x[sample.int(n, size = n, replace = TRUE)]),
	                        FUN.VALUE = numeric(1))
	  sd(sample.means)
	}

	# Plot repeated bootstrap SE estimates against the CLT-based SE estimate.
	#
	# Args:
	#   many.boot.SEs: numeric vector of bootstrap standard-error estimates.
	#   clt.se: the CLT-based standard-error estimate to compare against.
	#
	# Draws on the current graphics device: a histogram of `many.boot.SEs`, a
	# thick dark red vertical line at `clt.se`, a thinner light blue vertical line
	# at the mean of `many.boot.SEs`, and a legend describing all three.
	# Returns whatever legend() returns.
	plot.boot.vs.clt.estimates = function(many.boot.SEs, clt.se){
	  hist(many.boot.SEs, col='darkblue', main='Standard Error Estimates', breaks=30)
	  abline(v=clt.se, lwd=8, col='darkred')
	  abline(v=mean(many.boot.SEs), lwd=4, col='lightblue')
	  # Legend entries, in order: histogram bars (filled square symbol, no line),
	  # then the two vertical lines (line only, no symbol).
	  # bty expects a character code: 'o' draws the legend box, 'n' omits it.
	  legend('topright', box.lwd=0, bty='o', pch=c(15, NA, NA), pt.cex=c(1.5, NA, NA),
	         lwd=c(NA, 8, 4), col=c('darkblue', 'darkred', 'lightblue'),
	         legend=c('Bootstrap SE Estimate',
	                  'CLT based SE Estimate',
	                  'Average Bootstrap SE Estimate'))
	}
	################################################################################

	# test parameters:
	n.obs = 100000
	raise.power = 10 # for skewness
	# create the fake skewed data (exponential draws raised to a power):
	x = rexp(n = n.obs, rate = 1)^raise.power

	# estimate SE of sample mean by bootstrap:
	bootstrap.se = get.bootstrap.se(x)
	# estimate SE of sample mean by Central Limit Theorem:
	clt.se = sd(x)/sqrt(length(x))

	# report both estimates; the trailing newline keeps later console output on
	# its own line:
	cat(paste0('CLT based SE: ', clt.se))
	cat(paste0('\nBootstrap based SE: ', bootstrap.se, '\n'))

	# get many bootstrap SE estimates to make sure it is always very close to CLT estimate
	# (vapply guarantees a plain numeric vector of length 100):
	many.boot.SEs = vapply(seq_len(100),
	                       FUN = function(i) get.bootstrap.se(x, n.simulations=250),
	                       FUN.VALUE = numeric(1))
	# plot the results in two stacked panels, remembering the previous layout:
	old.par = par(mfrow=c(2,1))
	# the data is very right tailed:
	hist(x, breaks=1000, main='Very Right Tailed Distribution')
	plot.boot.vs.clt.estimates(many.boot.SEs, clt.se)
	# restore the graphics layout that was in effect before this script ran:
	par(old.par)

	# --> bootstrap just converges to the CLT answer. There's no reason to use bootstraps when
	# estimating the SD of your sample mean, if your data are iid, even if the data are massively
	# skewed.
	# This doesn't prove that CLT or bootstrap estimates are perfect at this combination of
	# skewness and sample size, just that there's no reason to bootstrap it over using CLT.