crazyhottommy · August 29, 2015 14:19
diff --git a/simulation.r b/simulation.r
 ## Overview
 # central limit theorem (CLT) states that, given certain conditions, the arithmetic mean of a sufficiently large number of iterates of independent random variables, each with a well-defined expected value and well-defined variance, will be approximately normally distributed, regardless of the underlying distribution. 

 # I am going to draw 40 numbers from exponential distribution for 1000 times, and calcuate the mean
 # of each draw (we will have 1000 means), and through this simulation to test if the 
 # distribution of the means will be normal or not.
  
 ## start simulation
 # number of simulation, sample size and lambda
 nosim<- 1000
 n<- 40
 lambda<- 0.2

 # make a matrix containing 1000 rows and 40 columns with simulated value
 mat<- matrix(rexp(nosim * n, lambda), nosim)


 # calculate the mean of the 40 numbers draw from exponential distritubtion 
 # for each row (total 1000 rows), and then take the mean
 # we got 1000 means of the 1000 simulation
 # now let's calculate the standard deviation
 mean_expo<- apply(mat, 1, mean)

 ## the sample mean and compare it to the theoretical mean of the distribution
 #sample mean
 mean(mean_expo)
 # 5.1
 # theroretical mean
 1/lambda
 # 5.017575

 # they are very close to each other.

 ## how variable the sample is (via variance) and compare it to the theoretical variance of the distribution?

 # sample variance
 var(mean_expo)
 # 0.6469216
 # thoretical variance
 (1/lambda)^2 * 1/n 
 # 0.625
 # they are very close to each other. you may get a different mean and variance for the sample, but if I set the seed. it should be reproducible 

 # Let's look at the distribution of the means by histogram

 library(ggplot2)
 g<- ggplot(data.frame(mean_expo)) + geom_histogram(aes(x=mean_expo, y=..density..), 
                                                   fill="black", color="white")
 g<- g + geom_vline(xintercept=5, color="red")
 g<- g + geom_vline(xintercept= mean(mean_expo), color="green")
 g<- g + stat_function(fun = dnorm, args= list(mean= 1/lambda, sd= 1/lambda * 1/sqrt(40)), 
                      color="red") 
 g<- g + geom_density(aes(x= mean_expo,y=..density..), color="green")
 g

 # the distribution of means should be centered on 1/lambda = 1/0.2 = 5 (the theoretical center).
 # the theoretical standard deviation should be sigma/sqrt(n).
 # the red bell curve is the density function of Normal(1/lambda, 1/lambda * 1/sqrt(40)).
 # the red vertical line is the theoretical mean (5).
 # the green vertical line is the sample mean(5.036)
 # the green bell curve is the density of the simulated sample means.
 # we can see they are very close to each other.

 ## compare with the large number of exponential distribution

 expo<- rexp(1000, lambda)

 g<- ggplot(data.frame(expo)) + geom_histogram(aes(x=expo, y=..density..), 
                                                   fill="black", color="white")
 g<- g + geom_vline(xintercept=5, color="red")
 g<- g + stat_function(fun = dnorm, args= list(mean= 1/lambda, sd= 1/lambda * 1/sqrt(40)), 
                      color="red") 
 g<- g + geom_density(aes(x= expo,y=..density..), color="green")
 g

 # clearly, we see it is not normal distributed
	## Overview
	# central limit theorem (CLT) states that, given certain conditions, the arithmetic mean of a sufficiently large number of iterates of independent random variables, each with a well-defined expected value and well-defined variance, will be approximately normally distributed, regardless of the underlying distribution.

	# I am going to draw 40 numbers from exponential distribution for 1000 times, and calcuate the mean
	# of each draw (we will have 1000 means), and through this simulation to test if the
	# distribution of the means will be normal or not.

	## start simulation
	# number of simulation, sample size and lambda
	nosim<- 1000
	n<- 40
	lambda<- 0.2

	# make a matrix containing 1000 rows and 40 columns with simulated value
	mat<- matrix(rexp(nosim * n, lambda), nosim)


	# calculate the mean of the 40 numbers draw from exponential distritubtion
	# for each row (total 1000 rows), and then take the mean
	# we got 1000 means of the 1000 simulation
	# now let's calculate the standard deviation
	mean_expo<- apply(mat, 1, mean)

	## the sample mean and compare it to the theoretical mean of the distribution
	#sample mean
	mean(mean_expo)
	# 5.1
	# theroretical mean
	1/lambda
	# 5.017575

	# they are very close to each other.

	## how variable the sample is (via variance) and compare it to the theoretical variance of the distribution?

	# sample variance
	var(mean_expo)
	# 0.6469216
	# thoretical variance
	(1/lambda)^2 * 1/n
	# 0.625
	# they are very close to each other. you may get a different mean and variance for the sample, but if I set the seed. it should be reproducible

	# Let's look at the distribution of the means by histogram

	library(ggplot2)
	g<- ggplot(data.frame(mean_expo)) + geom_histogram(aes(x=mean_expo, y=..density..),
	fill="black", color="white")
	g<- g + geom_vline(xintercept=5, color="red")
	g<- g + geom_vline(xintercept= mean(mean_expo), color="green")
	g<- g + stat_function(fun = dnorm, args= list(mean= 1/lambda, sd= 1/lambda * 1/sqrt(40)),
	color="red")
	g<- g + geom_density(aes(x= mean_expo,y=..density..), color="green")
	g

	# the distribution of means should be centered on 1/lambda = 1/0.2 = 5 (the theoretical center).
	# the theoretical standard deviation should be sigma/sqrt(n).
	# the red bell curve is the density function of Normal(1/lambda, 1/lambda * 1/sqrt(40)).
	# the red vertical line is the theoretical mean (5).
	# the green vertical line is the sample mean(5.036)
	# the green bell curve is the density of the simulated sample means.
	# we can see they are very close to each other.

	## compare with the large number of exponential distribution

	expo<- rexp(1000, lambda)

	g<- ggplot(data.frame(expo)) + geom_histogram(aes(x=expo, y=..density..),
	fill="black", color="white")
	g<- g + geom_vline(xintercept=5, color="red")
	g<- g + stat_function(fun = dnorm, args= list(mean= 1/lambda, sd= 1/lambda * 1/sqrt(40)),
	color="red")
	g<- g + geom_density(aes(x= expo,y=..density..), color="green")
	g

	# clearly, we see it is not normal distributed