fgregg · February 9, 2015 22:14
diff --git a/fdistribution.R b/fdistribution.R
 pums <- read.csv("small_pums.csv")

 # We want to test the hypothesis that standard deviation of earnings
 # are the same for men and women in Illinois
 #
 # For this hypothesis, we will use the F statistic.

 male.earnings <- pums$WAGP[pums$SEX=="male"]
 n.male.earnings <- length(na.omit(male.earnings))

 female.earnings <- pums$WAGP[pums$SEX=="female"]
 n.female.earnings <- length(na.omit(pums$WAGP[pums$SEX=="female"]))

 observed.F = sd(male.earnings, na.rm=TRUE)^2/sd(female.earnings, na.rm=TRUE)^2

 # First we want to explore the sampling distribution of this statistic
 # WHEN the nully hypothesis is true.
 #
 # We'll draw samples of the appropriate size from two normal
 # distributions with the same standard deviation. The means can be
 # different. We will then calculate the F statistic for these
 # samples. We'll repeat this procedure many times and we'll get an
 # empirical distribution of the F statistic

 F.samples <- c() # This creates an empty list that we will use to
                 # store our calculated F statistics

 for (i in 1:2000) { # 1:2000 makes a list of the values 1 through
                    # 1000, what the for loop does is step through
                    # each element of this list starting at the
                    # beginning. At each step i takes on the value of
                    # the current position of this list
    sample.a <- rnorm(n.male.earnings, mean=0, sd=5)
    sd.a <- sd(sample.a) # Here's the sd of our first sample

    sample.b <- rnorm(n.female.earnings, mean=20, sd=5)
    # notice that this sample is from a normal distribtion with a
    # different mean. Our hypothesis is just about the standard
    # deviation, so the values of the means should not matter.

    sd.b <- sd(sample.b)

    # Now we are ready to calculate the F
    F = (sd.a/sd.b)^2

    # and store it in the F.samples list we'll store in the ith position
    F.samples[i] = F

 }
    

 # Now F should be a list of 1000 F statistics. Let's plot this
 # empirical sampling distribution
 hist(F.samples, 
     breaks = 40, # larger number of breaks means more bins
     freq=FALSE)  # we want to plot the histogram on the density
                  # scale, not the frequency scale

 # From theory, we know that the sampling distribution should follow a
 # F distribution. Let's see if does.
 #
 # We are going to plot the densities of an F distribution for various
 # values of an F statistic. First, let's generate a range of F values
 # from 0 to 2 with an 0.01 steps.

 f <- seq(0, 2, 0.01)

 # For each of these values, we want to calculate the density of the F
 # distribution with the appropriate parameters. For the F
 # distribution, these parameters will be the sample size of the two
 # samples minus one

 densities <- df(f, # the values that we want to evaluate the density at
                df1=(n.male.earnings-1),   # first parameter
                df2=(n.female.earnings-1) # second parameter
                )

 # Now let's overplot this theoretical density over the empirical
 # sampling distribution

 lines(densities ~ f)

 # They line up very well, right?!  So now, we can use the theoretical
 # F distribution and our observed F statistic to calculate the
 # probability of seeing our observed F, or a more extreme value, if
 # the null hypothesis was true

 upper.p.value = pf(observed.F,
                   df1=(n.male.earnings-1),  
                   df2=(n.female.earnings-1),
                   lower.tail=FALSE) # we want to calculate the
                                     # probability of seeing a value
                                     # as large or larger than our
                                     # oubserved.F

 p.value = upper.p.value * 2 # we need a two tail test, so we multiply by 2

 p.value

 # Now compare all of what we just did to
 var.test(male.earnings, female.earnings)
	pums <- read.csv("small_pums.csv")

	# We want to test the hypothesis that standard deviation of earnings
	# are the same for men and women in Illinois
	#
	# For this hypothesis, we will use the F statistic.

	male.earnings <- pums$WAGP[pums$SEX=="male"]
	n.male.earnings <- length(na.omit(male.earnings))

	female.earnings <- pums$WAGP[pums$SEX=="female"]
	n.female.earnings <- length(na.omit(pums$WAGP[pums$SEX=="female"]))

	observed.F = sd(male.earnings, na.rm=TRUE)^2/sd(female.earnings, na.rm=TRUE)^2

	# First we want to explore the sampling distribution of this statistic
	# WHEN the nully hypothesis is true.
	#
	# We'll draw samples of the appropriate size from two normal
	# distributions with the same standard deviation. The means can be
	# different. We will then calculate the F statistic for these
	# samples. We'll repeat this procedure many times and we'll get an
	# empirical distribution of the F statistic

	F.samples <- c() # This creates an empty list that we will use to
	# store our calculated F statistics

	for (i in 1:2000) { # 1:2000 makes a list of the values 1 through
	# 1000, what the for loop does is step through
	# each element of this list starting at the
	# beginning. At each step i takes on the value of
	# the current position of this list
	sample.a <- rnorm(n.male.earnings, mean=0, sd=5)
	sd.a <- sd(sample.a) # Here's the sd of our first sample

	sample.b <- rnorm(n.female.earnings, mean=20, sd=5)
	# notice that this sample is from a normal distribtion with a
	# different mean. Our hypothesis is just about the standard
	# deviation, so the values of the means should not matter.

	sd.b <- sd(sample.b)

	# Now we are ready to calculate the F
	F = (sd.a/sd.b)^2

	# and store it in the F.samples list we'll store in the ith position
	F.samples[i] = F

	}


	# Now F should be a list of 1000 F statistics. Let's plot this
	# empirical sampling distribution
	hist(F.samples,
	breaks = 40, # larger number of breaks means more bins
	freq=FALSE) # we want to plot the histogram on the density
	# scale, not the frequency scale

	# From theory, we know that the sampling distribution should follow a
	# F distribution. Let's see if does.
	#
	# We are going to plot the densities of an F distribution for various
	# values of an F statistic. First, let's generate a range of F values
	# from 0 to 2 with an 0.01 steps.

	f <- seq(0, 2, 0.01)

	# For each of these values, we want to calculate the density of the F
	# distribution with the appropriate parameters. For the F
	# distribution, these parameters will be the sample size of the two
	# samples minus one

	densities <- df(f, # the values that we want to evaluate the density at
	df1=(n.male.earnings-1), # first parameter
	df2=(n.female.earnings-1) # second parameter
	)

	# Now let's overplot this theoretical density over the empirical
	# sampling distribution

	lines(densities ~ f)

	# They line up very well, right?! So now, we can use the theoretical
	# F distribution and our observed F statistic to calculate the
	# probability of seeing our observed F, or a more extreme value, if
	# the null hypothesis was true

	upper.p.value = pf(observed.F,
	df1=(n.male.earnings-1),
	df2=(n.female.earnings-1),
	lower.tail=FALSE) # we want to calculate the
	# probability of seeing a value
	# as large or larger than our
	# oubserved.F

	p.value = upper.p.value * 2 # we need a two tail test, so we multiply by 2

	p.value

	# Now compare all of what we just did to
	var.test(male.earnings, female.earnings)
No results found