Created
February 9, 2015 22:14
-
-
Save fgregg/4cfbcff8c0a8d6bb985f to your computer and use it in GitHub Desktop.
Sampling distribution of F statistic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pums <- read.csv("small_pums.csv") | |
| # We want to test the hypothesis that standard deviation of earnings | |
| # are the same for men and women in Illinois | |
| # | |
| # For this hypothesis, we will use the F statistic. | |
| male.earnings <- pums$WAGP[pums$SEX=="male"] | |
| n.male.earnings <- length(na.omit(male.earnings)) | |
| female.earnings <- pums$WAGP[pums$SEX=="female"] | |
| n.female.earnings <- length(na.omit(pums$WAGP[pums$SEX=="female"])) | |
| observed.F = sd(male.earnings, na.rm=TRUE)^2/sd(female.earnings, na.rm=TRUE)^2 | |
| # First we want to explore the sampling distribution of this statistic | |
| # WHEN the nully hypothesis is true. | |
| # | |
| # We'll draw samples of the appropriate size from two normal | |
| # distributions with the same standard deviation. The means can be | |
| # different. We will then calculate the F statistic for these | |
| # samples. We'll repeat this procedure many times and we'll get an | |
| # empirical distribution of the F statistic | |
| F.samples <- c() # This creates an empty list that we will use to | |
| # store our calculated F statistics | |
| for (i in 1:2000) { # 1:2000 makes a list of the values 1 through | |
| # 1000, what the for loop does is step through | |
| # each element of this list starting at the | |
| # beginning. At each step i takes on the value of | |
| # the current position of this list | |
| sample.a <- rnorm(n.male.earnings, mean=0, sd=5) | |
| sd.a <- sd(sample.a) # Here's the sd of our first sample | |
| sample.b <- rnorm(n.female.earnings, mean=20, sd=5) | |
| # notice that this sample is from a normal distribtion with a | |
| # different mean. Our hypothesis is just about the standard | |
| # deviation, so the values of the means should not matter. | |
| sd.b <- sd(sample.b) | |
| # Now we are ready to calculate the F | |
| F = (sd.a/sd.b)^2 | |
| # and store it in the F.samples list we'll store in the ith position | |
| F.samples[i] = F | |
| } | |
| # Now F should be a list of 1000 F statistics. Let's plot this | |
| # empirical sampling distribution | |
| hist(F.samples, | |
| breaks = 40, # larger number of breaks means more bins | |
| freq=FALSE) # we want to plot the histogram on the density | |
| # scale, not the frequency scale | |
| # From theory, we know that the sampling distribution should follow a | |
| # F distribution. Let's see if does. | |
| # | |
| # We are going to plot the densities of an F distribution for various | |
| # values of an F statistic. First, let's generate a range of F values | |
| # from 0 to 2 with an 0.01 steps. | |
| f <- seq(0, 2, 0.01) | |
| # For each of these values, we want to calculate the density of the F | |
| # distribution with the appropriate parameters. For the F | |
| # distribution, these parameters will be the sample size of the two | |
| # samples minus one | |
| densities <- df(f, # the values that we want to evaluate the density at | |
| df1=(n.male.earnings-1), # first parameter | |
| df2=(n.female.earnings-1) # second parameter | |
| ) | |
| # Now let's overplot this theoretical density over the empirical | |
| # sampling distribution | |
| lines(densities ~ f) | |
| # They line up very well, right?! So now, we can use the theoretical | |
| # F distribution and our observed F statistic to calculate the | |
| # probability of seeing our observed F, or a more extreme value, if | |
| # the null hypothesis was true | |
| upper.p.value = pf(observed.F, | |
| df1=(n.male.earnings-1), | |
| df2=(n.female.earnings-1), | |
| lower.tail=FALSE) # we want to calculate the | |
| # probability of seeing a value | |
| # as large or larger than our | |
| # oubserved.F | |
| p.value = upper.p.value * 2 # we need a two tail test, so we multiply by 2 | |
| p.value | |
| # Now compare all of what we just did to | |
| var.test(male.earnings, female.earnings) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment