Last active
November 19, 2015 21:12
-
-
Save willium/98b4ab719ef71bba1e85 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################### | |
# SETUP | |
############################################### | |
# set working directory
# The project directory below is specific to the original author's machine.
# Switch to it only when it exists; otherwise stay in the current working
# directory (which must then contain raw_data.csv) instead of aborting on
# setwd().
project_dir <- "/Users/willium/Documents/School/uw/2015-2016/Quarter 1/Stat 311/assignments/group-2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project directory not found; using current directory: ", getwd(),
    call. = FALSE
  )
}
# load raw_data.csv into variable d; strings are kept as character (as.is)
# and both empty cells and the literal "NA" are read as missing values
d <- read.csv(
  "raw_data.csv",
  header = TRUE,
  as.is = TRUE,
  na.strings = c("", "NA")
) # d is data
############################################### | |
# DATA ORGANIZATION | |
############################################### | |
# Copy each dotted column into a CamelCase column stored as a factor, then
# drop the dotted original. New columns are appended in the order listed.
factor_columns <- c(
  Candidate.Name = "CandidateName",
  Spender.Name = "SpenderName",
  Candidate.Party = "CandidateParty"
)
for (source_name in names(factor_columns)) {
  target_name <- factor_columns[[source_name]]
  d[[target_name]] <- factor(d[[source_name]])
  d[[source_name]] <- NULL
}

# Expenditure amounts: strip the first "$" and every "," from the text, then
# convert to numeric.
amount_text <- as.character(d$Expenditure.Amount)
d$Expenditure.Amount <- NULL
amount_text <- sub("$", "", amount_text, fixed = TRUE)
amount_text <- gsub(",", "", amount_text, fixed = TRUE)
d$ExpenditureAmount <- as.numeric(amount_text)

# Expenditure dates are parsed as month/day/two-digit-year.
d$ExpenditureDate <- as.Date(d$Expenditure.Date, format = "%m/%d/%y")
d$Expenditure.Date <- NULL

# Purpose keeps its name and position; it is only converted to a factor.
d$Purpose <- factor(d$Purpose)

# Write the cleaned data set to the working directory.
write.csv(x = d, file = "data.csv")
############################################### | |
# FUNCTIONS | |
############################################### | |
# Population (unadjusted) variance of x: the mean squared deviation from the
# mean, i.e. the sum of squares divided by N rather than N - 1.
#   x: numeric vector treated as the entire population.
# Returns a single number. A one-element population yields 0 (scaling var()
# would give NA there, because var() is NA for a single value). The result is
# NA when x contains NA and NaN when x is empty.
pop.var <- function(x) mean((x - mean(x))^2)
# Population standard deviation of x: the square root of pop.var(x).
#   x: numeric vector treated as the entire population.
pop.sd <- function(x) {
  variance <- pop.var(x)
  sqrt(variance)
}
# Scale a standard deviation by the square root of a sample size.
#   x: standard deviation to scale (the script passes the population SD)
#   n: value whose square root divides x
# Returns x / sqrt(n); vectorized over both arguments.
sampling.sd <- function(x, n) {
  root_n <- sqrt(n)
  x / root_n
}
############################################### | |
# ASSIGNMENT | |
############################################### | |
# 1 # | |
# Describe your population. Your descriptions should be limited to your single variable and should include visual, numerical, and verbal descriptions. Since we are treating the data as the population, you need to make sure you use the correct formula for finding a population standard deviation. Many software packages default to a sample SD. In StatCrunch you can get a population SD by requesting “Unadj. std. dev.” from the selection of Summary statistics. | |
# Numerical description of the population: five-number summary with mean,
# the population SD (kept in popsd for later steps), and the population size.
amounts <- d$ExpenditureAmount
pop_mean <- mean(amounts)
summary(amounts)
popsd <- pop.sd(amounts)
length(amounts)

# Visual description: histogram drawn over a window that is symmetric about
# the mean and whose upper edge is 5000. ME is the half-width of that window.
ME <- 5000 - pop_mean
h <- hist(
  amounts,
  xlim = c(pop_mean - ME, pop_mean + ME),
  breaks = 10,
  density = 10,
  col = "steelblue",
  main = "",
  xlab = "Expenditure Amount ($)"
)
box()
# visual: histograms | |
# numerical: 5 number summary, sd, mean, iqr | |
# verbal: spread, center, adjustment | |
# 2 # | |
# Use the mean and SD of your variable to create a normal model. Compare the normal model to the distribution of your variable and explain why you think the model is or is not useful. | |
# Overlay a normal curve on the existing histogram h. The density is scaled by
# the bin width and the population size so it is on the same (count) scale as
# the histogram bars.
pop_mean <- mean(d$ExpenditureAmount)
bin_width <- diff(h$mids[1:2])
curve_x <- seq(pop_mean - ME, pop_mean + ME, length.out = 100)
curve_y <- dnorm(curve_x, mean = pop_mean, sd = popsd)
curve_y <- curve_y * bin_width * length(d$ExpenditureAmount)
lines(curve_x, curve_y, col = "black", lwd = 2)
# xseq<-seq(-5000,5000,.01) | |
# densities <- dnorm(xseq, mean=mean(d$ExpenditureAmount), sd=popsd) | |
# | |
# plot(xseq, densities, xlab="", ylab="Density", type="l",lwd=2, cex=2, main="PDF of Expenditure Amount Normal", cex.axis=.8) | |
# plot normal model | |
# not useful because it's heavily skewed; it might be if you use a logarithmic transformation
# 3. Draw 100 simple random samples (sampling with replacement) from your population for each of n = 10, n = 25, and n = 50. For each sample calculate the mean and create a histogram of the sampling distribution of the mean. This will result in three histograms, one for each sample size. Also calculate the mean and SD for each sampling distribution. Then calculate the theoretical mean and SD for each of the three sampling distributions based on the mean and SD of your population. You might end up with a table that looks something like Table 1 shown below (be sure to adjust your caption as appropriate). Comment on how the means and SDs from the three sampling distributions you simulated compare to the theoretical means and SDs. Also, comment on any differences you observe between samples of size n = 10, n = 25 and n = 50. | |
# Simulate the sampling distribution of the mean for n = 10, 25 and 50.
# For each sample size, draw `samples` simple random samples (with
# replacement) from the population, record each sample's mean, print the
# simulated summary/SD next to the theoretical SD, and draw a histogram.
# The three histograms share one row and the same axis limits so they can be
# compared directly.
par(mfrow = c(1, 3))
samples <- 100 # number of samples drawn per sample size
wide <- 2500   # common upper x-axis limit for the histograms
high <- 50     # common upper y-axis limit for the histograms

for (n in c(10, 25, 50)) {
  # one mean per simulated sample of size n
  means <- vapply(
    seq_len(samples),
    function(i) mean(sample(d$ExpenditureAmount, n, replace = TRUE)),
    numeric(1)
  )

  # explicit print() because values are not auto-printed inside a loop
  print(summary(means))
  print(sd(means))
  print(length(means))
  # The theoretical SD of the sample mean is sigma / sqrt(n), where n is the
  # size of each sample -- not the number of samples drawn (length(means)).
  print(sampling.sd(popsd, n))

  hist(
    means,
    xlim = c(0, wide),
    ylim = c(0, high),
    breaks = 10,
    density = 10,
    main = paste("n =", n),
    xlab = "Mean Expenditure Amount ($)",
    col = "steelblue"
  )
  box()
}
# 4. Repeat Step 2 for your sample of size n = 50 only. | |
##### EXTRA #### | |
# | |
# d$LogExpenditureAmount <- log(d$ExpenditureAmount) | |
# summary(d$LogExpenditureAmount) | |
# leaSD <- pop.sd(d$LogExpenditureAmount) | |
# hist(d$LogExpenditureAmount, main="", xlab="Log Expenditure Amount ($)") | |
# hist(d$LogExpenditureAmount, main="With normal curve", xlab="Log Expenditure Amount ($)", prob=TRUE) | |
# curve(dnorm(x, mean=mean(d$LogExpenditureAmount), sd=leaSD), add=TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Gjgg