Skip to content

Instantly share code, notes, and snippets.

@willium
Last active November 19, 2015 21:12
Show Gist options
  • Save willium/98b4ab719ef71bba1e85 to your computer and use it in GitHub Desktop.
Save willium/98b4ab719ef71bba1e85 to your computer and use it in GitHub Desktop.
###############################################
# SETUP
###############################################
# Work from the assignment folder so the relative file names used below
# (raw_data.csv, data.csv) resolve against it.
setwd(
  "/Users/willium/Documents/School/uw/2015-2016/Quarter 1/Stat 311/assignments/group-2"
)
# Read the raw export into `d`. Text columns are kept as character (as.is),
# and both empty cells and the literal string "NA" are read as missing.
d <- read.csv(
  file = "raw_data.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  as.is = TRUE
)
###############################################
# DATA ORGANIZATION
###############################################
# Re-home each dotted identifier column under a dot-free name. The new column
# is appended to the frame as a factor, then the dotted original is dropped.
d$CandidateName <- factor(d$Candidate.Name)
d$Candidate.Name <- NULL

d$SpenderName <- factor(d$Spender.Name)
d$Spender.Name <- NULL

d$CandidateParty <- factor(d$Candidate.Party)
d$Candidate.Party <- NULL

# Expenditure amounts arrive as text: strip the first "$" and every ","
# before parsing them as numbers.
amount_text <- as.character(d$Expenditure.Amount)
d$Expenditure.Amount <- NULL
amount_text <- sub("\\$", "", amount_text)
amount_text <- gsub("\\,", "", amount_text)
d$ExpenditureAmount <- as.numeric(amount_text)

# Parse the expenditure dates as month/day/two-digit-year.
# NOTE(review): assumes the raw file really uses two-digit years -- confirm.
parsed_dates <- as.Date(d$Expenditure.Date, format = "%m/%d/%y")
d$ExpenditureDate <- parsed_dates
d$Expenditure.Date <- NULL

# Purpose keeps its name and position; it is only converted to a factor.
d$Purpose <- factor(d$Purpose)

# Save the cleaned frame next to the raw file.
write.csv(x = d, file = "data.csv")
###############################################
# FUNCTIONS
###############################################
# Population (unadjusted) variance of x: the mean squared deviation from the
# mean, i.e. the divisor is length(x) rather than length(x) - 1.
#   x: numeric vector treated as the entire population.
# Returns a single number: 0 for a one-element population, NA for an empty
# vector, and NA when x contains missing values.
pop.var <- function(x) {
  if (length(x) == 0L) {
    return(NA_real_)
  }
  # Computed directly instead of rescaling var(): var() is NA for a single
  # observation, which made the one-element case NA rather than 0.
  mean((x - mean(x))^2)
}

# Population (unadjusted) standard deviation of x: sqrt(pop.var(x)).
pop.sd <- function(x) sqrt(pop.var(x))

# Theoretical SD of the sampling distribution of the mean.
#   x: population standard deviation.
#   n: size of each sample (NOT the number of samples drawn).
sampling.sd <- function(x, n) x / sqrt(n)
###############################################
# ASSIGNMENT
###############################################
# 1 #
# Describe your population. Your descriptions should be limited to your single variable and should include visual, numerical, and verbal descriptions. Since we are treating the data as the population, you need to make sure you use the correct formula for finding a population standard deviation. Many software packages default to a sample SD. In StatCrunch you can get a population SD by requesting “Unadj. std. dev.” from the selection of Summary statistics.
# Population-level quantities reused below: the mean, the population
# (unadjusted) SD, and ME, the half-width of the plotting window. ME is chosen
# so that a window centred on the mean has its right edge at 5000.
pop_mean <- mean(d$ExpenditureAmount)
popsd <- pop.sd(d$ExpenditureAmount)
ME <- 5000 - pop_mean

# Numerical description: summary statistics, then the population size.
summary(d$ExpenditureAmount)
length(d$ExpenditureAmount)

# Visual description: histogram of the amounts, viewed through the window
# mean +/- ME. The histogram object is kept in `h` for later use.
h <- hist(
  d$ExpenditureAmount,
  xlim = pop_mean + c(-ME, ME),
  breaks = 10,
  density = 10,
  col = "steelblue",
  main = "",
  xlab = "Expenditure Amount ($)"
)
box()
# visual: histograms
# numerical: 5 number summary, sd, mean, iqr
# verbal: spread, center, adjustment
# 2 #
# Use the mean and SD of your variable to create a normal model. Compare the normal model to the distribution of your variable and explain why you think the model is or is not useful.
# Normal model with the population mean and population SD, evaluated on a
# 100-point grid spanning the histogram's x-window (mean +/- ME).
mu <- mean(d$ExpenditureAmount)
grid_x <- seq(from = mu - ME, to = mu + ME, length.out = 100)

# Rescale the density to the histogram's count axis: multiply by the spacing
# between the first two bin midpoints and by the number of observations.
bin_width <- diff(h$mids[1:2])
n_obs <- length(d$ExpenditureAmount)
curve_y <- dnorm(grid_x, mean = mu, sd = popsd) * bin_width * n_obs

# Draw the curve on top of the existing histogram.
lines(grid_x, curve_y, col = "black", lwd = 2)
# xseq<-seq(-5000,5000,.01)
# densities <- dnorm(xseq, mean=mean(d$ExpenditureAmount), sd=popsd)
#
# plot(xseq, densities, xlab="", ylab="Density", type="l",lwd=2, cex=2, main="PDF of Expenditure Amount Normal", cex.axis=.8)
# plot normal model
# not useful because it's heavily skewed; it might be if you use a logarithmic transformation
# 3. Draw 100 simple random samples (sampling with replacement) from your population for each of n = 10, n = 25, and n = 50. For each sample calculate the mean and create a histogram of the sampling distribution of the mean. This will result in three histograms, one for each sample size. Also calculate the mean and SD for each sampling distribution. Then calculate the theoretical mean and SD for each of the three sampling distributions based on the mean and SD of your population. You might end up with a table that looks something like Table 1 shown below (be sure to adjust your caption as appropriate). Comment on how the means and SDs from the three sampling distributions you simulated compare to the theoretical means and SDs. Also, comment on any differences you observe between samples of size n = 10, n = 25 and n = 50.
# Three histograms side by side, one per sample size.
par(mfrow = c(1, 3))
samples <- 100 # number of simple random samples drawn for each sample size
wide <- 2500   # shared x-axis upper limit, so the three panels are comparable
high <- 50     # shared y-axis upper limit

# For each sample size n: draw `samples` samples of size n with replacement,
# record each sample's mean, report the simulated summary statistics and the
# theoretical SD, and plot the sampling distribution of the mean.
# After the loop, `n` and `means` hold the values for the last size (n = 50).
for (n in c(10, 25, 50)) {
  means <- vapply(
    seq_len(samples),
    function(i) mean(sample(d$ExpenditureAmount, n, replace = TRUE)),
    numeric(1)
  )

  # Simulated sampling distribution: summary (includes the mean), SD, and the
  # number of sample means collected.
  print(summary(means))
  print(sd(means))
  print(length(means))

  # Theoretical SD of the sample mean is the population SD over sqrt(n), where
  # n is the size of each sample. The number of samples (length(means)) was
  # used here before, which gave the same wrong value for every sample size.
  print(sampling.sd(popsd, n))

  hist(
    means,
    xlim = c(0, wide),
    ylim = c(0, high),
    breaks = 10,
    density = 10,
    main = paste("n =", n),
    xlab = "Mean Expenditure Amount ($)",
    col = "steelblue"
  )
  box()
}
# 4. Repeat Step 2 for your sample of size n = 50 only.
##### EXTRA ####
#
# d$LogExpenditureAmount <- log(d$ExpenditureAmount)
# summary(d$LogExpenditureAmount)
# leaSD <- pop.sd(d$LogExpenditureAmount)
# hist(d$LogExpenditureAmount, main="", xlab="Log Expenditure Amount ($)")
# hist(d$LogExpenditureAmount, main="With normal curve", xlab="Log Expenditure Amount ($)", prob=TRUE)
# curve(dnorm(x, mean=mean(d$LogExpenditureAmount), sd=leaSD), add=TRUE)
@jsalazare
Copy link

Gjgg

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment