willium · November 19, 2015 21:12 · jsalazare · Nov 19, 2015
diff --git a/analysis.R b/analysis.R
 ###############################################
 # SETUP
 ###############################################

 # Point the session at the assignment folder so the relative file names
 # used below resolve.
 setwd(
   "/Users/willium/Documents/School/uw/2015-2016/Quarter 1/Stat 311/assignments/group-2"
 )

 # Read raw_data.csv into `d` (the data). The first row supplies the column
 # names, strings are not converted to factors here (as.is = TRUE), and both
 # empty cells and the literal text "NA" are read as missing values.
 d <- read.csv(
   file = "raw_data.csv",
   header = TRUE,
   as.is = TRUE,
   na.strings = c("", "NA")
 )

 ###############################################
 # DATA ORGANIZATION
 ###############################################

 # Rename the dotted columns to CamelCase. Each new column is appended before
 # the old one is dropped, so the renamed columns end up at the right-hand
 # side of `d` in the order they are created here.

 # Categorical identifiers become factors.
 d$CandidateName <- factor(d$Candidate.Name)
 d$Candidate.Name <- NULL

 d$SpenderName <- factor(d$Spender.Name)
 d$Spender.Name <- NULL

 d$CandidateParty <- factor(d$Candidate.Party)
 d$Candidate.Party <- NULL

 # Expenditure amounts are cleaned as text: drop the first dollar sign and
 # every comma, then convert what is left to numeric.
 ea <- d$Expenditure.Amount
 d$Expenditure.Amount <- NULL
 eaNoDollar <- sub("\\$", "", as.character(ea))
 eaClean <- as.numeric(gsub("\\,", "", eaNoDollar))
 d$ExpenditureAmount <- eaClean

 # Dates are parsed as month/day/two-digit-year.
 d$ExpenditureDate <- as.Date(d$Expenditure.Date, "%m/%d/%y")
 d$Expenditure.Date <- NULL

 # Purpose keeps its name and position; only its type changes.
 d$Purpose <- factor(d$Purpose)

 # Save the cleaned data next to the raw file.
 write.csv(x = d, file = "data.csv")

 ###############################################
 # FUNCTIONS
 ###############################################

 # Population variance of x: R's var() divides by (n - 1), so rescale it to
 # an n denominator.
 pop.var <- function(x) {
   n <- length(x)
   var(x) * (n - 1) / n
 }

 # Population standard deviation of x: square root of pop.var(x).
 pop.sd <- function(x) {
   sqrt(pop.var(x))
 }

 # Standard deviation of the sampling distribution of the mean, where x is
 # the population SD and n is the size of each sample.
 sampling.sd <- function(x, n) {
   x / sqrt(n)
 }

 ###############################################
 # ASSIGNMENT
 ###############################################

 # 1 #
 # Describe your population. Your descriptions should be limited to your single variable and should include visual, numerical, and verbal descriptions. Since we are treating the data as the population, you need to make sure you use the correct formula for finding a population standard deviation. Many software packages default to a sample SD. In StatCrunch you can get a population SD by requesting “Unadj. std. dev.” from the selection of Summary statistics.

 # Numerical description of the population.
 summary(d$ExpenditureAmount)
 popsd <- pop.sd(d$ExpenditureAmount)  # population (unadjusted) SD
 length(d$ExpenditureAmount)

 # ME is the half-width of the plotting window: the window is centred on the
 # mean and ME is chosen so that its right edge (mean + ME) sits at 5000.
 ME <- 5000 - mean(d$ExpenditureAmount)

 # Histogram of the population; the returned object is kept in `h` so the
 # bin midpoints can be reused later.
 h <- hist(
   d$ExpenditureAmount,
   xlim = mean(d$ExpenditureAmount) + c(-ME, ME),
   breaks = 10,
   density = 10,
   col = "steelblue",
   main = "",
   xlab = "Expenditure Amount ($)"
 )
 box()

 # visual: histograms
 # numerical: 5 number summary, sd, mean, iqr
 # verbal: spread, center, adjustment

 # 2 #
 # Use the mean and SD of your variable to create a normal model. Compare the normal model to the distribution of your variable and explain why you think the model is or is not useful.

 # Overlay the normal model (population mean, popsd) on the current plot.
 # The density is put on the histogram's count scale by multiplying by the
 # bin width (spacing of the first two midpoints of `h`) and by the number
 # of observations.
 xfit <- seq(
   from = mean(d$ExpenditureAmount) - ME,
   to = mean(d$ExpenditureAmount) + ME,
   length.out = 100
 )
 yfit <- dnorm(xfit, mean = mean(d$ExpenditureAmount), sd = popsd) *
   diff(h$mids[1:2]) * length(d$ExpenditureAmount)
 lines(xfit, yfit, col = "black", lwd = 2)

 # xseq<-seq(-5000,5000,.01)
 # densities <- dnorm(xseq, mean=mean(d$ExpenditureAmount), sd=popsd)
 #
 # plot(xseq, densities, xlab="", ylab="Density", type="l",lwd=2, cex=2, main="PDF of Expenditure Amount Normal", cex.axis=.8)

 # plot normal model
 # not useful because it's heavily skewed; it might be if you use a logarithmic transformation

 # 3. Draw 100 simple random samples (sampling with replacement) from your population for each of n = 10, n = 25, and n = 50. For each sample calculate the mean and create a histogram of the sampling distribution of the mean. This will result in three histograms, one for each sample size. Also calculate the mean and SD for each sampling distribution. Then calculate the theoretical mean and SD for each of the three sampling distributions based on the mean and SD of your population. You might end up with a table that looks something like Table 1 shown below (be sure to adjust your caption as appropriate). Comment on how the means and SDs from the three sampling distributions you simulated compare to the theoretical means and SDs. Also, comment on any differences you observe between samples of size n = 10, n = 25 and n = 50.

 # Lay the three sampling-distribution histograms out side by side.
 par(mfrow = c(1, 3))

 samples <- 100  # number of random samples drawn for each sample size
 wide <- 2500    # shared upper x-axis limit of the histograms
 high <- 50      # shared upper y-axis limit of the histograms

 # For each sample size n: draw `samples` samples with replacement from the
 # population, keep the mean of each one, report the simulated centre/spread
 # of those means next to the theoretical SD, and plot their histogram.
 # After the loop `n` and `means` hold the values for the last size (n = 50).
 for (n in c(10, 25, 50)) {
   means <- vapply(
     seq_len(samples),
     function(i) mean(sample(d$ExpenditureAmount, n, replace = TRUE)),
     numeric(1)
   )

   # print() is explicit because values are not auto-printed inside a loop.
   print(summary(means))
   print(sd(means))
   print(length(means))
   # Theoretical SD of the sample mean is popsd / sqrt(n), where n is the
   # size of each sample -- not the number of samples that were drawn.
   print(sampling.sd(popsd, n))

   hist(
     means,
     xlim = c(0, wide),
     ylim = c(0, high),
     breaks = 10,
     density = 10,
     main = paste("n =", n),
     xlab = "Mean Expenditure Amount ($)",
     col = "steelblue"
   )
   box()
 }

 # 4. Repeat Step 2 for your sample of size n = 50 only.


 ##### EXTRA ####
 #
 # d$LogExpenditureAmount <- log(d$ExpenditureAmount)
 # summary(d$LogExpenditureAmount)
 # leaSD <- pop.sd(d$LogExpenditureAmount)
 # hist(d$LogExpenditureAmount, main="", xlab="Log Expenditure Amount ($)")

 # hist(d$LogExpenditureAmount, main="With normal curve", xlab="Log Expenditure Amount ($)", prob=TRUE)
 # curve(dnorm(x, mean=mean(d$LogExpenditureAmount), sd=leaSD), add=TRUE)
	###############################################
	# SETUP
	###############################################

	# Point the session at the assignment folder so the relative file names
	# used below resolve.
	setwd(
	  "/Users/willium/Documents/School/uw/2015-2016/Quarter 1/Stat 311/assignments/group-2"
	)

	# Read raw_data.csv into `d` (the data). The first row supplies the column
	# names, strings are not converted to factors here (as.is = TRUE), and both
	# empty cells and the literal text "NA" are read as missing values.
	d <- read.csv(
	  file = "raw_data.csv",
	  header = TRUE,
	  as.is = TRUE,
	  na.strings = c("", "NA")
	)

	###############################################
	# DATA ORGANIZATION
	###############################################

	# Rename the dotted columns to CamelCase. Each new column is appended before
	# the old one is dropped, so the renamed columns end up at the right-hand
	# side of `d` in the order they are created here.

	# Categorical identifiers become factors.
	d$CandidateName <- factor(d$Candidate.Name)
	d$Candidate.Name <- NULL

	d$SpenderName <- factor(d$Spender.Name)
	d$Spender.Name <- NULL

	d$CandidateParty <- factor(d$Candidate.Party)
	d$Candidate.Party <- NULL

	# Expenditure amounts are cleaned as text: drop the first dollar sign and
	# every comma, then convert what is left to numeric.
	ea <- d$Expenditure.Amount
	d$Expenditure.Amount <- NULL
	eaNoDollar <- sub("\\$", "", as.character(ea))
	eaClean <- as.numeric(gsub("\\,", "", eaNoDollar))
	d$ExpenditureAmount <- eaClean

	# Dates are parsed as month/day/two-digit-year.
	d$ExpenditureDate <- as.Date(d$Expenditure.Date, "%m/%d/%y")
	d$Expenditure.Date <- NULL

	# Purpose keeps its name and position; only its type changes.
	d$Purpose <- factor(d$Purpose)

	# Save the cleaned data next to the raw file.
	write.csv(x = d, file = "data.csv")

	###############################################
	# FUNCTIONS
	###############################################

	# Population variance of x: R's var() divides by (n - 1), so rescale it to
	# an n denominator.
	pop.var <- function(x) {
		n <- length(x)
		var(x) * (n - 1) / n
	}

	# Population standard deviation of x: square root of pop.var(x).
	pop.sd <- function(x) {
		sqrt(pop.var(x))
	}

	# Standard deviation of the sampling distribution of the mean, where x is
	# the population SD and n is the size of each sample.
	sampling.sd <- function(x, n) {
		x / sqrt(n)
	}

	###############################################
	# ASSIGNMENT
	###############################################

	# 1 #
	# Describe your population. Your descriptions should be limited to your single variable and should include visual, numerical, and verbal descriptions. Since we are treating the data as the population, you need to make sure you use the correct formula for finding a population standard deviation. Many software packages default to a sample SD. In StatCrunch you can get a population SD by requesting “Unadj. std. dev.” from the selection of Summary statistics.

	# Numerical description of the population.
	summary(d$ExpenditureAmount)
	popsd <- pop.sd(d$ExpenditureAmount)  # population (unadjusted) SD
	length(d$ExpenditureAmount)

	# ME is the half-width of the plotting window: the window is centred on the
	# mean and ME is chosen so that its right edge (mean + ME) sits at 5000.
	ME <- 5000 - mean(d$ExpenditureAmount)

	# Histogram of the population; the returned object is kept in `h` so the
	# bin midpoints can be reused later.
	h <- hist(
		d$ExpenditureAmount,
		xlim = mean(d$ExpenditureAmount) + c(-ME, ME),
		breaks = 10,
		density = 10,
		col = "steelblue",
		main = "",
		xlab = "Expenditure Amount ($)"
	)
	box()

	# visual: histograms
	# numerical: 5 number summary, sd, mean, iqr
	# verbal: spread, center, adjustment

	# 2 #
	# Use the mean and SD of your variable to create a normal model. Compare the normal model to the distribution of your variable and explain why you think the model is or is not useful.

	# Overlay the normal model (population mean, popsd) on the current plot.
	# The density is put on the histogram's count scale by multiplying by the
	# bin width (spacing of the first two midpoints of `h`) and by the number
	# of observations.
	xfit <- seq(
		from = mean(d$ExpenditureAmount) - ME,
		to = mean(d$ExpenditureAmount) + ME,
		length.out = 100
	)
	yfit <- dnorm(xfit, mean = mean(d$ExpenditureAmount), sd = popsd) *
		diff(h$mids[1:2]) * length(d$ExpenditureAmount)
	lines(xfit, yfit, col = "black", lwd = 2)

	# xseq<-seq(-5000,5000,.01)
	# densities <- dnorm(xseq, mean=mean(d$ExpenditureAmount), sd=popsd)
	#
	# plot(xseq, densities, xlab="", ylab="Density", type="l",lwd=2, cex=2, main="PDF of Expenditure Amount Normal", cex.axis=.8)

	# plot normal model
	# not useful because it's heavily skewed; it might be if you use a logarithmic transformation

	# 3. Draw 100 simple random samples (sampling with replacement) from your population for each of n = 10, n = 25, and n = 50. For each sample calculate the mean and create a histogram of the sampling distribution of the mean. This will result in three histograms, one for each sample size. Also calculate the mean and SD for each sampling distribution. Then calculate the theoretical mean and SD for each of the three sampling distributions based on the mean and SD of your population. You might end up with a table that looks something like Table 1 shown below (be sure to adjust your caption as appropriate). Comment on how the means and SDs from the three sampling distributions you simulated compare to the theoretical means and SDs. Also, comment on any differences you observe between samples of size n = 10, n = 25 and n = 50.

	# Lay the three sampling-distribution histograms out side by side.
	par(mfrow = c(1, 3))

	samples <- 100  # number of random samples drawn for each sample size
	wide <- 2500    # shared upper x-axis limit of the histograms
	high <- 50      # shared upper y-axis limit of the histograms

	# For each sample size n: draw `samples` samples with replacement from the
	# population, keep the mean of each one, report the simulated centre/spread
	# of those means next to the theoretical SD, and plot their histogram.
	# After the loop `n` and `means` hold the values for the last size (n = 50).
	for (n in c(10, 25, 50)) {
		means <- vapply(
			seq_len(samples),
			function(i) mean(sample(d$ExpenditureAmount, n, replace = TRUE)),
			numeric(1)
		)

		# print() is explicit because values are not auto-printed inside a loop.
		print(summary(means))
		print(sd(means))
		print(length(means))
		# Theoretical SD of the sample mean is popsd / sqrt(n), where n is the
		# size of each sample -- not the number of samples that were drawn.
		print(sampling.sd(popsd, n))

		hist(
			means,
			xlim = c(0, wide),
			ylim = c(0, high),
			breaks = 10,
			density = 10,
			main = paste("n =", n),
			xlab = "Mean Expenditure Amount ($)",
			col = "steelblue"
		)
		box()
	}

	# 4. Repeat Step 2 for your sample of size n = 50 only.


	##### EXTRA ####
	#
	# d$LogExpenditureAmount <- log(d$ExpenditureAmount)
	# summary(d$LogExpenditureAmount)
	# leaSD <- pop.sd(d$LogExpenditureAmount)
	# hist(d$LogExpenditureAmount, main="", xlab="Log Expenditure Amount ($)")

	# hist(d$LogExpenditureAmount, main="With normal curve", xlab="Log Expenditure Amount ($)", prob=TRUE)
	# curve(dnorm(x, mean=mean(d$LogExpenditureAmount), sd=leaSD), add=TRUE)