Protonk · March 7, 2011 22:47
diff --git a/earnings.R b/earnings.R
 #loads the packae which allows us to import state files
 library(foreign)
 #The file can also be a url to the data on your website, obviously if you load it
 #on your own PC the path will be different. :)
 adams<-read.dta(file="~/Downloads/assignment1.dta")
 #Removes columns which aren't used and moves the eph column
 adams<-adams[,c(7,2:6,8,9,14,15)]

 #This next section converts the binary factors in the state file to factors as
 #R is used to seeing them, one column with a specific level for each (hschool, college)
 race<-c("white","black","othrace")
 schooling<-c("hschool","college")
 sex<-c("male","female")
 #We pull the columns for race, sex and schooling out of the data frame
 racelvl<-as.matrix(subset(adams,select=which(names(adams) %in% race)))
 schoolinglvl<-as.matrix(subset(adams,select=which(names(adams) %in% schooling)))
 sexlvl<-as.matrix(subset(adams,select=which(names(adams) %in% sex)))
 #Then we create a factor for each by premultiplying the n by k matrix by the k levels 
 #of the factor, giving us an n by 1 matrix of levels and add them to the dataframe
 adams$race<-factor(racelvl%*%(1:ncol(racelvl)), labels = colnames(racelvl))
 adams$schooling<-factor(schoolinglvl%*%(1:ncol(schoolinglvl)), labels = colnames(schoolinglvl))
 adams$sex<-factor(sexlvl%*%(1:ncol(sexlvl)), labels = colnames(sexlvl))
 #This part is just housekeeping.  We remove the old columns and remove the matrices
 #we created from memory.
 oldlvls<-c(race,sex,schooling)
 adams <- adams[, !(names(adams) %in% oldlvls)]
 rm(racelvl,schoolinglvl,sexlvl)
 #More houskeeping, though this will materially change the estimates.
 #We set the values of eph which are 0 to NA and remove them from the data
 #The last line simply characterizes the state as a categorical variable
 adams[adams==0]<-NA
 adams<-subset(adams,!is.na(adams$eph))
 adams$state<-factor(adams$state)

 #This is the t-test, the uncorrected regression and the regression with controls, respectively.
 schooling.t<-t.test(eph ~ schooling, data=adams, alternative="two.sided", var.equal=TRUE, conf.level=.95)
 simple.lm<-lm(log(eph) ~ schooling, data=adams)
 more.lm<-lm(log(eph) ~ schooling + age + I(age^2) + sex + race, data=adams);


 #This is the density plot.  The subsetting of the data below is a bit kludgy but
 #it is a bit easier to do it this way with density()
 forcol<-subset(adams,schooling=="college")
 forhs<-subset(adams,schooling=="hschool")
 plot(density(forhs$eph,na.rm=TRUE),type="n",main="College and High School Earnings",xlab="Earnings per Hour")
 lines(density(forhs$eph,na.rm=TRUE),col=4)
 lines(density(forcol$eph,na.rm=TRUE),col=3)
 text(20,0.06,labels="High School",col=4,cex=0.8)
 text(40,0.02,labels="College",col=3,cex=0.8);

 #Age earnings profile
 plot(log(eph)~jitter(age,factor=3),data=adams,type="p",col=rgb(red=0.5, green=0.5, blue=0.5, alpha=0.15),pch=19,main="A Hint at the Age-Earnings Profile",xlab="Age",ylab=expression(log(Earnings)))

 #The below code generated our two bar plots for college graduation and earnings.
 #The library plyr is not part of base R and must be downloaded via
 # install.packages("plyr"), then installed w/ library().  We use plyr to
 #give easy access to the conditional means of the dataset (it can do a lot more)
 #breaking down the data by state.  The for loop exists to compute the percentage of
 #college graduates.  It is a bit messy but it works.
 library(plyr)
 state.sch<-ddply(adams, .(state,schooling),"nrow")
 state.prop<-numeric(0)
 for (i in seq(1, 18, by = 2)) { 
 	state.prop[i]<-(state.sch[i+1,3]/(state.sch[i+1,3]+state.sch[i,3]))
 	}
 state.prop<-state.prop[seq(1, 18, by = 2)]
 names(state.prop)<-as.character(state.sch[seq(1, 18, by = 2),1])
 barplot(state.prop,main="Proportion of College Graduates in the\nLabor Force")

 state.eph<-ddply(adams, .(state), summarize, mean.eph= mean(eph))
 with(state.eph,barplot(mean.eph, names.arg=state,main="Mean Earnings by State",ylab="Earnings per Hour"))

 #Here we re-use state.prop to serve as part of the first state IV.  It is 
 #converted to a data frame in order to allow us to merge it easily with
 #the data for the IV.
 state.prop.iv<-as.data.frame(as.matrix(state.prop))
 colnames(state.prop.iv)<-"prop.college"
 state.prop.iv$state<-rownames(state.prop.iv)
 rownames(state.prop.iv)<-NULL

 #The sources for the IVs are in somewhat messy tables build for web display.
 #Instead of rehashing the steps to get them into a simple tabular form I just 
 #recreated the eventual data frame.  Sources are identified in the homework
 state.iv<-structure(list(state = c("CT", "MA", "ME", "NH", "NJ", "NY", 
 "PA", "RI", "VT"), expenditure = c(946042.616752, 1413092.7716, 
 348656.5172, 317483.926258, 2480931.9152, 5743924.8648, 3869841.293705, 
 304841.3876, 280731.1066), grants = c(20550.3636363636, 53072.0909090909, 
 4726.18181818182, 953.727272727273, 115471.454545455, 522320.909090909, 
 178854.636363636, 7778.27272727273, 10872.4545454545), require.income = c(24.9, 
 20.6, 21.1, 25.6, 18.6, 27.1, 28.6, 31.3, 34.1)), .Names = c("state", 
 "expenditure", "grants", "require.income"), row.names = c(NA, 
 -9L), class = "data.frame")

 state.test<-merge(state.iv,state.prop.iv,by="state")

 #Normalize each of the IVs.  This isn't necessary but it makes coefficients
 #on the independent variables a little more sensible.
 state.test$expenditure<-state.test$expenditure/mean(state.test$expenditure)
 state.test$grants<-state.test$grants/mean(state.test$grants)
 state.test$require.income<-state.test$require.income/100

 #code to compute the truncated normal example from part I is at
 #https://gist.github.com/850512#file_bootstrap truncated.r
	#loads the packae which allows us to import state files
	library(foreign)
	#The file can also be a url to the data on your website, obviously if you load it
	#on your own PC the path will be different. :)
	adams<-read.dta(file="~/Downloads/assignment1.dta")
	#Removes columns which aren't used and moves the eph column
	adams<-adams[,c(7,2:6,8,9,14,15)]

	#This next section converts the binary factors in the state file to factors as
	#R is used to seeing them, one column with a specific level for each (hschool, college)
	race<-c("white","black","othrace")
	schooling<-c("hschool","college")
	sex<-c("male","female")
	#We pull the columns for race, sex and schooling out of the data frame
	racelvl<-as.matrix(subset(adams,select=which(names(adams) %in% race)))
	schoolinglvl<-as.matrix(subset(adams,select=which(names(adams) %in% schooling)))
	sexlvl<-as.matrix(subset(adams,select=which(names(adams) %in% sex)))
	#Then we create a factor for each by premultiplying the n by k matrix by the k levels
	#of the factor, giving us an n by 1 matrix of levels and add them to the dataframe
	adams$race<-factor(racelvl%*%(1:ncol(racelvl)), labels = colnames(racelvl))
	adams$schooling<-factor(schoolinglvl%*%(1:ncol(schoolinglvl)), labels = colnames(schoolinglvl))
	adams$sex<-factor(sexlvl%*%(1:ncol(sexlvl)), labels = colnames(sexlvl))
	#This part is just housekeeping. We remove the old columns and remove the matrices
	#we created from memory.
	oldlvls<-c(race,sex,schooling)
	adams <- adams[, !(names(adams) %in% oldlvls)]
	rm(racelvl,schoolinglvl,sexlvl)
	#More houskeeping, though this will materially change the estimates.
	#We set the values of eph which are 0 to NA and remove them from the data
	#The last line simply characterizes the state as a categorical variable
	adams[adams==0]<-NA
	adams<-subset(adams,!is.na(adams$eph))
	adams$state<-factor(adams$state)

	#This is the t-test, the uncorrected regression and the regression with controls, respectively.
	schooling.t<-t.test(eph ~ schooling, data=adams, alternative="two.sided", var.equal=TRUE, conf.level=.95)
	simple.lm<-lm(log(eph) ~ schooling, data=adams)
	more.lm<-lm(log(eph) ~ schooling + age + I(age^2) + sex + race, data=adams);


	#This is the density plot. The subsetting of the data below is a bit kludgy but
	#it is a bit easier to do it this way with density()
	forcol<-subset(adams,schooling=="college")
	forhs<-subset(adams,schooling=="hschool")
	plot(density(forhs$eph,na.rm=TRUE),type="n",main="College and High School Earnings",xlab="Earnings per Hour")
	lines(density(forhs$eph,na.rm=TRUE),col=4)
	lines(density(forcol$eph,na.rm=TRUE),col=3)
	text(20,0.06,labels="High School",col=4,cex=0.8)
	text(40,0.02,labels="College",col=3,cex=0.8);

	#Age earnings profile
	plot(log(eph)~jitter(age,factor=3),data=adams,type="p",col=rgb(red=0.5, green=0.5, blue=0.5, alpha=0.15),pch=19,main="A Hint at the Age-Earnings Profile",xlab="Age",ylab=expression(log(Earnings)))

	#The below code generated our two bar plots for college graduation and earnings.
	#The library plyr is not part of base R and must be downloaded via
	# install.packages("plyr"), then installed w/ library(). We use plyr to
	#give easy access to the conditional means of the dataset (it can do a lot more)
	#breaking down the data by state. The for loop exists to compute the percentage of
	#college graduates. It is a bit messy but it works.
	library(plyr)
	state.sch<-ddply(adams, .(state,schooling),"nrow")
	state.prop<-numeric(0)
	for (i in seq(1, 18, by = 2)) {
	state.prop[i]<-(state.sch[i+1,3]/(state.sch[i+1,3]+state.sch[i,3]))
	}
	state.prop<-state.prop[seq(1, 18, by = 2)]
	names(state.prop)<-as.character(state.sch[seq(1, 18, by = 2),1])
	barplot(state.prop,main="Proportion of College Graduates in the\nLabor Force")

	state.eph<-ddply(adams, .(state), summarize, mean.eph= mean(eph))
	with(state.eph,barplot(mean.eph, names.arg=state,main="Mean Earnings by State",ylab="Earnings per Hour"))

	#Here we re-use state.prop to serve as part of the first state IV. It is
	#converted to a data frame in order to allow us to merge it easily with
	#the data for the IV.
	state.prop.iv<-as.data.frame(as.matrix(state.prop))
	colnames(state.prop.iv)<-"prop.college"
	state.prop.iv$state<-rownames(state.prop.iv)
	rownames(state.prop.iv)<-NULL

	#The sources for the IVs are in somewhat messy tables build for web display.
	#Instead of rehashing the steps to get them into a simple tabular form I just
	#recreated the eventual data frame. Sources are identified in the homework
	state.iv<-structure(list(state = c("CT", "MA", "ME", "NH", "NJ", "NY",
	"PA", "RI", "VT"), expenditure = c(946042.616752, 1413092.7716,
	348656.5172, 317483.926258, 2480931.9152, 5743924.8648, 3869841.293705,
	304841.3876, 280731.1066), grants = c(20550.3636363636, 53072.0909090909,
	4726.18181818182, 953.727272727273, 115471.454545455, 522320.909090909,
	178854.636363636, 7778.27272727273, 10872.4545454545), require.income = c(24.9,
	20.6, 21.1, 25.6, 18.6, 27.1, 28.6, 31.3, 34.1)), .Names = c("state",
	"expenditure", "grants", "require.income"), row.names = c(NA,
	-9L), class = "data.frame")

	state.test<-merge(state.iv,state.prop.iv,by="state")

	#Normalize each of the IVs. This isn't necessary but it makes coefficients
	#on the independent variables a little more sensible.
	state.test$expenditure<-state.test$expenditure/mean(state.test$expenditure)
	state.test$grants<-state.test$grants/mean(state.test$grants)
	state.test$require.income<-state.test$require.income/100

	#code to compute the truncated normal example from part I is at
	#https://gist.github.com/850512#file_bootstrap truncated.r
No results found