Created
March 7, 2011 22:47
-
-
Save Protonk/859429 to your computer and use it in GitHub Desktop.
Age-earnings profile
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #loads the packae which allows us to import state files | |
| library(foreign) | |
| #The file can also be a url to the data on your website, obviously if you load it | |
| #on your own PC the path will be different. :) | |
| adams<-read.dta(file="~/Downloads/assignment1.dta") | |
| #Removes columns which aren't used and moves the eph column | |
| adams<-adams[,c(7,2:6,8,9,14,15)] | |
| #This next section converts the binary factors in the state file to factors as | |
| #R is used to seeing them, one column with a specific level for each (hschool, college) | |
| race<-c("white","black","othrace") | |
| schooling<-c("hschool","college") | |
| sex<-c("male","female") | |
| #We pull the columns for race, sex and schooling out of the data frame | |
| racelvl<-as.matrix(subset(adams,select=which(names(adams) %in% race))) | |
| schoolinglvl<-as.matrix(subset(adams,select=which(names(adams) %in% schooling))) | |
| sexlvl<-as.matrix(subset(adams,select=which(names(adams) %in% sex))) | |
| #Then we create a factor for each by premultiplying the n by k matrix by the k levels | |
| #of the factor, giving us an n by 1 matrix of levels and add them to the dataframe | |
| adams$race<-factor(racelvl%*%(1:ncol(racelvl)), labels = colnames(racelvl)) | |
| adams$schooling<-factor(schoolinglvl%*%(1:ncol(schoolinglvl)), labels = colnames(schoolinglvl)) | |
| adams$sex<-factor(sexlvl%*%(1:ncol(sexlvl)), labels = colnames(sexlvl)) | |
| #This part is just housekeeping. We remove the old columns and remove the matrices | |
| #we created from memory. | |
| oldlvls<-c(race,sex,schooling) | |
| adams <- adams[, !(names(adams) %in% oldlvls)] | |
| rm(racelvl,schoolinglvl,sexlvl) | |
| #More houskeeping, though this will materially change the estimates. | |
| #We set the values of eph which are 0 to NA and remove them from the data | |
| #The last line simply characterizes the state as a categorical variable | |
| adams[adams==0]<-NA | |
| adams<-subset(adams,!is.na(adams$eph)) | |
| adams$state<-factor(adams$state) | |
| #This is the t-test, the uncorrected regression and the regression with controls, respectively. | |
| schooling.t<-t.test(eph ~ schooling, data=adams, alternative="two.sided", var.equal=TRUE, conf.level=.95) | |
| simple.lm<-lm(log(eph) ~ schooling, data=adams) | |
| more.lm<-lm(log(eph) ~ schooling + age + I(age^2) + sex + race, data=adams); | |
| #This is the density plot. The subsetting of the data below is a bit kludgy but | |
| #it is a bit easier to do it this way with density() | |
| forcol<-subset(adams,schooling=="college") | |
| forhs<-subset(adams,schooling=="hschool") | |
| plot(density(forhs$eph,na.rm=TRUE),type="n",main="College and High School Earnings",xlab="Earnings per Hour") | |
| lines(density(forhs$eph,na.rm=TRUE),col=4) | |
| lines(density(forcol$eph,na.rm=TRUE),col=3) | |
| text(20,0.06,labels="High School",col=4,cex=0.8) | |
| text(40,0.02,labels="College",col=3,cex=0.8); | |
| #Age earnings profile | |
| plot(log(eph)~jitter(age,factor=3),data=adams,type="p",col=rgb(red=0.5, green=0.5, blue=0.5, alpha=0.15),pch=19,main="A Hint at the Age-Earnings Profile",xlab="Age",ylab=expression(log(Earnings))) | |
| #The below code generated our two bar plots for college graduation and earnings. | |
| #The library plyr is not part of base R and must be downloaded via | |
| # install.packages("plyr"), then installed w/ library(). We use plyr to | |
| #give easy access to the conditional means of the dataset (it can do a lot more) | |
| #breaking down the data by state. The for loop exists to compute the percentage of | |
| #college graduates. It is a bit messy but it works. | |
| library(plyr) | |
| state.sch<-ddply(adams, .(state,schooling),"nrow") | |
| state.prop<-numeric(0) | |
| for (i in seq(1, 18, by = 2)) { | |
| state.prop[i]<-(state.sch[i+1,3]/(state.sch[i+1,3]+state.sch[i,3])) | |
| } | |
| state.prop<-state.prop[seq(1, 18, by = 2)] | |
| names(state.prop)<-as.character(state.sch[seq(1, 18, by = 2),1]) | |
| barplot(state.prop,main="Proportion of College Graduates in the\nLabor Force") | |
| state.eph<-ddply(adams, .(state), summarize, mean.eph= mean(eph)) | |
| with(state.eph,barplot(mean.eph, names.arg=state,main="Mean Earnings by State",ylab="Earnings per Hour")) | |
| #Here we re-use state.prop to serve as part of the first state IV. It is | |
| #converted to a data frame in order to allow us to merge it easily with | |
| #the data for the IV. | |
| state.prop.iv<-as.data.frame(as.matrix(state.prop)) | |
| colnames(state.prop.iv)<-"prop.college" | |
| state.prop.iv$state<-rownames(state.prop.iv) | |
| rownames(state.prop.iv)<-NULL | |
| #The sources for the IVs are in somewhat messy tables build for web display. | |
| #Instead of rehashing the steps to get them into a simple tabular form I just | |
| #recreated the eventual data frame. Sources are identified in the homework | |
| state.iv<-structure(list(state = c("CT", "MA", "ME", "NH", "NJ", "NY", | |
| "PA", "RI", "VT"), expenditure = c(946042.616752, 1413092.7716, | |
| 348656.5172, 317483.926258, 2480931.9152, 5743924.8648, 3869841.293705, | |
| 304841.3876, 280731.1066), grants = c(20550.3636363636, 53072.0909090909, | |
| 4726.18181818182, 953.727272727273, 115471.454545455, 522320.909090909, | |
| 178854.636363636, 7778.27272727273, 10872.4545454545), require.income = c(24.9, | |
| 20.6, 21.1, 25.6, 18.6, 27.1, 28.6, 31.3, 34.1)), .Names = c("state", | |
| "expenditure", "grants", "require.income"), row.names = c(NA, | |
| -9L), class = "data.frame") | |
| state.test<-merge(state.iv,state.prop.iv,by="state") | |
| #Normalize each of the IVs. This isn't necessary but it makes coefficients | |
| #on the independent variables a little more sensible. | |
| state.test$expenditure<-state.test$expenditure/mean(state.test$expenditure) | |
| state.test$grants<-state.test$grants/mean(state.test$grants) | |
| state.test$require.income<-state.test$require.income/100 | |
| #code to compute the truncated normal example from part I is at | |
| #https://gist.github.com/850512#file_bootstrap truncated.r |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment