Skip to content

Instantly share code, notes, and snippets.

@Protonk
Created March 7, 2011 22:47
Show Gist options
  • Select an option

  • Save Protonk/859429 to your computer and use it in GitHub Desktop.

Select an option

Save Protonk/859429 to your computer and use it in GitHub Desktop.
Age-earnings profile
#loads the packae which allows us to import state files
library(foreign)
#The file can also be a url to the data on your website, obviously if you load it
#on your own PC the path will be different. :)
adams<-read.dta(file="~/Downloads/assignment1.dta")
#Removes columns which aren't used and moves the eph column
adams<-adams[,c(7,2:6,8,9,14,15)]
#This next section converts the binary factors in the state file to factors as
#R is used to seeing them, one column with a specific level for each (hschool, college)
race<-c("white","black","othrace")
schooling<-c("hschool","college")
sex<-c("male","female")
#We pull the columns for race, sex and schooling out of the data frame
racelvl<-as.matrix(subset(adams,select=which(names(adams) %in% race)))
schoolinglvl<-as.matrix(subset(adams,select=which(names(adams) %in% schooling)))
sexlvl<-as.matrix(subset(adams,select=which(names(adams) %in% sex)))
#Then we create a factor for each by premultiplying the n by k matrix by the k levels
#of the factor, giving us an n by 1 matrix of levels and add them to the dataframe
adams$race<-factor(racelvl%*%(1:ncol(racelvl)), labels = colnames(racelvl))
adams$schooling<-factor(schoolinglvl%*%(1:ncol(schoolinglvl)), labels = colnames(schoolinglvl))
adams$sex<-factor(sexlvl%*%(1:ncol(sexlvl)), labels = colnames(sexlvl))
#This part is just housekeeping. We remove the old columns and remove the matrices
#we created from memory.
oldlvls<-c(race,sex,schooling)
adams <- adams[, !(names(adams) %in% oldlvls)]
rm(racelvl,schoolinglvl,sexlvl)
#More houskeeping, though this will materially change the estimates.
#We set the values of eph which are 0 to NA and remove them from the data
#The last line simply characterizes the state as a categorical variable
adams[adams==0]<-NA
adams<-subset(adams,!is.na(adams$eph))
adams$state<-factor(adams$state)
#This is the t-test, the uncorrected regression and the regression with controls, respectively.
schooling.t<-t.test(eph ~ schooling, data=adams, alternative="two.sided", var.equal=TRUE, conf.level=.95)
simple.lm<-lm(log(eph) ~ schooling, data=adams)
more.lm<-lm(log(eph) ~ schooling + age + I(age^2) + sex + race, data=adams);
#This is the density plot. The subsetting of the data below is a bit kludgy but
#it is a bit easier to do it this way with density()
forcol<-subset(adams,schooling=="college")
forhs<-subset(adams,schooling=="hschool")
plot(density(forhs$eph,na.rm=TRUE),type="n",main="College and High School Earnings",xlab="Earnings per Hour")
lines(density(forhs$eph,na.rm=TRUE),col=4)
lines(density(forcol$eph,na.rm=TRUE),col=3)
text(20,0.06,labels="High School",col=4,cex=0.8)
text(40,0.02,labels="College",col=3,cex=0.8);
#Age earnings profile
plot(log(eph)~jitter(age,factor=3),data=adams,type="p",col=rgb(red=0.5, green=0.5, blue=0.5, alpha=0.15),pch=19,main="A Hint at the Age-Earnings Profile",xlab="Age",ylab=expression(log(Earnings)))
#The below code generated our two bar plots for college graduation and earnings.
#The library plyr is not part of base R and must be downloaded via
# install.packages("plyr"), then installed w/ library(). We use plyr to
#give easy access to the conditional means of the dataset (it can do a lot more)
#breaking down the data by state. The for loop exists to compute the percentage of
#college graduates. It is a bit messy but it works.
library(plyr)
state.sch<-ddply(adams, .(state,schooling),"nrow")
state.prop<-numeric(0)
for (i in seq(1, 18, by = 2)) {
state.prop[i]<-(state.sch[i+1,3]/(state.sch[i+1,3]+state.sch[i,3]))
}
state.prop<-state.prop[seq(1, 18, by = 2)]
names(state.prop)<-as.character(state.sch[seq(1, 18, by = 2),1])
barplot(state.prop,main="Proportion of College Graduates in the\nLabor Force")
state.eph<-ddply(adams, .(state), summarize, mean.eph= mean(eph))
with(state.eph,barplot(mean.eph, names.arg=state,main="Mean Earnings by State",ylab="Earnings per Hour"))
#Here we re-use state.prop to serve as part of the first state IV. It is
#converted to a data frame in order to allow us to merge it easily with
#the data for the IV.
state.prop.iv<-as.data.frame(as.matrix(state.prop))
colnames(state.prop.iv)<-"prop.college"
state.prop.iv$state<-rownames(state.prop.iv)
rownames(state.prop.iv)<-NULL
#The sources for the IVs are in somewhat messy tables build for web display.
#Instead of rehashing the steps to get them into a simple tabular form I just
#recreated the eventual data frame. Sources are identified in the homework
state.iv<-structure(list(state = c("CT", "MA", "ME", "NH", "NJ", "NY",
"PA", "RI", "VT"), expenditure = c(946042.616752, 1413092.7716,
348656.5172, 317483.926258, 2480931.9152, 5743924.8648, 3869841.293705,
304841.3876, 280731.1066), grants = c(20550.3636363636, 53072.0909090909,
4726.18181818182, 953.727272727273, 115471.454545455, 522320.909090909,
178854.636363636, 7778.27272727273, 10872.4545454545), require.income = c(24.9,
20.6, 21.1, 25.6, 18.6, 27.1, 28.6, 31.3, 34.1)), .Names = c("state",
"expenditure", "grants", "require.income"), row.names = c(NA,
-9L), class = "data.frame")
state.test<-merge(state.iv,state.prop.iv,by="state")
#Normalize each of the IVs. This isn't necessary but it makes coefficients
#on the independent variables a little more sensible.
state.test$expenditure<-state.test$expenditure/mean(state.test$expenditure)
state.test$grants<-state.test$grants/mean(state.test$grants)
state.test$require.income<-state.test$require.income/100
#code to compute the truncated normal example from part I is at
#https://gist.github.com/850512#file_bootstrap truncated.r
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment