Skip to content

Instantly share code, notes, and snippets.

@Btibert3
Created January 12, 2011 22:55
Show Gist options
  • Select an option

  • Save Btibert3/777069 to your computer and use it in GitHub Desktop.

Select an option

Save Btibert3/777069 to your computer and use it in GitHub Desktop.
Correlate 6-year Grad Rate with Freshmen Yield Rate
################################################################
# Author: @BrockTibert
# Date: January 2011
#
# Purpose: Study the relationship of freshmen yield rate and grad rate
# Comes from Chronicle blog post on research done about college selection
# and more likely to be selected if have higher grad rate. Let's use
# actual data to see if this is true.
#
# http://chronicle.com/blogs/headcount/how-graduation-rates-shape-college-choice/27770
#
# Windows 7 Pro, R 2.10
#
################################################################
# initial setup
DIR <- "C:/Users/Brock/Documents/My Dropbox/Eclipse/Projects/R/IPEDS/Grad and Yield Rate/"
setwd(DIR)
# Base URL for the IPEDS data raw datasets
BASE_URL <- 'http://nces.ed.gov/ipeds/datacenter/data/'
# the files of interest
FILES <- c('GR2009', 'IC2009', 'HD2009')
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Grab and unzip the datasets
# An example path: http://nces.ed.gov/ipeds/datacenter/data/EF2004D.zip
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# download the files
for(i in FILES) {
# build the URL String
URL <- paste(BASE_URL, i, ".zip", sep="")
# build the destination -- needs to include path AND file
dest <- paste(DIR, i, ".zip", sep="")
# fetch the file
download.file(URL, destfile=dest, mode="wb")
}
# unzip the files
for (i in dir(pattern="\\.zip$"))
unzip(i)
# delete the zip files if you want
for (i in dir(pattern="\\.zip$"))
unlink(i)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Read in the datasets, filter, clean as needed
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#------------------------------------------
# app and yield data
#------------------------------------------
# read in the data
app.raw <- read.table("ic2009.csv", header=T, stringsAsFactors=F, sep=",")
dim(app.raw)
# filter columns -- pull some extra fields in case I want to use them
app <- app.raw[c("unitid", "applcn", "admssn", "enrlt", "satactdt",
"satnum", "satpct", "actnum", "actpct", "satvr25",
"satvr75", "satmt25", "satmt75", "actcm25", "actcm75",
"acten25", "acten75", "actmt25", "actmt75",
"tuitvary")]
rm(app.raw)
#------------------------------------------
# directory info so we can filter on 4-year, etc.
#------------------------------------------
# read in the raw data
dir.raw <- read.table("hd2009.csv", header=T, sep=",", stringsAsFactors=F)
dim(dir.raw)
# filter the rows/columns I want
dir.raw <- dir.raw[dir.raw$sector %in% c(1,2), ]
dir.raw <- dir.raw[dir.raw$obereg %in% c(1,2), ]
dir.raw <- dir.raw[dir.raw$deggrant==1, ]
dir.raw <- dir.raw[dir.raw$pset4flg==1, ]
inst <- dir.raw[c("unitid", "instnm", "sector", "addr", "stabbr", "zip",
"carnegie", "locale")]
rm(dir.raw)
#------------------------------------------
# directory info so we can filter on 6-year rate
# prefer to do 4-year, but 6 is more common
#
# definition: total cohort considered = GRRACE24 when GRTYPE=8
# grad in 6-years = GRRACE24 when GRTYPE=12
#
# NOTES: These data are multi-record per school
# need to restructure - will use reshape package
#------------------------------------------
# read in the data
grad.raw <- read.table("gr2009.csv", header=T, sep=",", stringsAsFactors=F)
dim(grad.raw)
# filter the rows
grad.temp <- grad.raw[grad.raw$GRTYPE %in% c(8,12),]
grad.temp <- grad.temp[c("UNITID", "GRTYPE", "GRTOTLT")] # there isn't a GRRACE24
# reshape the data
library(reshape)
grad <- cast(grad.temp, UNITID ~ GRTYPE)
colnames(grad) <- c("unitid", "cohort", "grad")
grad$sixyear<- grad$grad/grad$cohort
rm(grad.raw, grad.temp)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Merge the datasets
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# merge all app data that matches are filtered school list
ds <- merge(x=inst, y=app, by.x="unitid", by.y="unitid", all.x=T)
# merge on the grad rate data
ds <- merge(x=ds, y=grad, by.x="unitid", by.y="unitid", all.x=T)
# calculate a variable for yield rate
ds$yield <- ds$enrlt/ds$admssn
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Finally.... let's play with the data!
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# quick look at the variables of interest
summary(ds[c("sector", "sixyear", "yield")])
# lets correlate the two, ignoring sector
cor(ds$sixyear, ds$yield, use="complete.obs", method="pearson")
#hmmmm - not what I thought we would see
# just keep the complete observations for sixyear and yield
# could use ?complete.cases or na.omit, but looks across whole data frame
ds.comp <- ds[!is.na(ds$sixyear), ]
ds.comp <- ds.comp[!is.na(ds.comp$yield), ]
ds.comp <- ds.comp[ds.comp$yield <= .85, ] # throw out cases with very high yield, Harvard=80%
# it's easier to see the relationships visually
plot(ds.comp$sixyear, ds.comp$yield,
xlab="6-year Rate", ylab="Yield Rate",
main="Grad Rate and Yield Rate - 2009 Data",
pch=20)
# No real pattern - article says higher grad rates increased chances of selection as
# as grad rates increased. To be fair, they used public schools and pitted
# one school against another, all else equal. Reality is that
# the selection set is much larger
# filter on public
public <- ds.comp[ds.comp$sector==1, ]
plot(public$sixyear, public$yield,
xlab="6-year Rate", ylab="Yield Rate",
main="Grad Rate and Yield Rate - 2009 Data - Public Schools Only",
pch=20)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Clean up R session
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rm(list=ls())
q()
n
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment