Created
January 12, 2011 22:55
-
-
Save Btibert3/777069 to your computer and use it in GitHub Desktop.
Correlate 6-year Grad Rate with Freshmen Yield Rate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ################################################################ | |
| # Author: @BrockTibert | |
| # Date: January 2011 | |
| # | |
| # Purpose: Study the relationship of freshmen yield rate and grad rate | |
| # Comes from Chronicle blog post on research done about college selection | |
| # and more likely to be selected if have higher grad rate. Let's use | |
| # actual data to see if this is true. | |
| # | |
| # http://chronicle.com/blogs/headcount/how-graduation-rates-shape-college-choice/27770 | |
| # | |
| # Windows 7 Pro, R 2.10 | |
| # | |
| ################################################################ | |
| # initial setup | |
| DIR <- "C:/Users/Brock/Documents/My Dropbox/Eclipse/Projects/R/IPEDS/Grad and Yield Rate/" | |
| setwd(DIR) | |
| # Base URL for the IPEDS data raw datasets | |
| BASE_URL <- 'http://nces.ed.gov/ipeds/datacenter/data/' | |
| # the files of interest | |
| FILES <- c('GR2009', 'IC2009', 'HD2009') | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # Grab and unzip the datasets | |
| # An example path: http://nces.ed.gov/ipeds/datacenter/data/EF2004D.zip | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # download the files | |
| for(i in FILES) { | |
| # build the URL String | |
| URL <- paste(BASE_URL, i, ".zip", sep="") | |
| # build the destination -- needs to include path AND file | |
| dest <- paste(DIR, i, ".zip", sep="") | |
| # fetch the file | |
| download.file(URL, destfile=dest, mode="wb") | |
| } | |
| # unzip the files | |
| for (i in dir(pattern="\\.zip$")) | |
| unzip(i) | |
| # delete the zip files if you want | |
| for (i in dir(pattern="\\.zip$")) | |
| unlink(i) | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # Read in the datasets, filter, clean as needed | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| #------------------------------------------ | |
| # app and yield data | |
| #------------------------------------------ | |
| # read in the data | |
| app.raw <- read.table("ic2009.csv", header=T, stringsAsFactors=F, sep=",") | |
| dim(app.raw) | |
| # filter columns -- pull some extra fields in case I want to use them | |
| app <- app.raw[c("unitid", "applcn", "admssn", "enrlt", "satactdt", | |
| "satnum", "satpct", "actnum", "actpct", "satvr25", | |
| "satvr75", "satmt25", "satmt75", "actcm25", "actcm75", | |
| "acten25", "acten75", "actmt25", "actmt75", | |
| "tuitvary")] | |
| rm(app.raw) | |
| #------------------------------------------ | |
| # directory info so we can filter on 4-year, etc. | |
| #------------------------------------------ | |
| # read in the raw data | |
| dir.raw <- read.table("hd2009.csv", header=T, sep=",", stringsAsFactors=F) | |
| dim(dir.raw) | |
| # filter the rows/columns I want | |
| dir.raw <- dir.raw[dir.raw$sector %in% c(1,2), ] | |
| dir.raw <- dir.raw[dir.raw$obereg %in% c(1,2), ] | |
| dir.raw <- dir.raw[dir.raw$deggrant==1, ] | |
| dir.raw <- dir.raw[dir.raw$pset4flg==1, ] | |
| inst <- dir.raw[c("unitid", "instnm", "sector", "addr", "stabbr", "zip", | |
| "carnegie", "locale")] | |
| rm(dir.raw) | |
| #------------------------------------------ | |
| # directory info so we can filter on 6-year rate | |
| # prefer to do 4-year, but 6 is more common | |
| # | |
| # definition: total cohort considered = GRRACE24 when GRTYPE=8 | |
| # grad in 6-years = GRRACE24 when GRTYPE=12 | |
| # | |
| # NOTES: These data are multi-record per school | |
| # need to restructure - will use reshape package | |
| #------------------------------------------ | |
| # read in the data | |
| grad.raw <- read.table("gr2009.csv", header=T, sep=",", stringsAsFactors=F) | |
| dim(grad.raw) | |
| # filter the rows | |
| grad.temp <- grad.raw[grad.raw$GRTYPE %in% c(8,12),] | |
| grad.temp <- grad.temp[c("UNITID", "GRTYPE", "GRTOTLT")] # there isn't a GRRACE24 | |
| # reshape the data | |
| library(reshape) | |
| grad <- cast(grad.temp, UNITID ~ GRTYPE) | |
| colnames(grad) <- c("unitid", "cohort", "grad") | |
| grad$sixyear<- grad$grad/grad$cohort | |
| rm(grad.raw, grad.temp) | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # Merge the datasets | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # merge all app data that matches are filtered school list | |
| ds <- merge(x=inst, y=app, by.x="unitid", by.y="unitid", all.x=T) | |
| # merge on the grad rate data | |
| ds <- merge(x=ds, y=grad, by.x="unitid", by.y="unitid", all.x=T) | |
| # calculate a variable for yield rate | |
| ds$yield <- ds$enrlt/ds$admssn | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # Finally.... let's play with the data! | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # quick look at the variables of interest | |
| summary(ds[c("sector", "sixyear", "yield")]) | |
| # lets correlate the two, ignoring sector | |
| cor(ds$sixyear, ds$yield, use="complete.obs", method="pearson") | |
| #hmmmm - not what I thought we would see | |
| # just keep the complete observations for sixyear and yield | |
| # could use ?complete.cases or na.omit, but looks across whole data frame | |
| ds.comp <- ds[!is.na(ds$sixyear), ] | |
| ds.comp <- ds.comp[!is.na(ds.comp$yield), ] | |
| ds.comp <- ds.comp[ds.comp$yield <= .85, ] # throw out cases with very high yield, Harvard=80% | |
| # it's easier to see the relationships visually | |
| plot(ds.comp$sixyear, ds.comp$yield, | |
| xlab="6-year Rate", ylab="Yield Rate", | |
| main="Grad Rate and Yield Rate - 2009 Data", | |
| pch=20) | |
| # No real pattern - article says higher grad rates increased chances of selection as | |
| # as grad rates increased. To be fair, they used public schools and pitted | |
| # one school against another, all else equal. Reality is that | |
| # the selection set is much larger | |
| # filter on public | |
| public <- ds.comp[ds.comp$sector==1, ] | |
| plot(public$sixyear, public$yield, | |
| xlab="6-year Rate", ylab="Yield Rate", | |
| main="Grad Rate and Yield Rate - 2009 Data - Public Schools Only", | |
| pch=20) | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| # Clean up R session | |
| #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
| rm(list=ls()) | |
| q() | |
| n |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment