Btibert3 · January 12, 2011 22:55
diff --git a/IPEDS Grad and Yield Rate.R b/IPEDS Grad and Yield Rate.R
 ################################################################
 # Author:  	@BrockTibert
 # Date: 	January 2011
 # 
 # Purpose: 	Study the relationship of freshmen yield rate and grad rate
 # 			Comes from Chronicle blog post on research done about college selection
 #			and more likely to be selected if have higher grad rate.  Let's use
 #			actual data to see if this is true.
 #
 # http://chronicle.com/blogs/headcount/how-graduation-rates-shape-college-choice/27770		
 #
 # Windows 7 Pro, R 2.10
 #
 ################################################################

 # initial setup
 DIR <- "C:/Users/Brock/Documents/My Dropbox/Eclipse/Projects/R/IPEDS/Grad and Yield Rate/"
 setwd(DIR)

 # Base URL for the IPEDS data raw datasets
 BASE_URL <- 'http://nces.ed.gov/ipeds/datacenter/data/'

 # the files of interest
 FILES <- c('GR2009', 'IC2009', 'HD2009')


 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Grab and unzip the datasets
 # An example path: http://nces.ed.gov/ipeds/datacenter/data/EF2004D.zip
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 # download the files
 for(i in FILES) {
 	
 	# build the URL String
 	URL <- paste(BASE_URL, i, ".zip", sep="")
 	
 	# build the destination -- needs to include path AND file
 	dest <- paste(DIR, i, ".zip", sep="")
 	
 	# fetch the file
 	download.file(URL, destfile=dest, mode="wb")
 	
 }

 # unzip the files
 for (i in dir(pattern="\\.zip$"))
 	unzip(i)

 # delete the zip files if you want
 for (i in dir(pattern="\\.zip$"))
 	unlink(i)



 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Read in the datasets, filter, clean as needed
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 #------------------------------------------
 # app and yield data
 #------------------------------------------

 # read in the data
 app.raw <- read.table("ic2009.csv", header=T, stringsAsFactors=F, sep=",")
 dim(app.raw)

 # filter columns -- pull some extra fields in case I want to use them
 app <- app.raw[c("unitid", "applcn", "admssn", "enrlt", "satactdt",
 				"satnum", "satpct", "actnum", "actpct", "satvr25",
 				"satvr75", "satmt25", "satmt75", "actcm25", "actcm75",
 				"acten25", "acten75", "actmt25", "actmt75",
 				"tuitvary")]
 rm(app.raw)



 #------------------------------------------
 # directory info so we can filter on 4-year, etc.
 #------------------------------------------

 # read in the raw data
 dir.raw <- read.table("hd2009.csv", header=T, sep=",", stringsAsFactors=F)
 dim(dir.raw)

 # filter the rows/columns I want
 dir.raw <- dir.raw[dir.raw$sector %in% c(1,2), ]
 dir.raw <- dir.raw[dir.raw$obereg %in% c(1,2), ]
 dir.raw <- dir.raw[dir.raw$deggrant==1, ]
 dir.raw <- dir.raw[dir.raw$pset4flg==1, ]
 inst <- dir.raw[c("unitid", "instnm", "sector", "addr", "stabbr", "zip",
 				"carnegie", "locale")]
 rm(dir.raw)



 #------------------------------------------
 # directory info so we can filter on 6-year rate
 # prefer to do 4-year, but 6 is more common
 #
 # definition:  	total cohort considered = GRRACE24 when GRTYPE=8
 #				grad in 6-years = GRRACE24 when GRTYPE=12
 #
 # NOTES:		These data are multi-record per school
 #				need to restructure - will use reshape package
 #------------------------------------------

 # read in the data
 grad.raw <- read.table("gr2009.csv", header=T, sep=",", stringsAsFactors=F)
 dim(grad.raw)

 # filter the rows
 grad.temp <- grad.raw[grad.raw$GRTYPE %in% c(8,12),]
 grad.temp <- grad.temp[c("UNITID", "GRTYPE", "GRTOTLT")]  # there isn't a GRRACE24

 # reshape the data
 library(reshape)
 grad <- cast(grad.temp, UNITID ~ GRTYPE)
 colnames(grad) <- c("unitid", "cohort", "grad")
 grad$sixyear<- grad$grad/grad$cohort

 rm(grad.raw, grad.temp)




 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Merge the datasets
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 # merge all app data that matches are filtered school list
 ds <- merge(x=inst, y=app, by.x="unitid", by.y="unitid", all.x=T)

 # merge on the grad rate data
 ds <- merge(x=ds, y=grad, by.x="unitid", by.y="unitid", all.x=T)

 # calculate a variable for yield rate
 ds$yield <- ds$enrlt/ds$admssn


 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Finally.... let's play with the data!
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 # quick look at the variables of interest
 summary(ds[c("sector", "sixyear", "yield")])

 # lets correlate the two, ignoring sector
 cor(ds$sixyear, ds$yield, use="complete.obs", method="pearson")
 #hmmmm - not what I thought we would see

 # just keep the complete observations for sixyear and yield
 # could use ?complete.cases or na.omit, but looks across whole data frame
 ds.comp <- ds[!is.na(ds$sixyear), ]
 ds.comp <- ds.comp[!is.na(ds.comp$yield), ]
 ds.comp <- ds.comp[ds.comp$yield <= .85, ]  # throw out cases with very high yield, Harvard=80%


 # it's easier to see the relationships visually
 plot(ds.comp$sixyear, ds.comp$yield,
 		xlab="6-year Rate", ylab="Yield Rate",
 		main="Grad Rate and Yield Rate - 2009 Data",
 		pch=20)

 # No real pattern - article says higher grad rates increased chances of selection as
 #					as grad rates increased. To be fair, they used public schools and pitted
 #					one school against another, all else equal.  Reality is that 
 #					the selection set is much larger

 # filter on public
 public <- ds.comp[ds.comp$sector==1, ]
 plot(public$sixyear, public$yield,
 		xlab="6-year Rate", ylab="Yield Rate",
 		main="Grad Rate and Yield Rate - 2009 Data - Public Schools Only",
 		pch=20)


 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Clean up R session
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 rm(list=ls())
 q()
 n
	################################################################
	# Author: @BrockTibert
	# Date: January 2011
	#
	# Purpose: Study the relationship of freshmen yield rate and grad rate
	# Comes from Chronicle blog post on research done about college selection
	# and more likely to be selected if have higher grad rate. Let's use
	# actual data to see if this is true.
	#
	# http://chronicle.com/blogs/headcount/how-graduation-rates-shape-college-choice/27770
	#
	# Windows 7 Pro, R 2.10
	#
	################################################################

	# initial setup
	DIR <- "C:/Users/Brock/Documents/My Dropbox/Eclipse/Projects/R/IPEDS/Grad and Yield Rate/"
	setwd(DIR)

	# Base URL for the IPEDS data raw datasets
	BASE_URL <- 'http://nces.ed.gov/ipeds/datacenter/data/'

	# the files of interest
	FILES <- c('GR2009', 'IC2009', 'HD2009')


	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Grab and unzip the datasets
	# An example path: http://nces.ed.gov/ipeds/datacenter/data/EF2004D.zip
	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	# download the files
	for(i in FILES) {

	# build the URL String
	URL <- paste(BASE_URL, i, ".zip", sep="")

	# build the destination -- needs to include path AND file
	dest <- paste(DIR, i, ".zip", sep="")

	# fetch the file
	download.file(URL, destfile=dest, mode="wb")

	}

	# unzip the files
	for (i in dir(pattern="\\.zip$"))
	unzip(i)

	# delete the zip files if you want
	for (i in dir(pattern="\\.zip$"))
	unlink(i)



	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Read in the datasets, filter, clean as needed
	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	#------------------------------------------
	# app and yield data
	#------------------------------------------

	# read in the data
	app.raw <- read.table("ic2009.csv", header=T, stringsAsFactors=F, sep=",")
	dim(app.raw)

	# filter columns -- pull some extra fields in case I want to use them
	app <- app.raw[c("unitid", "applcn", "admssn", "enrlt", "satactdt",
	"satnum", "satpct", "actnum", "actpct", "satvr25",
	"satvr75", "satmt25", "satmt75", "actcm25", "actcm75",
	"acten25", "acten75", "actmt25", "actmt75",
	"tuitvary")]
	rm(app.raw)



	#------------------------------------------
	# directory info so we can filter on 4-year, etc.
	#------------------------------------------

	# read in the raw data
	dir.raw <- read.table("hd2009.csv", header=T, sep=",", stringsAsFactors=F)
	dim(dir.raw)

	# filter the rows/columns I want
	dir.raw <- dir.raw[dir.raw$sector %in% c(1,2), ]
	dir.raw <- dir.raw[dir.raw$obereg %in% c(1,2), ]
	dir.raw <- dir.raw[dir.raw$deggrant==1, ]
	dir.raw <- dir.raw[dir.raw$pset4flg==1, ]
	inst <- dir.raw[c("unitid", "instnm", "sector", "addr", "stabbr", "zip",
	"carnegie", "locale")]
	rm(dir.raw)



	#------------------------------------------
	# directory info so we can filter on 6-year rate
	# prefer to do 4-year, but 6 is more common
	#
	# definition: total cohort considered = GRRACE24 when GRTYPE=8
	# grad in 6-years = GRRACE24 when GRTYPE=12
	#
	# NOTES: These data are multi-record per school
	# need to restructure - will use reshape package
	#------------------------------------------

	# read in the data
	grad.raw <- read.table("gr2009.csv", header=T, sep=",", stringsAsFactors=F)
	dim(grad.raw)

	# filter the rows
	grad.temp <- grad.raw[grad.raw$GRTYPE %in% c(8,12),]
	grad.temp <- grad.temp[c("UNITID", "GRTYPE", "GRTOTLT")] # there isn't a GRRACE24

	# reshape the data
	library(reshape)
	grad <- cast(grad.temp, UNITID ~ GRTYPE)
	colnames(grad) <- c("unitid", "cohort", "grad")
	grad$sixyear<- grad$grad/grad$cohort

	rm(grad.raw, grad.temp)




	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Merge the datasets
	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	# merge all app data that matches are filtered school list
	ds <- merge(x=inst, y=app, by.x="unitid", by.y="unitid", all.x=T)

	# merge on the grad rate data
	ds <- merge(x=ds, y=grad, by.x="unitid", by.y="unitid", all.x=T)

	# calculate a variable for yield rate
	ds$yield <- ds$enrlt/ds$admssn


	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Finally.... let's play with the data!
	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	# quick look at the variables of interest
	summary(ds[c("sector", "sixyear", "yield")])

	# lets correlate the two, ignoring sector
	cor(ds$sixyear, ds$yield, use="complete.obs", method="pearson")
	#hmmmm - not what I thought we would see

	# just keep the complete observations for sixyear and yield
	# could use ?complete.cases or na.omit, but looks across whole data frame
	ds.comp <- ds[!is.na(ds$sixyear), ]
	ds.comp <- ds.comp[!is.na(ds.comp$yield), ]
	ds.comp <- ds.comp[ds.comp$yield <= .85, ] # throw out cases with very high yield, Harvard=80%


	# it's easier to see the relationships visually
	plot(ds.comp$sixyear, ds.comp$yield,
	xlab="6-year Rate", ylab="Yield Rate",
	main="Grad Rate and Yield Rate - 2009 Data",
	pch=20)

	# No real pattern - article says higher grad rates increased chances of selection as
	# as grad rates increased. To be fair, they used public schools and pitted
	# one school against another, all else equal. Reality is that
	# the selection set is much larger

	# filter on public
	public <- ds.comp[ds.comp$sector==1, ]
	plot(public$sixyear, public$yield,
	xlab="6-year Rate", ylab="Yield Rate",
	main="Grad Rate and Yield Rate - 2009 Data - Public Schools Only",
	pch=20)


	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Clean up R session
	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	rm(list=ls())
	q()
	n
No results found