Last active
August 29, 2015 14:05
-
-
Save technickle/2fc673c3a3222a1d93ae to your computer and use it in GitHub Desktop.
Import and process Federal OPE Campus Security Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This R script grabs 5 years of OPE security data for a specified state | |
# | |
# How to use: | |
# 1) Go to http://www.ope.ed.gov/security/GetDownloadFile.aspx | |
# 2) download SPSS version of 3 most recent sets (past 5 years of data) | |
# 3) from each zip file, copy oncampuscrime*.sav, noncampuscrime*.sav, | |
# Residencehallcrime*.sav, and Publicpropertycrime*.sav | |
# out to a folder of your choosing. It should have 12 files when done | |
# 4) use the one of the most recent year's .sav files to geocode addresses; | |
# geocoding should be in a CSV with at least UNITID_P, x, y, and county; | |
# place this file in the same folder with the .sav files | |
# 5) modify this script to accommodate new year values (beyond the config) | |
# 6) run this script: rscript state-ope-security-data.r | |
# | |
# this script produces almost 200 columns of data; | |
# 14 general columns + (5 years x 4 location types x 9 crime types) | |
# | |
# TODO: Sort output data by institution name and branch | |
# | |
# FUTURE: fix merges so they don't require "x = crimedata" declaration | |
# FUTURE: fix SPSS import so it drops text field whitespace and remove workaround | |
# FUTURE: hyphenate the zipcode column (source data has 9-digit numbers for ZIP+4s) | |
# FUTURE: have the script grab the data directly from the website | |
# FUTURE: make the year selections and merges configuration-driven | |
# FUTURE: make the entire script driven by configuration parameters | |
# ############################################################################## | |
# Configuration | |
# set this to the folder where you placed your OPE .sav files & geocoded .csv | |
# you must have write access to this folder; the output file gets put here too | |
setwd("~/Campus-Crime/") | |
# set this to the 2-letter abbreviation for the state you want to compile | |
statefilter = "NY" | |
# set this to the name of the file you want to output | |
crimedatafilename = "crimedata.csv" | |
# ############################################################################## | |
require(foreign) | |
#### load most last 5 years of data from files | |
# most recent 3 years are in the latest file | |
oncampusprimary = read.spss("oncampuscrime101112.sav", to.data.frame=TRUE) | |
offcampusprimary = read.spss("noncampuscrime101112.sav", to.data.frame=TRUE) | |
residencehallprimary = read.spss("Residencehallcrime101112.sav", to.data.frame=TRUE) | |
publicpropertyprimary = read.spss("Publicpropertycrime101112.sav", to.data.frame=TRUE) | |
# 2 oldest years are in the second-latest and third-latest set of files, respectively | |
oncampussecondary = read.spss("oncampuscrime091011.sav", to.data.frame=TRUE) | |
offcampussecondary = read.spss("noncampuscrime091011.sav", to.data.frame=TRUE) | |
residencehallsecondary = read.spss("Residencehallcrime091011.sav", to.data.frame=TRUE) | |
publicpropertysecondary = read.spss("Publicpropertycrime091011.sav", to.data.frame=TRUE) | |
oncampustertiary = read.spss("oncampuscrime080910.sav", to.data.frame=TRUE) | |
offcampustertiary = read.spss("noncampuscrime080910.sav", to.data.frame=TRUE) | |
residencehalltertiary = read.spss("Residencehallcrime080910.sav", to.data.frame=TRUE) | |
publicpropertytertiary = read.spss("Publicpropertycrime080910.sav", to.data.frame=TRUE) | |
#### load campus locations | |
campuslocations = read.csv("campuslocations.csv") | |
#### filter all years for just NYS data | |
oncampusprimary_state = subset(oncampusprimary, State == statefilter) | |
offcampusprimary_state = subset(offcampusprimary, State == statefilter) | |
residencehallprimary_state = subset(residencehallprimary, State == statefilter) | |
publicpropertyprimary_state = subset(publicpropertyprimary, State == statefilter) | |
oncampussecondary_state = subset(oncampussecondary, State == statefilter) | |
offcampussecondary_state = subset(offcampussecondary, State == statefilter) | |
residencehallsecondary_state = subset(residencehallsecondary, State == statefilter) | |
publicpropertysecondary_state = subset(publicpropertysecondary, State == statefilter) | |
oncampustertiary_state = subset(oncampustertiary, State == statefilter) | |
offcampustertiary_state = subset(offcampustertiary, State == statefilter) | |
residencehalltertiary_state = subset(residencehalltertiary, State == statefilter) | |
publicpropertytertiary_state = subset(publicpropertytertiary, State == statefilter) | |
#### build new data | |
# create a new dataframe with the common fields from all records | |
# we assume that the most recent year is the master list of institutions we want. Any institutions/campuses in previous years that are not in current year are ignored | |
crimedata = subset(oncampusprimary_state, select=c("UNITID_P", "INSTNM", "BRANCH", "Address", "City")) | |
# rename column headers in this new dataframe | |
colnames(crimedata)[1:5] = c("UNITID_P", "institution_name", "branch", "address", "city") | |
print("initial campus columns...") | |
# merge county column from campuslocations file | |
crimedata = merge(x = crimedata, subset(campuslocations, select=c("UNITID_P","County")), by="UNITID_P", all.x=TRUE) | |
colnames(crimedata)[6] = "county" | |
print("merged county column...") | |
# merge rest of basic campus data | |
crimedata = merge(x = crimedata, subset(oncampusprimary_state, select=c("UNITID_P", "State", "ZIP", "sector_cd", "Sector_desc", "men_total", "women_total", "Total")), by="UNITID_P",all.x=TRUE) | |
colnames(crimedata)[7:13] = c("state", "zip", "sector_cd", "sector_desc", "men_total", "women_total", "total") | |
print("additional campus columns...") | |
# trim specific text columns (SPSS import includes whitespace) | |
trim = function (x) gsub("^\\s+|\\s+$", "", x) | |
crimedata$institution_name = trim(crimedata$institution_name) | |
crimedata$branch = trim(crimedata$branch) | |
crimedata$address = trim(crimedata$address) | |
crimedata$city = trim(crimedata$city) | |
crimedata$sector_desc = trim(crimedata$sector_desc) | |
# strip down zip codes to 5 digits (removes trailing spaces and +4 codes) | |
crimedata$zip = substr(crimedata$zip,1,5) | |
# begin merging (appending) crime data columns from each year (36 columns per year!) | |
# current year | |
# merge the data for the first year by each of the location types | |
crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "NEG_M12", "MURD12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2012 | |
# rename the column headers for the newly merged year/location - column order MUST match the above subset | |
# there is probably a more clever way to do this using substitution/regex | |
colnames(crimedata)[14:22] = c("2012_oncampus_aggravated_assault","2012_oncampus_arson","2012_oncampus_burglary", "2012_oncampus_forciblesexualoffense", "2012_oncampus_negligenthomicide", "2012_oncampus_murder", "2012_oncampus_nonforciblesexualoffense", "2012_oncampus_robbery", "2012_oncampus_vehicletheft") | |
# rinse and repeat for each year and location type | |
crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2012 | |
colnames(crimedata)[23:31] = c("2012_offcampus_aggravated_assault","2012_offcampus_arson","2012_offcampus_burglary", "2012_offcampus_forciblesexualoffense", "2012_offcampus_negligenthomicide", "2012_offcampus_murder", "2012_offcampus_nonforciblesexualoffense", "2012_offcampus_robbery", "2012_offcampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2012 | |
colnames(crimedata)[32:40] = c("2012_residencehall_aggravated_assault","2012_residencehall_arson","2012_residencehall_burglary", "2012_residencehall_forciblesexualoffense", "2012_residencehall_negligenthomicide", "2012_residencehall_murder", "2012_residencehall_nonforciblesexualoffense", "2012_residencehall_robbery", "2012_residencehall_vehicletheft") | |
crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #public-property 2012 | |
colnames(crimedata)[41:49] = c("2012_publicproperty_aggravated_assault","2012_publicproperty_arson","2012_publicproperty_burglary", "2012_publicproperty_forciblesexualoffense", "2012_publicproperty_negligenthomicide", "2012_publicproperty_murder", "2012_publicproperty_nonforciblesexualoffense", "2012_publicproperty_robbery", "2012_publicproperty_vehicletheft") | |
# previous year | |
crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "NEG_M11", "MURD11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2011 | |
colnames(crimedata)[50:58] = c("2011_oncampus_aggravated_assault","2011_oncampus_arson","2011_oncampus_burglary", "2011_oncampus_forciblesexualoffense", "2011_oncampus_negligenthomicide", "2011_oncampus_murder", "2011_oncampus_nonforciblesexualoffense", "2011_oncampus_robbery", "2011_oncampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2011 | |
colnames(crimedata)[59:67] = c("2011_offcampus_aggravated_assault","2011_offcampus_arson","2011_offcampus_burglary", "2011_offcampus_forciblesexualoffense", "2011_offcampus_negligenthomicide", "2011_offcampus_murder", "2011_offcampus_nonforciblesexualoffense", "2011_offcampus_robbery", "2011_offcampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2011 | |
colnames(crimedata)[68:76] = c("2011_residencehall_aggravated_assault","2011_residencehall_arson","2011_residencehall_burglary", "2011_residencehall_forciblesexualoffense", "2011_residencehall_negligenthomicide", "2011_residencehall_murder", "2011_residencehall_nonforciblesexualoffense", "2011_residencehall_robbery", "2011_residencehall_vehicletheft") | |
crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #public-property 2011 | |
colnames(crimedata)[77:85] = c("2011_publicproperty_aggravated_assault","2011_publicproperty_arson","2011_publicproperty_burglary", "2011_publicproperty_forciblesexualoffense", "2011_publicproperty_negligenthomicide", "2011_publicproperty_murder", "2011_publicproperty_nonforciblesexualoffense", "2011_publicproperty_robbery", "2011_publicproperty_vehicletheft") | |
# 2nd previous year | |
crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "NEG_M10", "MURD10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2010 | |
colnames(crimedata)[86:94] = c("2010_oncampus_aggravated_assault","2010_oncampus_arson","2010_oncampus_burglary", "2010_oncampus_forciblesexualoffense", "2010_oncampus_negligenthomicide", "2010_oncampus_murder", "2010_oncampus_nonforciblesexualoffense", "2010_oncampus_robbery", "2010_oncampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2010 | |
colnames(crimedata)[95:103] = c("2010_offcampus_aggravated_assault","2010_offcampus_arson","2010_offcampus_burglary", "2010_offcampus_forciblesexualoffense", "2010_offcampus_negligenthomicide", "2010_offcampus_murder", "2010_offcampus_nonforciblesexualoffense", "2010_offcampus_robbery", "2010_offcampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2010 | |
colnames(crimedata)[104:112] = c("2010_residencehall_aggravated_assault","2010_residencehall_arson","2010_residencehall_burglary", "2010_residencehall_forciblesexualoffense", "2010_residencehall_negligenthomicide", "2010_residencehall_murder", "2010_residencehall_nonforciblesexualoffense", "2010_residencehall_robbery", "2010_residencehall_vehicletheft") | |
crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #public-property 2010 | |
colnames(crimedata)[113:121] = c("2010_publicproperty_aggravated_assault","2010_publicproperty_arson","2010_publicproperty_burglary", "2010_publicproperty_forciblesexualoffense", "2010_publicproperty_negligenthomicide", "2010_publicproperty_murder", "2010_publicproperty_nonforciblesexualoffense", "2010_publicproperty_robbery", "2010_publicproperty_vehicletheft") | |
# 3rd previous year | |
crimedata = merge(x = crimedata, subset(oncampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "NEG_M9", "MURD9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2009 | |
colnames(crimedata)[122:130] = c("2009_oncampus_aggravated_assault","2009_oncampus_arson","2009_oncampus_burglary", "2009_oncampus_forciblesexualoffense", "2009_oncampus_negligenthomicide", "2009_oncampus_murder", "2009_oncampus_nonforciblesexualoffense", "2009_oncampus_robbery", "2009_oncampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(offcampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2009 | |
colnames(crimedata)[131:139] = c("2009_offcampus_aggravated_assault","2009_offcampus_arson","2009_offcampus_burglary", "2009_offcampus_forciblesexualoffense", "2009_offcampus_negligenthomicide", "2009_offcampus_murder", "2009_offcampus_nonforciblesexualoffense", "2009_offcampus_robbery", "2009_offcampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(residencehallsecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2009 | |
colnames(crimedata)[140:148] = c("2009_residencehall_aggravated_assault","2009_residencehall_arson","2009_residencehall_burglary", "2009_residencehall_forciblesexualoffense", "2009_residencehall_negligenthomicide", "2009_residencehall_murder", "2009_residencehall_nonforciblesexualoffense", "2009_residencehall_robbery", "2009_residencehall_vehicletheft") | |
crimedata = merge(x = crimedata, subset(publicpropertysecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #public-property 2009 | |
colnames(crimedata)[149:157] = c("2009_publicproperty_aggravated_assault","2009_publicproperty_arson","2009_publicproperty_burglary", "2009_publicproperty_forciblesexualoffense", "2009_publicproperty_negligenthomicide", "2009_publicproperty_murder", "2009_publicproperty_nonforciblesexualoffense", "2009_publicproperty_robbery", "2009_publicproperty_vehicletheft") | |
# 4th previous year | |
crimedata = merge(x = crimedata, subset(oncampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "NEG_M8", "MURD8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2008 | |
colnames(crimedata)[158:166] = c("2008_oncampus_aggravated_assault","2008_oncampus_arson","2008_oncampus_burglary", "2008_oncampus_forciblesexualoffense", "2008_oncampus_negligenthomicide", "2008_oncampus_murder", "2008_oncampus_nonforciblesexualoffense", "2008_oncampus_robbery", "2008_oncampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(offcampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2008 | |
colnames(crimedata)[167:175] = c("2008_offcampus_aggravated_assault","2008_offcampus_arson","2008_offcampus_burglary", "2008_offcampus_forciblesexualoffense", "2008_offcampus_negligenthomicide", "2008_offcampus_murder", "2008_offcampus_nonforciblesexualoffense", "2008_offcampus_robbery", "2008_offcampus_vehicletheft") | |
crimedata = merge(x = crimedata, subset(residencehalltertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2008 | |
colnames(crimedata)[176:184] = c("2008_residencehall_aggravated_assault","2008_residencehall_arson","2008_residencehall_burglary", "2008_residencehall_forciblesexualoffense", "2008_residencehall_negligenthomicide", "2008_residencehall_murder", "2008_residencehall_nonforciblesexualoffense", "2008_residencehall_robbery", "2008_residencehall_vehicletheft") | |
crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #public-property 2008 | |
colnames(crimedata)[185:193] = c("2008_publicproperty_aggravated_assault","2008_publicproperty_arson","2008_publicproperty_burglary", "2008_publicproperty_forciblesexualoffense", "2008_publicproperty_negligenthomicide", "2008_publicproperty_murder", "2008_publicproperty_nonforciblesexualoffense", "2008_publicproperty_robbery", "2008_publicproperty_vehicletheft") | |
# locations | |
crimedata = merge(x = crimedata, subset(campuslocations,select=c("UNITID_P","x","y")), by="UNITID_P", all.x=TRUE) | |
colnames(crimedata)[194:195] = c("longitude","latitude") | |
#### save new data to disk | |
# don't include row headers, and write blanks instead of "NA" for missing data values | |
write.csv(crimedata, crimedatafilename, row.names=FALSE, na="") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment