# This R script grabs 5 years of OPE security data for a specified state # # How to use: # 1) Go to http://www.ope.ed.gov/security/GetDownloadFile.aspx # 2) download SPSS version of 3 most recent sets (past 5 years of data) # 3) from each zip file, copy oncampuscrime*.sav, noncampuscrime*.sav, # Residencehallcrime*.sav, and Publicpropertycrime*.sav # out to a folder of your choosing. It should have 12 files when done # 4) use the one of the most recent year's .sav files to geocode addresses; # geocoding should be in a CSV with at least UNITID_P, x, y, and county; # place this file in the same folder with the .sav files # 5) modify this script to accommodate new year values (beyond the config) # 6) run this script: rscript state-ope-security-data.r # # this script produces almost 200 columns of data; # 14 general columns + (5 years x 4 location types x 9 crime types) # # TODO: Sort output data by institution name and branch # # FUTURE: fix merges so they don't require "x = crimedata" declaration # FUTURE: fix SPSS import so it drops text field whitespace and remove workaround # FUTURE: hyphenate the zipcode column (source data has 9-digit numbers for ZIP+4s) # FUTURE: have the script grab the data directly from the website # FUTURE: make the year selections and merges configuration-driven # FUTURE: make the entire script driven by configuration parameters # ############################################################################## # Configuration # set this to the folder where you placed your OPE .sav files & geocoded .csv # you must have write access to this folder; the output file gets put here too setwd("~/Campus-Crime/") # set this to the 2-letter abbreviation for the state you want to compile statefilter = "NY" # set this to the name of the file you want to output crimedatafilename = "crimedata.csv" # ############################################################################## require(foreign) #### load most last 5 years of data from files # most recent 3 years are in the latest file oncampusprimary = read.spss("oncampuscrime101112.sav", to.data.frame=TRUE) offcampusprimary = read.spss("noncampuscrime101112.sav", to.data.frame=TRUE) residencehallprimary = read.spss("Residencehallcrime101112.sav", to.data.frame=TRUE) publicpropertyprimary = read.spss("Publicpropertycrime101112.sav", to.data.frame=TRUE) # 2 oldest years are in the second-latest and third-latest set of files, respectively oncampussecondary = read.spss("oncampuscrime091011.sav", to.data.frame=TRUE) offcampussecondary = read.spss("noncampuscrime091011.sav", to.data.frame=TRUE) residencehallsecondary = read.spss("Residencehallcrime091011.sav", to.data.frame=TRUE) publicpropertysecondary = read.spss("Publicpropertycrime091011.sav", to.data.frame=TRUE) oncampustertiary = read.spss("oncampuscrime080910.sav", to.data.frame=TRUE) offcampustertiary = read.spss("noncampuscrime080910.sav", to.data.frame=TRUE) residencehalltertiary = read.spss("Residencehallcrime080910.sav", to.data.frame=TRUE) publicpropertytertiary = read.spss("Publicpropertycrime080910.sav", to.data.frame=TRUE) #### load campus locations campuslocations = read.csv("campuslocations.csv") #### filter all years for just NYS data oncampusprimary_state = subset(oncampusprimary, State == statefilter) offcampusprimary_state = subset(offcampusprimary, State == statefilter) residencehallprimary_state = subset(residencehallprimary, State == statefilter) publicpropertyprimary_state = subset(publicpropertyprimary, State == statefilter) oncampussecondary_state = subset(oncampussecondary, State == statefilter) offcampussecondary_state = subset(offcampussecondary, State == statefilter) residencehallsecondary_state = subset(residencehallsecondary, State == statefilter) publicpropertysecondary_state = subset(publicpropertysecondary, State == statefilter) oncampustertiary_state = subset(oncampustertiary, State == statefilter) offcampustertiary_state = subset(offcampustertiary, State == statefilter) residencehalltertiary_state = subset(residencehalltertiary, State == statefilter) publicpropertytertiary_state = subset(publicpropertytertiary, State == statefilter) #### build new data # create a new dataframe with the common fields from all records # we assume that the most recent year is the master list of institutions we want. Any institutions/campuses in previous years that are not in current year are ignored crimedata = subset(oncampusprimary_state, select=c("UNITID_P", "INSTNM", "BRANCH", "Address", "City")) # rename column headers in this new dataframe colnames(crimedata)[1:5] = c("UNITID_P", "institution_name", "branch", "address", "city") print("initial campus columns...") # merge county column from campuslocations file crimedata = merge(x = crimedata, subset(campuslocations, select=c("UNITID_P","County")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[6] = "county" print("merged county column...") # merge rest of basic campus data crimedata = merge(x = crimedata, subset(oncampusprimary_state, select=c("UNITID_P", "State", "ZIP", "sector_cd", "Sector_desc", "men_total", "women_total", "Total")), by="UNITID_P",all.x=TRUE) colnames(crimedata)[7:13] = c("state", "zip", "sector_cd", "sector_desc", "men_total", "women_total", "total") print("additional campus columns...") # trim specific text columns (SPSS import includes whitespace) trim = function (x) gsub("^\\s+|\\s+$", "", x) crimedata$institution_name = trim(crimedata$institution_name) crimedata$branch = trim(crimedata$branch) crimedata$address = trim(crimedata$address) crimedata$city = trim(crimedata$city) crimedata$sector_desc = trim(crimedata$sector_desc) # strip down zip codes to 5 digits (removes trailing spaces and +4 codes) crimedata$zip = substr(crimedata$zip,1,5) # begin merging (appending) crime data columns from each year (36 columns per year!) # current year # merge the data for the first year by each of the location types crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "NEG_M12", "MURD12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2012 # rename the column headers for the newly merged year/location - column order MUST match the above subset # there is probably a more clever way to do this using substitution/regex colnames(crimedata)[14:22] = c("2012_oncampus_aggravated_assault","2012_oncampus_arson","2012_oncampus_burglary", "2012_oncampus_forciblesexualoffense", "2012_oncampus_negligenthomicide", "2012_oncampus_murder", "2012_oncampus_nonforciblesexualoffense", "2012_oncampus_robbery", "2012_oncampus_vehicletheft") # rinse and repeat for each year and location type crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2012 colnames(crimedata)[23:31] = c("2012_offcampus_aggravated_assault","2012_offcampus_arson","2012_offcampus_burglary", "2012_offcampus_forciblesexualoffense", "2012_offcampus_negligenthomicide", "2012_offcampus_murder", "2012_offcampus_nonforciblesexualoffense", "2012_offcampus_robbery", "2012_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2012 colnames(crimedata)[32:40] = c("2012_residencehall_aggravated_assault","2012_residencehall_arson","2012_residencehall_burglary", "2012_residencehall_forciblesexualoffense", "2012_residencehall_negligenthomicide", "2012_residencehall_murder", "2012_residencehall_nonforciblesexualoffense", "2012_residencehall_robbery", "2012_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #public-property 2012 colnames(crimedata)[41:49] = c("2012_publicproperty_aggravated_assault","2012_publicproperty_arson","2012_publicproperty_burglary", "2012_publicproperty_forciblesexualoffense", "2012_publicproperty_negligenthomicide", "2012_publicproperty_murder", "2012_publicproperty_nonforciblesexualoffense", "2012_publicproperty_robbery", "2012_publicproperty_vehicletheft") # previous year crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "NEG_M11", "MURD11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2011 colnames(crimedata)[50:58] = c("2011_oncampus_aggravated_assault","2011_oncampus_arson","2011_oncampus_burglary", "2011_oncampus_forciblesexualoffense", "2011_oncampus_negligenthomicide", "2011_oncampus_murder", "2011_oncampus_nonforciblesexualoffense", "2011_oncampus_robbery", "2011_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2011 colnames(crimedata)[59:67] = c("2011_offcampus_aggravated_assault","2011_offcampus_arson","2011_offcampus_burglary", "2011_offcampus_forciblesexualoffense", "2011_offcampus_negligenthomicide", "2011_offcampus_murder", "2011_offcampus_nonforciblesexualoffense", "2011_offcampus_robbery", "2011_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2011 colnames(crimedata)[68:76] = c("2011_residencehall_aggravated_assault","2011_residencehall_arson","2011_residencehall_burglary", "2011_residencehall_forciblesexualoffense", "2011_residencehall_negligenthomicide", "2011_residencehall_murder", "2011_residencehall_nonforciblesexualoffense", "2011_residencehall_robbery", "2011_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #public-property 2011 colnames(crimedata)[77:85] = c("2011_publicproperty_aggravated_assault","2011_publicproperty_arson","2011_publicproperty_burglary", "2011_publicproperty_forciblesexualoffense", "2011_publicproperty_negligenthomicide", "2011_publicproperty_murder", "2011_publicproperty_nonforciblesexualoffense", "2011_publicproperty_robbery", "2011_publicproperty_vehicletheft") # 2nd previous year crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "NEG_M10", "MURD10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2010 colnames(crimedata)[86:94] = c("2010_oncampus_aggravated_assault","2010_oncampus_arson","2010_oncampus_burglary", "2010_oncampus_forciblesexualoffense", "2010_oncampus_negligenthomicide", "2010_oncampus_murder", "2010_oncampus_nonforciblesexualoffense", "2010_oncampus_robbery", "2010_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2010 colnames(crimedata)[95:103] = c("2010_offcampus_aggravated_assault","2010_offcampus_arson","2010_offcampus_burglary", "2010_offcampus_forciblesexualoffense", "2010_offcampus_negligenthomicide", "2010_offcampus_murder", "2010_offcampus_nonforciblesexualoffense", "2010_offcampus_robbery", "2010_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2010 colnames(crimedata)[104:112] = c("2010_residencehall_aggravated_assault","2010_residencehall_arson","2010_residencehall_burglary", "2010_residencehall_forciblesexualoffense", "2010_residencehall_negligenthomicide", "2010_residencehall_murder", "2010_residencehall_nonforciblesexualoffense", "2010_residencehall_robbery", "2010_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #public-property 2010 colnames(crimedata)[113:121] = c("2010_publicproperty_aggravated_assault","2010_publicproperty_arson","2010_publicproperty_burglary", "2010_publicproperty_forciblesexualoffense", "2010_publicproperty_negligenthomicide", "2010_publicproperty_murder", "2010_publicproperty_nonforciblesexualoffense", "2010_publicproperty_robbery", "2010_publicproperty_vehicletheft") # 3rd previous year crimedata = merge(x = crimedata, subset(oncampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "NEG_M9", "MURD9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2009 colnames(crimedata)[122:130] = c("2009_oncampus_aggravated_assault","2009_oncampus_arson","2009_oncampus_burglary", "2009_oncampus_forciblesexualoffense", "2009_oncampus_negligenthomicide", "2009_oncampus_murder", "2009_oncampus_nonforciblesexualoffense", "2009_oncampus_robbery", "2009_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2009 colnames(crimedata)[131:139] = c("2009_offcampus_aggravated_assault","2009_offcampus_arson","2009_offcampus_burglary", "2009_offcampus_forciblesexualoffense", "2009_offcampus_negligenthomicide", "2009_offcampus_murder", "2009_offcampus_nonforciblesexualoffense", "2009_offcampus_robbery", "2009_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallsecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2009 colnames(crimedata)[140:148] = c("2009_residencehall_aggravated_assault","2009_residencehall_arson","2009_residencehall_burglary", "2009_residencehall_forciblesexualoffense", "2009_residencehall_negligenthomicide", "2009_residencehall_murder", "2009_residencehall_nonforciblesexualoffense", "2009_residencehall_robbery", "2009_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertysecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #public-property 2009 colnames(crimedata)[149:157] = c("2009_publicproperty_aggravated_assault","2009_publicproperty_arson","2009_publicproperty_burglary", "2009_publicproperty_forciblesexualoffense", "2009_publicproperty_negligenthomicide", "2009_publicproperty_murder", "2009_publicproperty_nonforciblesexualoffense", "2009_publicproperty_robbery", "2009_publicproperty_vehicletheft") # 4th previous year crimedata = merge(x = crimedata, subset(oncampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "NEG_M8", "MURD8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2008 colnames(crimedata)[158:166] = c("2008_oncampus_aggravated_assault","2008_oncampus_arson","2008_oncampus_burglary", "2008_oncampus_forciblesexualoffense", "2008_oncampus_negligenthomicide", "2008_oncampus_murder", "2008_oncampus_nonforciblesexualoffense", "2008_oncampus_robbery", "2008_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2008 colnames(crimedata)[167:175] = c("2008_offcampus_aggravated_assault","2008_offcampus_arson","2008_offcampus_burglary", "2008_offcampus_forciblesexualoffense", "2008_offcampus_negligenthomicide", "2008_offcampus_murder", "2008_offcampus_nonforciblesexualoffense", "2008_offcampus_robbery", "2008_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehalltertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2008 colnames(crimedata)[176:184] = c("2008_residencehall_aggravated_assault","2008_residencehall_arson","2008_residencehall_burglary", "2008_residencehall_forciblesexualoffense", "2008_residencehall_negligenthomicide", "2008_residencehall_murder", "2008_residencehall_nonforciblesexualoffense", "2008_residencehall_robbery", "2008_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #public-property 2008 colnames(crimedata)[185:193] = c("2008_publicproperty_aggravated_assault","2008_publicproperty_arson","2008_publicproperty_burglary", "2008_publicproperty_forciblesexualoffense", "2008_publicproperty_negligenthomicide", "2008_publicproperty_murder", "2008_publicproperty_nonforciblesexualoffense", "2008_publicproperty_robbery", "2008_publicproperty_vehicletheft") # locations crimedata = merge(x = crimedata, subset(campuslocations,select=c("UNITID_P","x","y")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[194:195] = c("longitude","latitude") #### save new data to disk # don't include row headers, and write blanks instead of "NA" for missing data values write.csv(crimedata, crimedatafilename, row.names=FALSE, na="")