Last active
August 29, 2015 14:05
-
-
Save technickle/2fc673c3a3222a1d93ae to your computer and use it in GitHub Desktop.
Revisions
-
technickle revised this gist
Dec 2, 2014 . 1 changed file with 49 additions and 35 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,7 +10,7 @@ # geocoding should be in a CSV with at least UNITID_P, x, y, and county; # place this file in the same folder with the .sav files # 5) modify this script to accommodate new year values (beyond the config) # 6) run this script: rscript state-ope-security-data.r # # this script produces almost 200 columns of data; # 14 general columns + (5 years x 4 location types x 9 crime types) @@ -21,13 +21,14 @@ # FUTURE: fix SPSS import so it drops text field whitespace and remove workaround # FUTURE: hyphenate the zipcode column (source data has 9-digit numbers for ZIP+4s) # FUTURE: have the script grab the data directly from the website # FUTURE: make the year selections and merges configuration-driven # FUTURE: make the entire script driven by configuration parameters # ############################################################################## # Configuration # set this to the folder where you placed your OPE .sav files & geocoded .csv # you must have write access to this folder; the output file gets put here too setwd("~/Campus-Crime/") # set this to the 2-letter abbreviation for the state you want to compile statefilter = "NY" @@ -76,10 +77,20 @@ publicpropertytertiary_state = subset(publicpropertytertiary, State == statefilt #### build new data # create a new dataframe with the common fields from all records # we assume that the most recent year is the master list of institutions we want. Any institutions/campuses in previous years that are not in current year are ignored crimedata = subset(oncampusprimary_state, select=c("UNITID_P", "INSTNM", "BRANCH", "Address", "City")) # rename column headers in this new dataframe colnames(crimedata)[1:5] = c("UNITID_P", "institution_name", "branch", "address", "city") print("initial campus columns...") # merge county column from campuslocations file crimedata = merge(x = crimedata, subset(campuslocations, select=c("UNITID_P","County")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[6] = "county" print("merged county column...") # merge rest of basic campus data crimedata = merge(x = crimedata, subset(oncampusprimary_state, select=c("UNITID_P", "State", "ZIP", "sector_cd", "Sector_desc", "men_total", "women_total", "Total")), by="UNITID_P",all.x=TRUE) colnames(crimedata)[7:13] = c("state", "zip", "sector_cd", "sector_desc", "men_total", "women_total", "total") print("additional campus columns...") # trim specific text columns (SPSS import includes whitespace) trim = function (x) gsub("^\\s+|\\s+$", "", x) @@ -89,65 +100,68 @@ crimedata$address = trim(crimedata$address) crimedata$city = trim(crimedata$city) crimedata$sector_desc = trim(crimedata$sector_desc) # strip down zip codes to 5 digits (removes trailing spaces and +4 codes) crimedata$zip = substr(crimedata$zip,1,5) # begin merging (appending) crime data columns from each year (36 columns per year!) # current year # merge the data for the first year by each of the location types crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "NEG_M12", "MURD12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2012 # rename the column headers for the newly merged year/location - column order MUST match the above subset # there is probably a more clever way to do this using substitution/regex colnames(crimedata)[14:22] = c("2012_oncampus_aggravated_assault","2012_oncampus_arson","2012_oncampus_burglary", "2012_oncampus_forciblesexualoffense", "2012_oncampus_negligenthomicide", "2012_oncampus_murder", "2012_oncampus_nonforciblesexualoffense", "2012_oncampus_robbery", "2012_oncampus_vehicletheft") # rinse and repeat for each year and location type crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2012 colnames(crimedata)[23:31] = c("2012_offcampus_aggravated_assault","2012_offcampus_arson","2012_offcampus_burglary", "2012_offcampus_forciblesexualoffense", "2012_offcampus_negligenthomicide", "2012_offcampus_murder", "2012_offcampus_nonforciblesexualoffense", "2012_offcampus_robbery", "2012_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2012 colnames(crimedata)[32:40] = c("2012_residencehall_aggravated_assault","2012_residencehall_arson","2012_residencehall_burglary", "2012_residencehall_forciblesexualoffense", "2012_residencehall_negligenthomicide", "2012_residencehall_murder", "2012_residencehall_nonforciblesexualoffense", "2012_residencehall_robbery", "2012_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #public-property 2012 colnames(crimedata)[41:49] = c("2012_publicproperty_aggravated_assault","2012_publicproperty_arson","2012_publicproperty_burglary", "2012_publicproperty_forciblesexualoffense", "2012_publicproperty_negligenthomicide", "2012_publicproperty_murder", "2012_publicproperty_nonforciblesexualoffense", "2012_publicproperty_robbery", "2012_publicproperty_vehicletheft") # previous year crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "NEG_M11", "MURD11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2011 colnames(crimedata)[50:58] = c("2011_oncampus_aggravated_assault","2011_oncampus_arson","2011_oncampus_burglary", "2011_oncampus_forciblesexualoffense", "2011_oncampus_negligenthomicide", "2011_oncampus_murder", "2011_oncampus_nonforciblesexualoffense", "2011_oncampus_robbery", "2011_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2011 colnames(crimedata)[59:67] = c("2011_offcampus_aggravated_assault","2011_offcampus_arson","2011_offcampus_burglary", "2011_offcampus_forciblesexualoffense", "2011_offcampus_negligenthomicide", "2011_offcampus_murder", "2011_offcampus_nonforciblesexualoffense", "2011_offcampus_robbery", "2011_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2011 colnames(crimedata)[68:76] = c("2011_residencehall_aggravated_assault","2011_residencehall_arson","2011_residencehall_burglary", "2011_residencehall_forciblesexualoffense", "2011_residencehall_negligenthomicide", "2011_residencehall_murder", "2011_residencehall_nonforciblesexualoffense", "2011_residencehall_robbery", "2011_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #public-property 2011 colnames(crimedata)[77:85] = c("2011_publicproperty_aggravated_assault","2011_publicproperty_arson","2011_publicproperty_burglary", "2011_publicproperty_forciblesexualoffense", "2011_publicproperty_negligenthomicide", "2011_publicproperty_murder", "2011_publicproperty_nonforciblesexualoffense", "2011_publicproperty_robbery", "2011_publicproperty_vehicletheft") # 2nd previous year crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "NEG_M10", "MURD10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2010 colnames(crimedata)[86:94] = c("2010_oncampus_aggravated_assault","2010_oncampus_arson","2010_oncampus_burglary", "2010_oncampus_forciblesexualoffense", "2010_oncampus_negligenthomicide", "2010_oncampus_murder", "2010_oncampus_nonforciblesexualoffense", "2010_oncampus_robbery", "2010_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2010 colnames(crimedata)[95:103] = c("2010_offcampus_aggravated_assault","2010_offcampus_arson","2010_offcampus_burglary", "2010_offcampus_forciblesexualoffense", "2010_offcampus_negligenthomicide", "2010_offcampus_murder", "2010_offcampus_nonforciblesexualoffense", "2010_offcampus_robbery", "2010_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2010 colnames(crimedata)[104:112] = c("2010_residencehall_aggravated_assault","2010_residencehall_arson","2010_residencehall_burglary", "2010_residencehall_forciblesexualoffense", "2010_residencehall_negligenthomicide", "2010_residencehall_murder", "2010_residencehall_nonforciblesexualoffense", "2010_residencehall_robbery", "2010_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #public-property 2010 colnames(crimedata)[113:121] = c("2010_publicproperty_aggravated_assault","2010_publicproperty_arson","2010_publicproperty_burglary", "2010_publicproperty_forciblesexualoffense", "2010_publicproperty_negligenthomicide", "2010_publicproperty_murder", "2010_publicproperty_nonforciblesexualoffense", "2010_publicproperty_robbery", "2010_publicproperty_vehicletheft") # 3rd previous year crimedata = merge(x = crimedata, subset(oncampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "NEG_M9", "MURD9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2009 colnames(crimedata)[122:130] = c("2009_oncampus_aggravated_assault","2009_oncampus_arson","2009_oncampus_burglary", "2009_oncampus_forciblesexualoffense", "2009_oncampus_negligenthomicide", "2009_oncampus_murder", "2009_oncampus_nonforciblesexualoffense", "2009_oncampus_robbery", "2009_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2009 colnames(crimedata)[131:139] = c("2009_offcampus_aggravated_assault","2009_offcampus_arson","2009_offcampus_burglary", "2009_offcampus_forciblesexualoffense", "2009_offcampus_negligenthomicide", "2009_offcampus_murder", "2009_offcampus_nonforciblesexualoffense", "2009_offcampus_robbery", "2009_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallsecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2009 colnames(crimedata)[140:148] = c("2009_residencehall_aggravated_assault","2009_residencehall_arson","2009_residencehall_burglary", "2009_residencehall_forciblesexualoffense", "2009_residencehall_negligenthomicide", "2009_residencehall_murder", "2009_residencehall_nonforciblesexualoffense", "2009_residencehall_robbery", "2009_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertysecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #public-property 2009 colnames(crimedata)[149:157] = c("2009_publicproperty_aggravated_assault","2009_publicproperty_arson","2009_publicproperty_burglary", "2009_publicproperty_forciblesexualoffense", "2009_publicproperty_negligenthomicide", "2009_publicproperty_murder", "2009_publicproperty_nonforciblesexualoffense", "2009_publicproperty_robbery", "2009_publicproperty_vehicletheft") # 4th previous year crimedata = merge(x = crimedata, subset(oncampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "NEG_M8", "MURD8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2008 colnames(crimedata)[158:166] = c("2008_oncampus_aggravated_assault","2008_oncampus_arson","2008_oncampus_burglary", "2008_oncampus_forciblesexualoffense", "2008_oncampus_negligenthomicide", "2008_oncampus_murder", "2008_oncampus_nonforciblesexualoffense", "2008_oncampus_robbery", "2008_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2008 colnames(crimedata)[167:175] = c("2008_offcampus_aggravated_assault","2008_offcampus_arson","2008_offcampus_burglary", "2008_offcampus_forciblesexualoffense", "2008_offcampus_negligenthomicide", "2008_offcampus_murder", "2008_offcampus_nonforciblesexualoffense", "2008_offcampus_robbery", "2008_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehalltertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2008 colnames(crimedata)[176:184] = c("2008_residencehall_aggravated_assault","2008_residencehall_arson","2008_residencehall_burglary", "2008_residencehall_forciblesexualoffense", "2008_residencehall_negligenthomicide", "2008_residencehall_murder", "2008_residencehall_nonforciblesexualoffense", "2008_residencehall_robbery", "2008_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #public-property 2008 colnames(crimedata)[185:193] = c("2008_publicproperty_aggravated_assault","2008_publicproperty_arson","2008_publicproperty_burglary", "2008_publicproperty_forciblesexualoffense", "2008_publicproperty_negligenthomicide", "2008_publicproperty_murder", "2008_publicproperty_nonforciblesexualoffense", "2008_publicproperty_robbery", "2008_publicproperty_vehicletheft") # locations crimedata = merge(x = crimedata, subset(campuslocations,select=c("UNITID_P","x","y")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[194:195] = c("longitude","latitude") #### save new data to disk # don't include row headers, and write blanks instead of "NA" for missing data values -
technickle revised this gist
Aug 22, 2014 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,7 +7,7 @@ # Residencehallcrime*.sav, and Publicpropertycrime*.sav # out to a folder of your choosing. It should have 12 files when done # 4) use the one of the most recent year's .sav files to geocode addresses; # geocoding should be in a CSV with at least UNITID_P, x, y, and county; # place this file in the same folder with the .sav files # 5) modify this script to accommodate new year values (beyond the config) # 6) run it @@ -146,8 +146,8 @@ crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("U colnames(crimedata)[184:192] = c("2008_publicproperty_aggravated_assault","2008_publicproperty_arson","2008_publicproperty_burglary", "2008_publicproperty_forciblesexualoffense", "2008_publicproperty_negligenthomicide", "2008_publicproperty_murder", "2008_publicproperty_nonforciblesexualoffense", "2008_publicproperty_robbery", "2008_publicproperty_vehicletheft") # locations crimedata = merge(x = crimedata, subset(campuslocations,select=c("UNITID_P","x","y","county")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[193:195] = c("x","y","county") #### save new data to disk # don't include row headers, and write blanks instead of "NA" for missing data values -
technickle revised this gist
Aug 19, 2014 . 1 changed file with 0 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -115,8 +115,6 @@ colnames(crimedata)[67:75] = c("2011_residencehall_aggravated_assault","2011_res crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #public-property 2011 colnames(crimedata)[76:84] = c("2011_publicproperty_aggravated_assault","2011_publicproperty_arson","2011_publicproperty_burglary", "2011_publicproperty_forciblesexualoffense", "2011_publicproperty_negligenthomicide", "2011_publicproperty_murder", "2011_publicproperty_nonforciblesexualoffense", "2011_publicproperty_robbery", "2011_publicproperty_vehicletheft") # 2nd previous year crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "NEG_M10", "MURD10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2010 colnames(crimedata)[85:93] = c("2010_oncampus_aggravated_assault","2010_oncampus_arson","2010_oncampus_burglary", "2010_oncampus_forciblesexualoffense", "2010_oncampus_negligenthomicide", "2010_oncampus_murder", "2010_oncampus_nonforciblesexualoffense", "2010_oncampus_robbery", "2010_oncampus_vehicletheft") -
technickle revised this gist
Aug 19, 2014 . 1 changed file with 2 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -37,6 +37,8 @@ crimedatafilename = "crimedata.csv" # ############################################################################## require(foreign) #### load most last 5 years of data from files # most recent 3 years are in the latest file oncampusprimary = read.spss("oncampuscrime101112.sav", to.data.frame=TRUE) -
technickle created this gist
Aug 19, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,154 @@ # This R script grabs 5 years of OPE security data for a specified state # # How to use: # 1) Go to http://www.ope.ed.gov/security/GetDownloadFile.aspx # 2) download SPSS version of 3 most recent sets (past 5 years of data) # 3) from each zip file, copy oncampuscrime*.sav, noncampuscrime*.sav, # Residencehallcrime*.sav, and Publicpropertycrime*.sav # out to a folder of your choosing. It should have 12 files when done # 4) use the one of the most recent year's .sav files to geocode addresses; # geocoding should be in a CSV with at least UNITID_P, x, and y fields; # place this file in the same folder with the .sav files # 5) modify this script to accommodate new year values (beyond the config) # 6) run it # # this script produces almost 200 columns of data; # 14 general columns + (5 years x 4 location types x 9 crime types) # # TODO: Sort output data by institution name and branch # # FUTURE: fix merges so they don't require "x = crimedata" declaration # FUTURE: fix SPSS import so it drops text field whitespace and remove workaround # FUTURE: hyphenate the zipcode column (source data has 9-digit numbers for ZIP+4s) # FUTURE: have the script grab the data directly from the website # FUTURE: make the entire script driven by configuration parameters # ############################################################################## # Configuration # set this to the folder where you placed your OPE .sav files & geocoded .csv # you must have write access to this folder; the output file gets put here too setwd("~/CampusCrime/") # set this to the 2-letter abbreviation for the state you want to compile statefilter = "NY" # set this to the name of the file you want to output crimedatafilename = "crimedata.csv" # ############################################################################## #### load most last 5 years of data from files # most recent 3 years are in the latest file oncampusprimary = read.spss("oncampuscrime101112.sav", to.data.frame=TRUE) offcampusprimary = read.spss("noncampuscrime101112.sav", to.data.frame=TRUE) residencehallprimary = read.spss("Residencehallcrime101112.sav", to.data.frame=TRUE) publicpropertyprimary = read.spss("Publicpropertycrime101112.sav", to.data.frame=TRUE) # 2 oldest years are in the second-latest and third-latest set of files, respectively oncampussecondary = read.spss("oncampuscrime091011.sav", to.data.frame=TRUE) offcampussecondary = read.spss("noncampuscrime091011.sav", to.data.frame=TRUE) residencehallsecondary = read.spss("Residencehallcrime091011.sav", to.data.frame=TRUE) publicpropertysecondary = read.spss("Publicpropertycrime091011.sav", to.data.frame=TRUE) oncampustertiary = read.spss("oncampuscrime080910.sav", to.data.frame=TRUE) offcampustertiary = read.spss("noncampuscrime080910.sav", to.data.frame=TRUE) residencehalltertiary = read.spss("Residencehallcrime080910.sav", to.data.frame=TRUE) publicpropertytertiary = read.spss("Publicpropertycrime080910.sav", to.data.frame=TRUE) #### load campus locations campuslocations = read.csv("campuslocations.csv") #### filter all years for just NYS data oncampusprimary_state = subset(oncampusprimary, State == statefilter) offcampusprimary_state = subset(offcampusprimary, State == statefilter) residencehallprimary_state = subset(residencehallprimary, State == statefilter) publicpropertyprimary_state = subset(publicpropertyprimary, State == statefilter) oncampussecondary_state = subset(oncampussecondary, State == statefilter) offcampussecondary_state = subset(offcampussecondary, State == statefilter) residencehallsecondary_state = subset(residencehallsecondary, State == statefilter) publicpropertysecondary_state = subset(publicpropertysecondary, State == statefilter) oncampustertiary_state = subset(oncampustertiary, State == statefilter) offcampustertiary_state = subset(offcampustertiary, State == statefilter) residencehalltertiary_state = subset(residencehalltertiary, State == statefilter) publicpropertytertiary_state = subset(publicpropertytertiary, State == statefilter) #### build new data # create a new dataframe with the common fields from all records # we assume that the most recent year is the master list of institutions we want. Any institutions/campuses in previous years that are not in current year are ignored crimedata = subset(oncampusprimary_state, select=c(UNITID_P, INSTNM, BRANCH, Address, City, State, ZIP, sector_cd, Sector_desc, men_total, women_total, Total)) # rename column headers in this new dataframe colnames(crimedata) <- c("UNITID_P", "institution_name", "branch", "address", "city", "state", "zip", "sector_cd", "sector_desc", "men_total", "women_total", "total") # trim specific text columns (SPSS import includes whitespace) trim = function (x) gsub("^\\s+|\\s+$", "", x) crimedata$institution_name = trim(crimedata$institution_name) crimedata$branch = trim(crimedata$branch) crimedata$address = trim(crimedata$address) crimedata$city = trim(crimedata$city) crimedata$sector_desc = trim(crimedata$sector_desc) # begin merging (appending) crime data columns from each year (36 columns per year!) # current year #merge the data for the first year and location type crimedata = merge(x = crimedata, subset(oncampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "NEG_M12", "MURD12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2012 #rename the column headers for the newly merged year/location - column order MUST match the above subset #there is probably a more clever way to do this using substitution/regex colnames(crimedata)[13:21] = c("2012_oncampus_aggravated_assault","2012_oncampus_arson","2012_oncampus_burglary", "2012_oncampus_forciblesexualoffense", "2012_oncampus_negligenthomicide", "2012_oncampus_murder", "2012_oncampus_nonforciblesexualoffense", "2012_oncampus_robbery", "2012_oncampus_vehicletheft") #rinse and repeat for each year and location type crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2012 colnames(crimedata)[22:30] = c("2012_offcampus_aggravated_assault","2012_offcampus_arson","2012_offcampus_burglary", "2012_offcampus_forciblesexualoffense", "2012_offcampus_negligenthomicide", "2012_offcampus_murder", "2012_offcampus_nonforciblesexualoffense", "2012_offcampus_robbery", "2012_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2012 colnames(crimedata)[31:39] = c("2012_residencehall_aggravated_assault","2012_residencehall_arson","2012_residencehall_burglary", "2012_residencehall_forciblesexualoffense", "2012_residencehall_negligenthomicide", "2012_residencehall_murder", "2012_residencehall_nonforciblesexualoffense", "2012_residencehall_robbery", "2012_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A12", "ARSON12", "BURGLA12", "FORCIB12", "MURD12", "NEG_M12", "NONFOR12", "ROBBE12", "VEHIC12")), by=c("UNITID_P"), all.x=TRUE) #public-property 2012 colnames(crimedata)[40:48] = c("2012_publicproperty_aggravated_assault","2012_publicproperty_arson","2012_publicproperty_burglary", "2012_publicproperty_forciblesexualoffense", "2012_publicproperty_negligenthomicide", "2012_publicproperty_murder", "2012_publicproperty_nonforciblesexualoffense", "2012_publicproperty_robbery", "2012_publicproperty_vehicletheft") # previous year crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "NEG_M11", "MURD11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2011 colnames(crimedata)[49:57] = c("2011_oncampus_aggravated_assault","2011_oncampus_arson","2011_oncampus_burglary", "2011_oncampus_forciblesexualoffense", "2011_oncampus_negligenthomicide", "2011_oncampus_murder", "2011_oncampus_nonforciblesexualoffense", "2011_oncampus_robbery", "2011_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2011 colnames(crimedata)[58:66] = c("2011_offcampus_aggravated_assault","2011_offcampus_arson","2011_offcampus_burglary", "2011_offcampus_forciblesexualoffense", "2011_offcampus_negligenthomicide", "2011_offcampus_murder", "2011_offcampus_nonforciblesexualoffense", "2011_offcampus_robbery", "2011_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2011 colnames(crimedata)[67:75] = c("2011_residencehall_aggravated_assault","2011_residencehall_arson","2011_residencehall_burglary", "2011_residencehall_forciblesexualoffense", "2011_residencehall_negligenthomicide", "2011_residencehall_murder", "2011_residencehall_nonforciblesexualoffense", "2011_residencehall_robbery", "2011_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A11", "ARSON11", "BURGLA11", "FORCIB11", "MURD11", "NEG_M11", "NONFOR11", "ROBBE11", "VEHIC11")), by=c("UNITID_P"), all.x=TRUE) #public-property 2011 colnames(crimedata)[76:84] = c("2011_publicproperty_aggravated_assault","2011_publicproperty_arson","2011_publicproperty_burglary", "2011_publicproperty_forciblesexualoffense", "2011_publicproperty_negligenthomicide", "2011_publicproperty_murder", "2011_publicproperty_nonforciblesexualoffense", "2011_publicproperty_robbery", "2011_publicproperty_vehicletheft") print(nrow(crimedata)) # 2nd previous year crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "NEG_M10", "MURD10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2010 colnames(crimedata)[85:93] = c("2010_oncampus_aggravated_assault","2010_oncampus_arson","2010_oncampus_burglary", "2010_oncampus_forciblesexualoffense", "2010_oncampus_negligenthomicide", "2010_oncampus_murder", "2010_oncampus_nonforciblesexualoffense", "2010_oncampus_robbery", "2010_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampusprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2010 colnames(crimedata)[94:102] = c("2010_offcampus_aggravated_assault","2010_offcampus_arson","2010_offcampus_burglary", "2010_offcampus_forciblesexualoffense", "2010_offcampus_negligenthomicide", "2010_offcampus_murder", "2010_offcampus_nonforciblesexualoffense", "2010_offcampus_robbery", "2010_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2010 colnames(crimedata)[103:111] = c("2010_residencehall_aggravated_assault","2010_residencehall_arson","2010_residencehall_burglary", "2010_residencehall_forciblesexualoffense", "2010_residencehall_negligenthomicide", "2010_residencehall_murder", "2010_residencehall_nonforciblesexualoffense", "2010_residencehall_robbery", "2010_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertyprimary_state,select=c("UNITID_P", "AGG_A10", "ARSON10", "BURGLA10", "FORCIB10", "MURD10", "NEG_M10", "NONFOR10", "ROBBE10", "VEHIC10")), by=c("UNITID_P"), all.x=TRUE) #public-property 2010 colnames(crimedata)[112:120] = c("2010_publicproperty_aggravated_assault","2010_publicproperty_arson","2010_publicproperty_burglary", "2010_publicproperty_forciblesexualoffense", "2010_publicproperty_negligenthomicide", "2010_publicproperty_murder", "2010_publicproperty_nonforciblesexualoffense", "2010_publicproperty_robbery", "2010_publicproperty_vehicletheft") # 3rd previous year crimedata = merge(x = crimedata, subset(publicpropertysecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "NEG_M9", "MURD9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2009 colnames(crimedata)[121:129] = c("2009_oncampus_aggravated_assault","2009_oncampus_arson","2009_oncampus_burglary", "2009_oncampus_forciblesexualoffense", "2009_oncampus_negligenthomicide", "2009_oncampus_murder", "2009_oncampus_nonforciblesexualoffense", "2009_oncampus_robbery", "2009_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampussecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2009 colnames(crimedata)[130:138] = c("2009_offcampus_aggravated_assault","2009_offcampus_arson","2009_offcampus_burglary", "2009_offcampus_forciblesexualoffense", "2009_offcampus_negligenthomicide", "2009_offcampus_murder", "2009_offcampus_nonforciblesexualoffense", "2009_offcampus_robbery", "2009_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehallsecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2009 colnames(crimedata)[139:147] = c("2009_residencehall_aggravated_assault","2009_residencehall_arson","2009_residencehall_burglary", "2009_residencehall_forciblesexualoffense", "2009_residencehall_negligenthomicide", "2009_residencehall_murder", "2009_residencehall_nonforciblesexualoffense", "2009_residencehall_robbery", "2009_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertysecondary_state,select=c("UNITID_P", "AGG_A9", "ARSON9", "BURGLA9", "FORCIB9", "MURD9", "NEG_M9", "NONFOR9", "ROBBE9", "VEHIC9")), by=c("UNITID_P"), all.x=TRUE) #public-property 2009 colnames(crimedata)[148:156] = c("2009_publicproperty_aggravated_assault","2009_publicproperty_arson","2009_publicproperty_burglary", "2009_publicproperty_forciblesexualoffense", "2009_publicproperty_negligenthomicide", "2009_publicproperty_murder", "2009_publicproperty_nonforciblesexualoffense", "2009_publicproperty_robbery", "2009_publicproperty_vehicletheft") # 4th previous year crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "NEG_M8", "MURD8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #on-campus 2008 colnames(crimedata)[157:165] = c("2008_oncampus_aggravated_assault","2008_oncampus_arson","2008_oncampus_burglary", "2008_oncampus_forciblesexualoffense", "2008_oncampus_negligenthomicide", "2008_oncampus_murder", "2008_oncampus_nonforciblesexualoffense", "2008_oncampus_robbery", "2008_oncampus_vehicletheft") crimedata = merge(x = crimedata, subset(offcampustertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #off-campus 2008 colnames(crimedata)[166:174] = c("2008_offcampus_aggravated_assault","2008_offcampus_arson","2008_offcampus_burglary", "2008_offcampus_forciblesexualoffense", "2008_offcampus_negligenthomicide", "2008_offcampus_murder", "2008_offcampus_nonforciblesexualoffense", "2008_offcampus_robbery", "2008_offcampus_vehicletheft") crimedata = merge(x = crimedata, subset(residencehalltertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #residence-hall 2008 colnames(crimedata)[175:183] = c("2008_residencehall_aggravated_assault","2008_residencehall_arson","2008_residencehall_burglary", "2008_residencehall_forciblesexualoffense", "2008_residencehall_negligenthomicide", "2008_residencehall_murder", "2008_residencehall_nonforciblesexualoffense", "2008_residencehall_robbery", "2008_residencehall_vehicletheft") crimedata = merge(x = crimedata, subset(publicpropertytertiary_state,select=c("UNITID_P", "AGG_A8", "ARSON8", "BURGLA8", "FORCIB8", "MURD8", "NEG_M8", "NONFOR8", "ROBBE8", "VEHIC8")), by=c("UNITID_P"), all.x=TRUE) #public-property 2008 colnames(crimedata)[184:192] = c("2008_publicproperty_aggravated_assault","2008_publicproperty_arson","2008_publicproperty_burglary", "2008_publicproperty_forciblesexualoffense", "2008_publicproperty_negligenthomicide", "2008_publicproperty_murder", "2008_publicproperty_nonforciblesexualoffense", "2008_publicproperty_robbery", "2008_publicproperty_vehicletheft") # locations crimedata = merge(x = crimedata, subset(campuslocations,select=c("UNITID_P","x","y")), by="UNITID_P", all.x=TRUE) colnames(crimedata)[193:194] = c("x","y") #### save new data to disk # don't include row headers, and write blanks instead of "NA" for missing data values write.csv(crimedata, crimedatafilename, row.names=FALSE, na="")