dannguyen · November 22, 2016 06:55
diff --git a/Notes-on-stop-and-frisk-data.md b/Notes-on-stop-and-frisk-data.md
diff --git a/nypd-stop-and-frisk-cleanup.bash b/nypd-stop-and-frisk-cleanup.bash
 mkdir -p stop-and-frisk
 cd stop-and-frisk
 # This zip contains data files that I've already converted from the
 # original SAS into CSV fields
 curl http://stash.compciv.org.s3.amazonaws.com/nypd-stop-and-frisk/2003-2013.zip -o data.zip && unzip data.zip

 # Get the common fields, i.e. that are common in the 11 files
 for y in $(seq 2003 2013); do  head -n 1 $y.csv; done | grep -oE '[^,]+' | sort | uniq -c | grep -E '\b11\b'

 # Get the fields that are not common:

 for y in $(seq 2003 2013); do  head -n 1 $y.csv; done | grep -oE '[^,"]+' | sort | uniq -c | grep -v '\b11\b' | sort

 #   1 adrnum
 #   1 adrpct
 #   1 detail1_
 #   1 details_
 #   1 dettyp_c
 #   1 premtyp
 #   1 prenam
 #   1 rescod
 #   1 strintr
 #   1 strname
 #   1 wepfound
 #   3 forceuse
 #  10 addrnum
 #  10 addrpct
 #  10 detailcm
 #  10 dettypcm
 #  10 linecm
 #  10 premname
 #  10 premtype
 #  10 rescode
 #  10 stinter
 #  10 stname
 #
 # file 2006.csv has all the weird things
 # detail1_ and details_ are exact copies of each other

 # and 2006.csv does not have a field for linecm
 # and only 2006 has wepfound field, so we take it out

 mv 2006.csv 2006.csv.old

 head -n 1 2006.csv.old | csvfix exclude -f 80,83 | sed -e  s/adrnum/addrnum/ -e s/adrpct/addrpct/ -e s/details_/detailcm/ \
 -e s/dettyp_c/dettypcm/ -e s/premtyp/premtype/ -e s/prenam/premname/ -e s/rescod/rescode/ -e s/strintr/stinter/ -e s/strname/stname/ |\
 csvfix put -v "linecm" | csvfix put -v 'forceuse' > 2006.csv

 tail -n +2 2006.csv.old  | csvfix exclude -f 80,83 | csvfix put -v "" | csvfix put -v "" >> 2006.csv


 # pad all of the pre-2011 tables with an empty forceuse column
 for y in 2003 2004 2005 2007 2008 2009 2010; do 
  echo $y
  mv $y.csv $y.csv.old
  head -n 1 $y.csv.old  | csvfix put -v 'forceuse' > $y.csv
  tail -n +2 $y.csv.old | csvfix put -v "" >> $y.csv
 done

 # now merge all the files into one big file, with common fields
 common_headers=$(csvfix echo -smq 2013.csv | head -n 1)
 echo $common_headers > 2003-2013.csv
 for y in $(seq 2003 2013); do 
  echo "Adding $y"
  csvfix order -fn "$common_headers" $y.csv | sed 1d | csvfix trim -l -t |  csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' | csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' -e 's/^\([0-9][0-9]\)$/00:\1/' -e 's/^\([0-9]\)$/00:0\1/' >> 2003-2013.csv
 done


 # now make a file with the timestamps set up
 export LC_CTYPE=C 
 export LANG=C
 # http://stackoverflow.com/questions/19242275/re-error-illegal-byte-sequence-on-mac-os-x

 # fixing the datestop and timestop fields
 csvfix trim -l -t 2003-2013.csv |  csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' | csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' -e 's/^\([0-9][0-9]\)$/00:\1/' -e 's/^\([0-9]\)$/00:0\1/' > 2003-2013.timefixed.csv

 # Now load into BigQuery
 gsutil -m cp 2003-2013.csv gs://nypd-stop-and-frisk/2003-2013.csv
 # first, create the table with 2013's data, since it has the most fields
 bq load --project_id='nypd-data' --skip_leading_rows=1 'nypd-data:nypd.stop_and_frisk' gs://nypd-stop-and-frisk/2003-2013.csv year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,perstop,typeofid,explnstp,othpers,arstmade,arstoffn,sumissue,sumoffen,compyear,comppct,offunif,officrid,frisked,searched,contrabn,adtlrept,pistol,riflshot,asltweap,knifcuti,machgun,othrweap,pf_hands,pf_wall,pf_grnd,pf_drwep,pf_ptwep,pf_baton,pf_hcuff,pf_pepsp,pf_other,radio,ac_rept,ac_inves,rf_vcrim,rf_othsw,ac_proxm,rf_attir,cs_objcs,cs_descr,cs_casng,cs_lkout,rf_vcact,cs_cloth,cs_drgtr,ac_evasv,ac_assoc,cs_furtv,rf_rfcmp,ac_cgdir,rf_verbl,cs_vcrim,cs_bulge,cs_other,ac_incid,ac_time,rf_knowl,ac_stsnd,ac_other,sb_hdobj,sb_outln,sb_admis,sb_other,repcmd,revcmd,rf_furt,rf_bulg,offverb,offshld,forceuse,sex,race,dob,age,ht_feet,ht_inch,weight,haircolr,eyecolor,build,othfeatr,addrtyp,rescode,premtype,premname,addrnum,stname,stinter,crossst,aptnum,city,state,zip,addrpct,sector,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm
diff --git a/stop-and-frisk-by-month-year-race.r b/stop-and-frisk-by-month-year-race.r
 library('bigrquery')
 library('dplyr')
 library('ggplot2')
 project_id = 'nypd-data'

 sql = "select datestop, race, count(*) AS stops from [nypd-data:nypd.stop_and_frisk] group by datestop, race"
 data <- query_exec(sql, project = project_id, max_pages = Inf)

 datax <- filter(data, datestop != "") %>%
  mutate(date = as.POSIXct(datestop, tz = 'UTC', origin="1970-01-01")) %>%
   mutate(year = format(date, '%Y'))   %>%
   mutate(month = format(date, '%m')) %>%
   mutate(general_race = ifelse(race == 'B', 'Black',  ifelse(race=='Q' | race == 'P', 'Hispanic', 'Other'))) %>%
  group_by(year, month, general_race) %>%
  summarize(total_stops = sum(stops)) %>%
  mutate(yearmonthday = as.POSIXct(paste(year,month,'01', sep='-'), tz='UTC', origin = '1970-01-01')) %>%
  filter(year >= 2003) # ugly hack, I know

 ggplot(datax, aes(x = yearmonthday, y = total_stops,fill=factor(general_race) )) +
  geom_bar(stat="identity",colour ="gray") + scale_fill_brewer() + geom_smooth() +
  facet_grid(general_race ~ .) +
  theme_minimal() +
  theme(legend.position = "none",text = element_text(family="Gill Sans"), plot.title = element_text(size=20)) +
  ggtitle(expression(atop("NYPD Stop and Frisk data", atop(italic("By Dan Nguyen / Stanford Journalism"), "http://bit.ly/nypdstopnfrisk")))) +
  ylab("Stops per month") + xlab("") + scale_y_continuous(labels = comma)
	mkdir -p stop-and-frisk
	cd stop-and-frisk
	# This zip contains data files that I've already converted from the
	# original SAS into CSV fields
	curl http://stash.compciv.org.s3.amazonaws.com/nypd-stop-and-frisk/2003-2013.zip -o data.zip && unzip data.zip

	# Get the common fields, i.e. that are common in the 11 files
	for y in $(seq 2003 2013); do head -n 1 $y.csv; done \| grep -oE '[^,]+' \| sort \| uniq -c \| grep -E '\b11\b'

	# Get the fields that are not common:

	for y in $(seq 2003 2013); do head -n 1 $y.csv; done \| grep -oE '[^,"]+' \| sort \| uniq -c \| grep -v '\b11\b' \| sort

	# 1 adrnum
	# 1 adrpct
	# 1 detail1_
	# 1 details_
	# 1 dettyp_c
	# 1 premtyp
	# 1 prenam
	# 1 rescod
	# 1 strintr
	# 1 strname
	# 1 wepfound
	# 3 forceuse
	# 10 addrnum
	# 10 addrpct
	# 10 detailcm
	# 10 dettypcm
	# 10 linecm
	# 10 premname
	# 10 premtype
	# 10 rescode
	# 10 stinter
	# 10 stname
	#
	# file 2006.csv has all the weird things
	# detail1_ and details_ are exact copies of each other

	# and 2006.csv does not have a field for linecm
	# and only 2006 has wepfound field, so we take it out

	mv 2006.csv 2006.csv.old

	head -n 1 2006.csv.old \| csvfix exclude -f 80,83 \| sed -e s/adrnum/addrnum/ -e s/adrpct/addrpct/ -e s/details_/detailcm/ \
	-e s/dettyp_c/dettypcm/ -e s/premtyp/premtype/ -e s/prenam/premname/ -e s/rescod/rescode/ -e s/strintr/stinter/ -e s/strname/stname/ \|\
	csvfix put -v "linecm" \| csvfix put -v 'forceuse' > 2006.csv

	tail -n +2 2006.csv.old \| csvfix exclude -f 80,83 \| csvfix put -v "" \| csvfix put -v "" >> 2006.csv


	# pad all of the pre-2011 tables with an empty forceuse column
	for y in 2003 2004 2005 2007 2008 2009 2010; do
	echo $y
	mv $y.csv $y.csv.old
	head -n 1 $y.csv.old \| csvfix put -v 'forceuse' > $y.csv
	tail -n +2 $y.csv.old \| csvfix put -v "" >> $y.csv
	done

	# now merge all the files into one big file, with common fields
	common_headers=$(csvfix echo -smq 2013.csv \| head -n 1)
	echo $common_headers > 2003-2013.csv
	for y in $(seq 2003 2013); do
	echo "Adding $y"
	csvfix order -fn "$common_headers" $y.csv \| sed 1d \| csvfix trim -l -t \| csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' \| csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' -e 's/^\([0-9][0-9]\)$/00:\1/' -e 's/^\([0-9]\)$/00:0\1/' >> 2003-2013.csv
	done


	# now make a file with the timestamps set up
	export LC_CTYPE=C
	export LANG=C
	# http://stackoverflow.com/questions/19242275/re-error-illegal-byte-sequence-on-mac-os-x

	# fixing the datestop and timestop fields
	csvfix trim -l -t 2003-2013.csv \| csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' \| csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' -e 's/^\([0-9][0-9]\)$/00:\1/' -e 's/^\([0-9]\)$/00:0\1/' > 2003-2013.timefixed.csv

	# Now load into BigQuery
	gsutil -m cp 2003-2013.csv gs://nypd-stop-and-frisk/2003-2013.csv
	# first, create the table with 2013's data, since it has the most fields
	bq load --project_id='nypd-data' --skip_leading_rows=1 'nypd-data:nypd.stop_and_frisk' gs://nypd-stop-and-frisk/2003-2013.csv year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,perstop,typeofid,explnstp,othpers,arstmade,arstoffn,sumissue,sumoffen,compyear,comppct,offunif,officrid,frisked,searched,contrabn,adtlrept,pistol,riflshot,asltweap,knifcuti,machgun,othrweap,pf_hands,pf_wall,pf_grnd,pf_drwep,pf_ptwep,pf_baton,pf_hcuff,pf_pepsp,pf_other,radio,ac_rept,ac_inves,rf_vcrim,rf_othsw,ac_proxm,rf_attir,cs_objcs,cs_descr,cs_casng,cs_lkout,rf_vcact,cs_cloth,cs_drgtr,ac_evasv,ac_assoc,cs_furtv,rf_rfcmp,ac_cgdir,rf_verbl,cs_vcrim,cs_bulge,cs_other,ac_incid,ac_time,rf_knowl,ac_stsnd,ac_other,sb_hdobj,sb_outln,sb_admis,sb_other,repcmd,revcmd,rf_furt,rf_bulg,offverb,offshld,forceuse,sex,race,dob,age,ht_feet,ht_inch,weight,haircolr,eyecolor,build,othfeatr,addrtyp,rescode,premtype,premname,addrnum,stname,stinter,crossst,aptnum,city,state,zip,addrpct,sector,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm
	library('bigrquery')
	library('dplyr')
	library('ggplot2')
	project_id = 'nypd-data'

	sql = "select datestop, race, count(*) AS stops from [nypd-data:nypd.stop_and_frisk] group by datestop, race"
	data <- query_exec(sql, project = project_id, max_pages = Inf)

	datax <- filter(data, datestop != "") %>%
	mutate(date = as.POSIXct(datestop, tz = 'UTC', origin="1970-01-01")) %>%
	mutate(year = format(date, '%Y')) %>%
	mutate(month = format(date, '%m')) %>%
	mutate(general_race = ifelse(race == 'B', 'Black', ifelse(race=='Q' \| race == 'P', 'Hispanic', 'Other'))) %>%
	group_by(year, month, general_race) %>%
	summarize(total_stops = sum(stops)) %>%
	mutate(yearmonthday = as.POSIXct(paste(year,month,'01', sep='-'), tz='UTC', origin = '1970-01-01')) %>%
	filter(year >= 2003) # ugly hack, I know

	ggplot(datax, aes(x = yearmonthday, y = total_stops,fill=factor(general_race) )) +
	geom_bar(stat="identity",colour ="gray") + scale_fill_brewer() + geom_smooth() +
	facet_grid(general_race ~ .) +
	theme_minimal() +
	theme(legend.position = "none",text = element_text(family="Gill Sans"), plot.title = element_text(size=20)) +
	ggtitle(expression(atop("NYPD Stop and Frisk data", atop(italic("By Dan Nguyen / Stanford Journalism"), "http://bit.ly/nypdstopnfrisk")))) +
	ylab("Stops per month") + xlab("") + scale_y_continuous(labels = comma)