Last active
November 28, 2018 18:23
-
-
Save raprasad/f7c373e14854edaacb9311e0bec8be5a to your computer and use it in GitHub Desktop.
Script to check that crime incidents exist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""python3 script to check data files | |
For use with New Haven Crime Files on Dataverse: | |
- https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/18J4ZW | |
""" | |
import json | |
from os.path import isfile | |
import sys | |
# ============================================
# Point DATA_DIRECTORY at the folder holding the downloaded files:
#   * 02_incident_types.json  (incident code lookup)
#   * the monthly data files, named like "incidents_2006_03.json"
# ============================================
DATA_DIRECTORY = "dataverse_files"
def msgt(m):
    """Print ``m`` framed above and below by a rule of 40 dashes."""
    rule = '-' * 40
    for line in (rule, m, rule):
        print(line)
# Module-level cache filled by get_incident_lookup() on first successful load.
CODE_LOOKUP = {}


def get_incident_lookup():
    """Return the incident-code lookup, loading it from disk on first use.

    Reads ``02_incident_types.json`` from ``DATA_DIRECTORY`` and maps each
    record's ``pk`` to its ``fields.incident_description``,
    e.g. {1: "MURDER", 2: "MISCONDUCT WITH A M/V"}

    A non-empty result is cached in the module-level ``CODE_LOOKUP`` and
    returned directly on later calls; an empty lookup is not treated as
    cached, so the file is read again on the next call.

    Returns:
        dict mapping incident primary key to incident description.

    Raises:
        SystemExit: with status 1 when the incident types file is missing.
    """
    global CODE_LOOKUP
    if CODE_LOOKUP:
        return CODE_LOOKUP

    fname = '%s/02_incident_types.json' % DATA_DIRECTORY
    if not isfile(fname):
        print('Incidents file not found: %s' % fname)
        # A missing lookup file is a failure, so exit with a non-zero
        # status instead of signalling success to the calling shell.
        sys.exit(1)

    # The context manager closes the file even if parsing fails.
    with open(fname, 'r', encoding='utf-8') as incident_file:
        incident_code_info = json.load(incident_file)

    # --------------------------------------------
    # map incident primary key to incident description
    # (built in one step so a malformed record cannot leave a
    #  half-filled cache behind)
    # --------------------------------------------
    CODE_LOOKUP = {
        info['pk']: info['fields']['incident_description']
        for info in incident_code_info
    }
    return CODE_LOOKUP
def check_file(yyyy, mm, cnt=None):
    """Check one monthly data file's incident codes against the lookup.

    Builds the path ``<DATA_DIRECTORY>/incidents_<yyyy>_<mm>.json``, prints
    a header for it, and, if the file exists, counts how many records have
    a ``fields.incident_type`` that is a key of the dict returned by
    ``get_incident_lookup()``. Prints ``found/total`` followed by
    'Looks good!' when every record matched, otherwise 'Error???' and the
    distinct codes that were not found.

    Args:
        yyyy: year as it appears in the file name, e.g. 2006.
        mm: month as it appears in the file name, e.g. '01'.
        cnt: optional counter shown in the header line.

    Returns:
        None. Results are reported on stdout only.
    """
    monthly_data_file = '%s/incidents_%s_%s.json' % (DATA_DIRECTORY, yyyy, mm)

    # Compare against None so that a counter of 0 is still displayed.
    if cnt is not None:
        msgt('(%s) check file: %s' % (cnt, monthly_data_file))
    else:
        msgt('check file: %s' % monthly_data_file)

    if not isfile(monthly_data_file):
        print('file not found')
        return

    code_lookup = get_incident_lookup()

    # --------------------------------------------
    # Load monthly data file (closed automatically, even on parse errors)
    # --------------------------------------------
    with open(monthly_data_file, 'r', encoding='utf-8') as data_file:
        monthly_data = json.load(data_file)

    # --------------------------------------------
    # Collect every 'fields.incident_type' that has no
    # corresponding key in the code lookup.
    # --------------------------------------------
    unknown_codes = [
        crime_info['fields']['incident_type']
        for crime_info in monthly_data
        if crime_info['fields']['incident_type'] not in code_lookup
    ]
    cnt_found = len(monthly_data) - len(unknown_codes)

    # --------------------------------------------
    # Print # of found codes vs total codes
    # --------------------------------------------
    print('%s/%s' % (cnt_found, len(monthly_data)))
    if not unknown_codes:
        print('Looks good!')
    else:
        print('Error???')
        # Name the offending codes so the mismatch can be investigated.
        distinct_codes = sorted({repr(code) for code in unknown_codes})
        print('Unknown incident codes: %s' % ', '.join(distinct_codes))
def check_data_files(start_year=2006, end_year=2014):
    """Run check_file() on every monthly data file in a range of years.

    Months 01 through 12 are checked for each year, and a running file
    counter (starting at 1) is passed along for the header line.

    Args:
        start_year: first year to check (inclusive). Defaults to 2006.
        end_year: last year to check (inclusive). Defaults to 2014.
    """
    fcnt = 0
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            fcnt += 1
            # File names use a zero-padded, two-digit month ("01".."12").
            check_file(year, '%02d' % month, fcnt)
def _main():
    """Check January 2006 on its own, then check every monthly file."""
    check_file(2006, '01')
    check_data_files()


if __name__ == '__main__':
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment