Skip to content

Instantly share code, notes, and snippets.

@phreakin
Created July 15, 2022 00:10
Show Gist options
  • Save phreakin/623f0fec08559e8d8aae9ff645f15310 to your computer and use it in GitHub Desktop.
Save phreakin/623f0fec08559e8d8aae9ff645f15310 to your computer and use it in GitHub Desktop.
Simple python script to grab the crime data for Mesa, Arizona
#!/usr/bin/python
import pandas as pd
import os
path = 'D:/IdeaProjects/Dara.gov/data/'  # Directory where the data is stored
url = 'https://data.mesaaz.gov/resource/39rt-2rfj.csv'  # URL of the data
filename = 'mesa_police_incidents.csv'  # Name of the file to save the raw data


def download_if_missing(source_url, target):
    """Download the CSV at *source_url* to *target* unless *target* exists.

    Args:
        source_url: Location of the CSV to fetch (anything ``pd.read_csv``
            accepts).
        target: Full path of the local file to create.

    Raises:
        Exception: Whatever ``pd.read_csv``, ``os.makedirs`` or ``to_csv``
            raise; the caller decides how to report the failure.
    """
    if os.path.exists(target):
        print('File already exists. Skipping Download.')
        print('File is located at ' + target)
        return

    # Announce the download before starting it, not after it has finished.
    print('File not found. Downloading...')
    # Create the destination directory first; to_csv does not create
    # missing parent folders on its own.
    directory = os.path.dirname(target)
    if directory:
        os.makedirs(directory, exist_ok=True)
    pd.read_csv(source_url).to_csv(target, index=False)
    print('File downloaded and saved to ' + target)


def clean_incidents(df):
    """Return a cleaned copy of the incident data; *df* is left untouched.

    Rows are removed when they:
      * repeat an earlier (crime_id, crime_type) pair -- the first
        occurrence is kept;
      * have no crime_id, since the row cannot be identified without it;
      * have no latitude or longitude, since the row cannot be mapped.

    Args:
        df: DataFrame with ``crime_id``, ``crime_type``, ``latitude`` and
            ``longitude`` columns.

    Returns:
        A new DataFrame containing only the surviving rows.
    """
    # drop_duplicates returns a new frame; the result must be kept,
    # otherwise the duplicates are never actually removed.
    deduped = df.drop_duplicates(subset=['crime_id', 'crime_type'], keep='first')
    return deduped.dropna(subset=['crime_id', 'latitude', 'longitude'])


def main():
    """Fetch the raw incident file if needed, clean it, and save the result."""
    raw_file = path + filename
    clean_file = path + 'mesa_police_incidents_clean.csv'

    try:
        download_if_missing(url, raw_file)
    except Exception as e:
        # Stop with the error message; processing must not start when the
        # raw file could not be obtained.
        print(e)
        print('Error downloading file. Please try again.')
        # NOTE: POSIX exit statuses are 8-bit, so 500 is reported as 244 there.
        raise SystemExit(500)

    print('Processing Data...')
    # Read the data into a dataframe
    print('Reading file...')
    df = pd.read_csv(raw_file)
    print('File read...')

    df = clean_incidents(df)
    print('Duplicate crime_id dropped...')
    print('NaN crime_id and latitude,longitude dropped...')
    print('Missing values removed...')

    # Save the cleaned dataframe to a new csv file next to the raw data
    df.to_csv(clean_file, index=False)
    print('Clean data file saved to ' + clean_file)
    # We're done!
    print('Processing complete.')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment