Created
March 11, 2017 02:47
-
-
Save travishathaway/1e6ffd0544ca7803f0e0ae4c630aaccf to your computer and use it in GitHub Desktop.
Download Civic Apps Crime Report Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# Author: Travis Hathaway | |
# | |
# Description: | |
# This file downloads crime report data from portlandoregon.gov's FTP server | |
# and then loads it in to a postgres database. | |
# | |
# How to use: | |
# Simply change the DB_* variables to fit your local installation. I assume that | |
# you are using a passwordless Postgres account. | |
# | |
# Requirements: | |
# This script requires that "csvkit" be installed. https://csvkit.readthedocs.io/en/1.0.1/ | |
TMP_DIR='.tmp' | |
FTP_URL='ftp://ftp02.portlandoregon.gov/CivicApps/' | |
FIRST_YEAR='2004' | |
YEARS=$(echo 200{4..9} 201{0..4}) | |
FILE_NAME='crime_incident_data_' | |
FILE_EXTENSION='.zip' | |
CSV_FILE_NAME='crime_incident_data.csv' | |
QUOTE_CHAR='"' | |
DELIMITER=',' | |
DB_TABLE='' | |
DB_USER='' | |
DB_NAME='' | |
# Make the tmp dir, that's where the files go | |
mkdir -p "$TMP_DIR" && cd "$TMP_DIR" | |
# Download all the files we need | |
for year in $YEARS ; | |
do | |
echo "$FTP_URL""$FILE_NAME""$year""$FILE_EXTENSION" | |
curl -O "$FTP_URL""$FILE_NAME""$year""$FILE_EXTENSION" | |
done | |
# Unzip all the files | |
for file in $(ls *.zip); | |
do | |
unzip "$file" | |
mv "$CSV_FILE_NAME" $(echo "$file" | sed -e 's/.zip/.csv/') | |
rm "$file" | |
done | |
first_csv_file=$(ls *.csv | head -n 1) | |
right_head=$(head -n 1 "$first_csv_file" | perl -p -e 'tr/A-Z, /a-z,_/') | |
echo "DROP TABLE $DB_TABLE" | psql $DB_NAME -U $DB_USER | |
for year in $YEARS; | |
do | |
echo "Processing year $year..." | |
csv_file="$FILE_NAME""$year".csv | |
# Lower case the first row of CSV headers, because it's easier to make queries | |
# in Postgres this way. | |
sed -e "1s/.*/$right_head/" -i '' $csv_file | |
if [ $FIRST_YEAR == $year ] | |
then | |
head -n 500 "$FILE_NAME$year.csv" > short_file.csv | |
# Okay, so here where stuff gets a little tricky. csvsql is pretty slow, so we | |
# only use it to create our table. That's what it's good at | |
csvsql --db postgresql://$DB_USER@localhost/$DB_NAME \ | |
--table portland_crime_stats \ | |
--no-constraints \ | |
--insert short_file.csv | |
rm short_file.csv | |
# Now we're going to delete the only row that was in that table, we only used | |
# it to get the schema definitions | |
echo "TRUNCATE $DB_TABLE" | psql $DB_NAME -U $DB_USER | |
echo "ALTER TABLE $DB_TABLE ALTER COLUMN police_district TYPE varchar(10)" | psql $DB_NAME -U $DB_USER | |
# Finally, let's try to copy it directly in. | |
echo "COPY $DB_TABLE FROM '$(pwd)/$csv_file' | |
DELIMITER '$DELIMITER' QUOTE '$QUOTE_CHAR' FORCE NULL x_coordinate,y_coordinate CSV HEADER " | \ | |
psql $DB_NAME -U $DB_USER | |
else | |
echo "COPY $DB_TABLE FROM '$(pwd)/$csv_file' | |
DELIMITER '$DELIMITER' QUOTE '$QUOTE_CHAR' FORCE NULL x_coordinate,y_coordinate CSV HEADER " | \ | |
psql $DB_NAME -U $DB_USER | |
fi | |
done | |
# Clean up after ourselves | |
rm -rf "$TMP_DIR" | |
# Get us back to where we belong | |
cd - |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment