Last active
February 13, 2021 21:32
-
-
Save bhelx/36be62f1ed50609ff6fdd802c86b684b to your computer and use it in GitHub Desktop.
Refactoring the [neighborhood annotation script](https://github.com/codefornola/nola-neighborhood-annotation) to utilize [the nolabase](https://github.com/codefornola/nolabase/) instead of using all the data and doing the computation locally. This demonstrates using the nolabase as a dependency to a community run application or tool. Run it by p…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
# -*- coding: utf-8 -*- | |
# Refactoring the [neighborhood annotation script](https://github.com/codefornola/nola-neighborhood-annotation) to | |
# utilize [the nolabase](https://github.com/codefornola/nolabase/) instead of using all the data | |
# and doing the computation locally. | |
# This demonstrates using the nolabase as a dependency to a community run application or tool. | |
# Run it by passing a connection string and an input and output csv file: | |
# ``` | |
# # get 2021 calls for service data as a test | |
# curl https://data.nola.gov/resource/3pha-hum9.csv > calls.csv | |
# python3 annotate.py postgresql://nolabaseuser:[email protected]/nolabase calls.csv output.csv | |
# ``` | |
import sys | |
import csv | |
import re | |
import psycopg2 | |
from urllib.parse import urlparse | |
lat_lng_rg = re.compile('.*?([+-]?\\d*\\.\\d+)(?![-+0-9\\.]).*?([+-]?\\d*\\.\\d+)(?![-+0-9\\.])') | |
def parse_lat_lng(lat_lng_string): | |
""" | |
Turns the Location column into (lat, lng) floats | |
May look like this "(29.98645605, -90.06910049)" | |
May have degree symbol "(29.98645605°,-90.06910049°)" | |
""" | |
m = lat_lng_rg.search(lat_lng_string) | |
if m: | |
return (float(m.group(1)), float(m.group(2))) | |
else: | |
return (None, None) | |
# If I were to use this in production, I'd change this to be a batch | |
# call to the database instead of doing one row at a time | |
def find_neighborhood(conn, lat, lng): | |
cur = conn.cursor() | |
sql = """ | |
SELECT name FROM geometries.neighborhoods AS n WHERE | |
st_within(ST_GeomFromText('POINT(%f %f)', 4326), n.geom) LIMIT 1 | |
""" % (lat, lng) | |
cur.execute(sql) | |
rows = cur.fetchone() | |
if rows is None or len(rows) <= 0: | |
return "N/A" | |
return rows[0] | |
def annotate_csv(conn, in_file, out_file): | |
""" | |
Goes row by row through the in_file and | |
writes out the row to the out_file with | |
the new Neighbhorhood column | |
""" | |
reader = csv.reader(in_file) | |
writer = csv.writer(out_file) | |
# Write headers first, add new neighborhood column | |
headers = next(reader) | |
headers.append('Neighborhood') | |
writer.writerow(headers) | |
for row in reader: | |
# WGS84 point, "Location" column, is last element | |
lat, lng = parse_lat_lng(row[-1]) | |
if lat and lng: | |
neighborhood = find_neighborhood(conn, lat, lng) | |
else: | |
neighborhood = 'N/A' | |
row.append(neighborhood) | |
writer.writerow(row) | |
print("#%s lat: %s lng: %s -> %s" % (reader.line_num, lat, lng, | |
neighborhood)) | |
def print_help(): | |
help = """ | |
Usage: | |
python annotate.py postgresql://username:[email protected]/nolabase input.csv output.csv | |
""" | |
print(help) | |
if __name__ == '__main__': | |
if len(sys.argv) < 3: | |
print_help() | |
sys.exit() | |
parts = urlparse(sys.argv[1]) | |
conn = psycopg2.connect( | |
user=parts.username, | |
password=parts.password, | |
dbname=parts.path[1:], | |
host=parts.hostname, | |
port=parts.port | |
) | |
in_file_path = sys.argv[2] | |
out_file_path = sys.argv[3] | |
with open(in_file_path, 'r') as in_file: | |
with open(out_file_path, 'w') as out_file: | |
annotate_csv(conn, in_file, out_file) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment