|
#!/usr/bin/env python3 |
|
# Require running in Python3 |
|
|
|
# Load our dependencies |
|
import csv |
|
import sys |
|
|
|
# Define our constants |
|
COUNTER_INTERVAL = 100 |
|
|
|
# Define our main function |
|
def main(csv_file): |
|
csv_reader = csv.DictReader(csv_file) |
|
rows_by_mailing_address = {} |
|
for i, row in enumerate(csv_reader): |
|
# TODO: Add counter to invocations |
|
# Read out our addresses in reverse order so we can match nearby |
|
mailing_address = [row['MAILADRS4'], row['MAILADRS3'], row['MAILADRS2'], row['MAILADRS1']] |
|
mailing_address = [item for item in mailing_address if item != ''] |
|
# TODO: Figure out to properly read PROPLOC (e.g. '0000 0000 0000' seems useless) |
|
property_location = row['PROPLOC'] |
|
|
|
# TODO: Handle close but not direct matches |
|
# ['MICHEL M ELKAIM 2002 REVOC', 'BOMAR NICOLE SUCCESSOR TRUS', '2711 OCTAVIA ST APT 403', 'SAN FRANCISCO CA'] |
|
# ['MICHEL M ELKAIM 2002 REVOC', '2711 OCTAVIA ST', 'SAN FRANCISCO CA', ''] |
|
# Ugh, there's a surprisingly large amount of these... |
|
|
|
# If we're at a counter step, log it out |
|
if i % COUNTER_INTERVAL == 0: |
|
print('Processed {i} rows...'.format(i=i), file=sys.stderr) |
|
|
|
# Add our property location to our list of aggregates |
|
# DEV: We're not using `collections.defaultdict` to make porting to JS easier if we need to |
|
mailing_key = '__'.join(mailing_address[0:2]) |
|
if mailing_key not in rows_by_mailing_address: |
|
rows_by_mailing_address[mailing_key] = [] |
|
rows_by_mailing_address[mailing_key].append(row) |
|
|
|
# Dump keys in order of most to least rows |
|
mailing_keys = list(rows_by_mailing_address.keys()) |
|
mailing_keys.sort(key=lambda key: len(rows_by_mailing_address[key]), reverse=True) |
|
|
|
# Dump a simplified version of our aggregated rows |
|
for mailing_key in mailing_keys: |
|
rows = rows_by_mailing_address[mailing_key] |
|
print(len(rows), ' - ', mailing_key) |
|
for row in rows: |
|
print('\t', row['PROPLOC'], row['MAILADRS1'], row['MAILADRS2'], row['MAILADRS3'], row['MAILADRS4']) |
|
|
|
# TODO: Take final list and compare it to business registry |
|
|
|
# If being run directly, invoke our main function with our desired file |
|
if __name__ == '__main__': |
|
with open('2019.1.15__SF_ASR_Secured_Roll_Data_2014-2015.csv', 'r') as csv_file: |
|
main(csv_file) |