Created
October 3, 2013 16:48
-
-
Save sampottinger/6813034 to your computer and use it in GitHub Desktop.
A very simple script to combine entries from the TRACER political finance
activity database that have the exact same names. Should be run from the
command line with two arguments: the location of the CSV file to read entries
from and the location where the resulting de-duplicated / sorted list should
be written to as CSV.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Combine TRACER entries that have the exact same name. | |

A very simple script to combine entries from the TRACER political finance
activity database that have the exact same names. Should be run from the
command line with two arguments: the location of the CSV file to read entries
from and the location where the resulting deduplicated / sorted list should
be written to as CSV.

@author: Sam Pottinger (samnsparky, gleap.org)
@license: MIT
""" | |
import csv | |
import math | |
import sys | |
import constants | |
# Column names, in output order, handed to csv.DictWriter as the field names
# of the resulting CSV file.
RESULT_CSV_FIELDS = [
    'firstName',
    'lastName',
    'address',
    'city',
    'state',
    'zip',
]
class NameIndexedEntry:
    """A decorator around an entry providing access to a simple name hash.

    A decorator around a TRACER database entry that provides access both to the
    original data and a "name hash" or the first name joined with the last name.

    Instances hash, compare equal and order by that name "hash" alone, so two
    entries with the same first and last name are interchangeable as dict keys
    or set members regardless of their other fields.
    """

    def __init__(self, content):
        """Create a new decorator around a TRACER database entry.

        @param content: The original data loaded from a TRACER entry. Must
            provide string values under 'firstName' and 'lastName'.
        @type content: dict
        @raise KeyError: If 'firstName' or 'lastName' is missing.
        """
        self.__content = content
        hash_components = (content['firstName'], content['lastName'])
        self.__hash = ','.join(hash_components)

    def get_content(self):
        """Get the original TRACER database entry.

        @return: The data originally loaded from the TRACER database.
        @rtype: dict
        """
        return self.__content

    def get_str_hash(self):
        """Get the name "hash" for this entry (combined first and last name).

        @return: A string with the "full name" of the individual or organization
            included in this database entry.
        @rtype: str
        """
        return self.__hash

    def __hash__(self):
        return hash(self.get_str_hash())

    # Python 3 ignores __cmp__, so equality and ordering are spelled out as
    # rich comparisons; without __eq__ two entries with the same name would
    # only be equal when they are the very same object.
    def __eq__(self, other):
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() == other.get_str_hash()

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __lt__(self, other):
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() < other.get_str_hash()

    def __le__(self, other):
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() <= other.get_str_hash()

    def __gt__(self, other):
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() > other.get_str_hash()

    def __ge__(self, other):
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() >= other.get_str_hash()

    def __cmp__(self, other):
        # Kept for Python 2 callers of cmp(); written without the cmp()
        # builtin, which no longer exists on Python 3.
        mine = self.get_str_hash()
        theirs = other.get_str_hash()
        return (mine > theirs) - (mine < theirs)
def cluster_entries(entries):
    """Tally how often each distinct entry occurs in the dataset.

    Entries are grouped through their own hashing and equality, so database
    records whose name "hash" perfectly matches that of another record are
    counted under a single key.

    @param entries: The database records to look through.
    @type entries: List of NameIndexedEntry
    @return: Mapping from each distinct entry to the number of times it
        appeared in entries.
    @rtype: dict
    """
    tally = {}
    for record in entries:
        # Assigning to an existing key only replaces the value, so the first
        # record seen for a group remains the key.
        tally[record] = tally.get(record, 0) + 1
    return tally
def main(in_file, out_file):
    """Deduplicate entries based on having the exact same name "hash" value.

    Reads every row of the input CSV, collapses rows that share a name "hash"
    into a single entry, sorts the remaining entries and writes their original
    contents to the output CSV under the RESULT_CSV_FIELDS header.

    @param in_file: The location of the CSV file of entries to deduplicate and
        sort.
    @type in_file: str
    @param out_file: The location where the resulting deduplicated and sorted
        list should be written as a CSV file.
    @type out_file: str
    """
    with open(in_file) as f:
        # Build the list while the file is still open: on Python 3 map() is
        # lazy and would otherwise try to read rows after the file is closed.
        entries = [NameIndexedEntry(row) for row in csv.DictReader(f)]

    clustered_entries = cluster_entries(entries)

    # sorted() accepts the dict directly (iterating its keys) and works on
    # both Python 2 and 3, unlike calling .sort() on the result of keys().
    deduped_entries = sorted(clustered_entries)
    deduped_entries_content = [
        entry.get_content() for entry in deduped_entries
    ]

    with open(out_file, 'w') as f:
        # NOTE(review): DictWriter is created with its default extrasaction,
        # so a row carrying a column outside RESULT_CSV_FIELDS raises
        # ValueError - confirm the input only has these columns.
        result_csv = csv.DictWriter(f, RESULT_CSV_FIELDS)
        result_csv.writeheader()
        result_csv.writerows(deduped_entries_content)
# Only act on the command-line arguments when run as a script, so importing
# this module does not trigger file I/O or print the usage text.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Parenthesized single-string form prints identically as a Python 2
        # print statement and as the Python 3 print function.
        print('USAGE: python dedup_and_sort.py input_file output_file')
    else:
        main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment