Skip to content

Instantly share code, notes, and snippets.

@sampottinger
Created October 3, 2013 16:48
Show Gist options
  • Save sampottinger/6813034 to your computer and use it in GitHub Desktop.
Save sampottinger/6813034 to your computer and use it in GitHub Desktop.
A very simple script to combine entries from the TRACER political finance activity database that have the exact same names. Should be run from the command line with two arguments: the location of the CSV file to read entries from and the location where the resulting de-duplicated / sorted list should be written to as CSV.
"""Combine TRACER entries that have the exact same name.
A very simple script to combine entries from the TRACER political finance
activity database that have the exact same names. Should be run from the
command line with two arguments: the location of the CSV file to read entries
from and the location where the resulting deduplicated / sorted list should
be written to as CSV.
@author: Sam Pottinger (samnsparky, gleap.org)
@license: MIT
"""
import csv
import math
import sys
import constants
# Column names, in output order, handed to csv.DictWriter by main() when it
# writes the de-duplicated CSV. 'firstName' and 'lastName' are also the two
# keys NameIndexedEntry reads to build its name hash.
RESULT_CSV_FIELDS = ['firstName', 'lastName', 'address', 'city', 'state', 'zip']
class NameIndexedEntry:
    """A decorator around an entry providing access to a simple name hash.

    A decorator around a TRACER database entry that provides access both to
    the original data and a "name hash" or the first name joined with the last
    name. Equality, hashing, and ordering all depend only on that name hash,
    so two entries with the same first and last name act as the same
    dictionary key even when their other fields differ.
    """

    def __init__(self, content):
        """Create a new decorator around a TRACER database entry.

        @param content: The original data loaded from a TRACER entry. Its
            'firstName' and 'lastName' values must be strings because they
            are joined to form the name hash.
        @type content: dict
        @raise KeyError: If content has no 'firstName' or 'lastName' key.
        """
        self.__content = content
        hash_components = (content['firstName'], content['lastName'])
        self.__hash = ','.join(hash_components)

    def get_content(self):
        """Get the original TRACER database entry.

        @return: The data originally loaded from the TRACER database.
        @rtype: dict
        """
        return self.__content

    def get_str_hash(self):
        """Get the name "hash" for this entry (combined first and last name).

        @return: A string with the "full name" of the individual or
            organization included in this database entry, formatted as
            the first name, a comma, then the last name.
        @rtype: str
        """
        return self.__hash

    def __hash__(self):
        """Hash on the name hash so equal names share a dictionary slot."""
        return hash(self.get_str_hash())

    # Rich comparisons are spelled out (rather than relying on __cmp__ alone)
    # because Python 3 ignores __cmp__ entirely: without __eq__ every instance
    # would be a distinct dictionary key, and without __lt__ sorting fails.
    # Each returns NotImplemented for foreign types so Python can try the
    # reflected operation or its default behaviour.

    def __eq__(self, other):
        """Report whether other is an entry with the same name hash."""
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() == other.get_str_hash()

    def __ne__(self, other):
        """Report whether other is an entry with a different name hash."""
        # Python 2 does not derive __ne__ from __eq__, so define it here.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __lt__(self, other):
        """Order entries by comparing their name hash strings."""
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() < other.get_str_hash()

    def __le__(self, other):
        """Order entries by comparing their name hash strings."""
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() <= other.get_str_hash()

    def __gt__(self, other):
        """Order entries by comparing their name hash strings."""
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() > other.get_str_hash()

    def __ge__(self, other):
        """Order entries by comparing their name hash strings."""
        if not isinstance(other, NameIndexedEntry):
            return NotImplemented
        return self.get_str_hash() >= other.get_str_hash()

    def __cmp__(self, other):
        """Three-way compare two entries by name hash.

        Kept for callers that invoke it directly. It avoids the cmp()
        builtin, which does not exist on Python 3.

        @param other: The entry to compare against; anything exposing
            get_str_hash() is accepted.
        @return: -1, 0, or 1 if this entry's name hash sorts before, equal
            to, or after the other's.
        @rtype: int
        """
        mine = self.get_str_hash()
        theirs = other.get_str_hash()
        return (mine > theirs) - (mine < theirs)
def cluster_entries(entries):
    """Count the number of times a given entry appears in the dataset.

    Count the number of times a record in the database appears that has a
    name "hash" perfectly matching the name "hash" of another database record.
    Records are grouped by their own hashing and equality, so any hashable
    items can be counted.

    @param entries: The database records to look through.
    @type entries: List of NameIndexedEntry
    @return: Mapping from each distinct record to the number of records in
        entries that compare equal to it. Empty if entries is empty.
    @rtype: dict
    """
    counts = {}
    for entry in entries:
        # A single get-with-default replaces the separate membership test
        # and zero initialisation.
        counts[entry] = counts.get(entry, 0) + 1
    return counts
def main(in_file, out_file):
    """Deduplicate entries based on having the exact same name "hash" value.

    Reads every row of the input CSV, keeps one entry per distinct name
    "hash", sorts the surviving entries, and writes them to the output CSV
    using the columns listed in RESULT_CSV_FIELDS.

    @param in_file: The location of the CSV file of entries to deduplicate and
        sort.
    @type in_file: str
    @param out_file: The location where the resulting deduplicated and sorted
        list should be written as a CSV file.
    @type out_file: str
    """
    # Materialise the rows while the file is still open. csv.DictReader reads
    # lazily, and map() is lazy too on Python 3, so deferring the work past
    # the end of the with block would read from a closed file.
    with open(in_file) as f:
        entries = [NameIndexedEntry(row) for row in csv.DictReader(f)]

    clustered_entries = cluster_entries(entries)

    # Iterating the mapping yields its keys (the distinct entries). sorted()
    # works where dict.keys().sort() does not, because keys() is a view
    # rather than a list on Python 3.
    deduped_entries = sorted(clustered_entries)
    deduped_entries_content = [
        entry.get_content() for entry in deduped_entries
    ]

    with open(out_file, 'w') as f:
        result_csv = csv.DictWriter(f, RESULT_CSV_FIELDS)
        result_csv.writeheader()
        result_csv.writerows(deduped_entries_content)
# Only act as a command line tool when executed directly, so that importing
# this module (e.g. to reuse NameIndexedEntry) does not try to process the
# importing program's arguments.
if __name__ == '__main__':
    # Exactly two arguments are expected after the script name: the input
    # CSV path and the output CSV path.
    if len(sys.argv) != 3:
        # Call form of print with a single string behaves the same on
        # Python 2 and Python 3.
        print('USAGE: python dedup_and_sort.py input_file output_file')
    else:
        main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment