Created
April 28, 2011 21:55
-
-
Save joshz/947439 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def fix_addresses(): | |
""" | |
checks file that already only have multiple addresses | |
""" | |
print "reading input file" | |
input_file = open('inputfile.csv') | |
reader = csv.reader(input_file, | |
dialect='excel', | |
delimiter=',') | |
addresses = defaultdict(list) | |
header_temp = dict() | |
fixed_entry_list =[] | |
print "loading up dictionary" | |
# load up all the addresses into a dictionary {company_name: [[row], [row], ..., [row]]} | |
for row in reader: | |
if(row[COMPANY_ADDRESS] not in address(addresses[row[COMPANY_NAME]])): | |
addresses[row[COMPANY_NAME]].append(row) | |
fixed_entry_list.append(addresses["LCA_CASE_EMPLOYER_NAME"][0]) | |
del(addresses["LCA_CASE_EMPLOYER_NAME"]) | |
for company_name, address_list in addresses.items(): | |
addrlist = [] | |
print "recafoobling %s" %company_name | |
# create a list of only address tupples | |
for addr in address_list: | |
addrlist.append((addr[COMPANY_ADDRESS], | |
addr[COMPANY_CITY], | |
addr[COMPANY_STATE], | |
addr[COMPANY_ZIP])) | |
new_addr_list = group_most_similar(addrlist, 0.75) | |
for i, addr in enumerate(address_list): | |
entry = addr | |
entry[COMPANY_ADDRESS] = new_addr_list[i][0] | |
fixed_entry_list.append(entry) | |
return fixed_entry_list |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def group_most_similar(addrlist, threshold): | |
"""group most similar addresses""" | |
sm = [] | |
for item in addrlist: | |
row = [] | |
for citem in addrlist: | |
sim = ngram.NGram.compare(" ".join(item), " ".join(citem) , N=1) | |
row.append(sim) | |
sm.append(row) | |
# groups of sets of similar and unique addresses | |
groups = [] | |
for rindex, row in enumerate(sm): | |
group = set() | |
group.add(addrlist[rindex]) | |
for cindex, item in enumerate(row): | |
if item >= threshold: | |
group.add(addrlist[cindex]) | |
groups.append(group) | |
# holds all similar addresses, this is a union of two sets ex. {2, 3}U{1, 2, 3}={1, 2, 3} | |
allsimilar = set() | |
# holds all addresses, including above sets | |
allsimilargroup = [] | |
# combinations of sets, to check if any set is a subset of another set | |
pool = combinations(groups, 2) | |
for item in pool: | |
if item[0].issubset(item[1]): | |
allsimilar = item[0].union(item[1]) | |
# some similar sets are duplicated, don't include them in group | |
if allsimilar not in allsimilargroup: | |
# convert to list so it can be indexed later | |
allsimilargroup.append(list(allsimilar)) | |
# creates final list of hopefully fixed addresses | |
for addr_set in allsimilargroup: | |
# picks first address from | |
for address in addr_set: | |
for i, addr in enumerate(addrlist): | |
if addr == address: | |
addrlist[i] = addr_set[0] | |
# list looks just like addrlist, but with similar addresses collapsed to a single address | |
return addrlist |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
COMPANY_ADDRESS = 5 | |
COMPANY_NAME = 4 | |
COMPANY_CITY = 6 | |
COMPANY_STATE = 7 | |
COMPANY_ZIP = 8 | |
def split(): | |
input_file = open('input_file.csv') | |
reader = csv.reader(input_file, | |
dialect='excel', | |
delimiter=',') | |
addresses = defaultdict(list) | |
# put addresses in a dictionary of {company_name: [(address1), (address2), ... ] | |
for row in reader: | |
# compare address in row with list of addresses for company, flatten list of tuples, extract street | |
if(row[COMPANY_ADDRESS].lower().strip().rstrip() not in [sublist[0] for sublist in addresses[row[4]]]): | |
addresses[row[COMPANY_NAME]].append((row[COMPANY_ADDRESS].lower().strip().rstrip(), | |
row[COMPANY_CITY], | |
row[COMPANY_STATE], | |
row[COMPANY_ZIP])) | |
multiple = {} | |
mul = 0 | |
for k, v in addresses.items(): | |
if len(v) > 1: | |
multiple[k] = v | |
mul += 1 | |
print mul | |
writer_unique = csv.writer(open("companies_unique2.csv", 'wb'), | |
delimiter=',', | |
dialect='excel', | |
quotechar='"', | |
quoting=csv.QUOTE_MINIMAL) | |
writer_multiple = csv.writer(open("companies_multiple2.csv", 'wb'), | |
delimiter=',', | |
dialect='excel', | |
quotechar='"', | |
quoting=csv.QUOTE_MINIMAL) | |
input_file.seek(0) | |
# write companies with multiple addresses to a separate file from companies with single address | |
for row in reader: | |
header = row | |
break | |
writer_unique.writerow(header) | |
writer_multiple.writerow(header) | |
for row in reader: | |
if multiple.has_key(row[COMPANY_NAME]): | |
writer_multiple.writerow(row) | |
else: | |
writer_unique.writerow(row) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment