Skip to content

Instantly share code, notes, and snippets.

@joshz
Created April 28, 2011 21:55
Show Gist options
  • Save joshz/947439 to your computer and use it in GitHub Desktop.
Save joshz/947439 to your computer and use it in GitHub Desktop.
def fix_addresses():
"""
checks file that already only have multiple addresses
"""
print "reading input file"
input_file = open('inputfile.csv')
reader = csv.reader(input_file,
dialect='excel',
delimiter=',')
addresses = defaultdict(list)
header_temp = dict()
fixed_entry_list =[]
print "loading up dictionary"
# load up all the addresses into a dictionary {company_name: [[row], [row], ..., [row]]}
for row in reader:
if(row[COMPANY_ADDRESS] not in address(addresses[row[COMPANY_NAME]])):
addresses[row[COMPANY_NAME]].append(row)
fixed_entry_list.append(addresses["LCA_CASE_EMPLOYER_NAME"][0])
del(addresses["LCA_CASE_EMPLOYER_NAME"])
for company_name, address_list in addresses.items():
addrlist = []
print "recafoobling %s" %company_name
# create a list of only address tupples
for addr in address_list:
addrlist.append((addr[COMPANY_ADDRESS],
addr[COMPANY_CITY],
addr[COMPANY_STATE],
addr[COMPANY_ZIP]))
new_addr_list = group_most_similar(addrlist, 0.75)
for i, addr in enumerate(address_list):
entry = addr
entry[COMPANY_ADDRESS] = new_addr_list[i][0]
fixed_entry_list.append(entry)
return fixed_entry_list
def group_most_similar(addrlist, threshold):
"""group most similar addresses"""
sm = []
for item in addrlist:
row = []
for citem in addrlist:
sim = ngram.NGram.compare(" ".join(item), " ".join(citem) , N=1)
row.append(sim)
sm.append(row)
# groups of sets of similar and unique addresses
groups = []
for rindex, row in enumerate(sm):
group = set()
group.add(addrlist[rindex])
for cindex, item in enumerate(row):
if item >= threshold:
group.add(addrlist[cindex])
groups.append(group)
# holds all similar addresses, this is a union of two sets ex. {2, 3}U{1, 2, 3}={1, 2, 3}
allsimilar = set()
# holds all addresses, including above sets
allsimilargroup = []
# combinations of sets, to check if any set is a subset of another set
pool = combinations(groups, 2)
for item in pool:
if item[0].issubset(item[1]):
allsimilar = item[0].union(item[1])
# some similar sets are duplicated, don't include them in group
if allsimilar not in allsimilargroup:
# convert to list so it can be indexed later
allsimilargroup.append(list(allsimilar))
# creates final list of hopefully fixed addresses
for addr_set in allsimilargroup:
# picks first address from
for address in addr_set:
for i, addr in enumerate(addrlist):
if addr == address:
addrlist[i] = addr_set[0]
# list looks just like addrlist, but with similar addresses collapsed to a single address
return addrlist
COMPANY_ADDRESS = 5
COMPANY_NAME = 4
COMPANY_CITY = 6
COMPANY_STATE = 7
COMPANY_ZIP = 8
def split():
input_file = open('input_file.csv')
reader = csv.reader(input_file,
dialect='excel',
delimiter=',')
addresses = defaultdict(list)
# put addresses in a dictionary of {company_name: [(address1), (address2), ... ]
for row in reader:
# compare address in row with list of addresses for company, flatten list of tuples, extract street
if(row[COMPANY_ADDRESS].lower().strip().rstrip() not in [sublist[0] for sublist in addresses[row[4]]]):
addresses[row[COMPANY_NAME]].append((row[COMPANY_ADDRESS].lower().strip().rstrip(),
row[COMPANY_CITY],
row[COMPANY_STATE],
row[COMPANY_ZIP]))
multiple = {}
mul = 0
for k, v in addresses.items():
if len(v) > 1:
multiple[k] = v
mul += 1
print mul
writer_unique = csv.writer(open("companies_unique2.csv", 'wb'),
delimiter=',',
dialect='excel',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
writer_multiple = csv.writer(open("companies_multiple2.csv", 'wb'),
delimiter=',',
dialect='excel',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
input_file.seek(0)
# write companies with multiple addresses to a separate file from companies with single address
for row in reader:
header = row
break
writer_unique.writerow(header)
writer_multiple.writerow(header)
for row in reader:
if multiple.has_key(row[COMPANY_NAME]):
writer_multiple.writerow(row)
else:
writer_unique.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment