Example of Python Dedupe Gazetteer
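This snippet shows how to bootstrap a dedupe.Gazetteer from the output of an earlier Dedupe run. The loader below reads the clustered CSV that dedupe writes (it expects Cluster ID, id, and confidence_score columns plus the record fields), keeps one record per cluster as a canonical set, treats the rest as messy records, and turns the clusters themselves into labeled pairs for markPairs(). The gist also calls three project-specific helpers (get_dedupe_field_names, get_dedupe_fields, clean_record) that it never defines; hedged sketches of what they might look like are appended after my_gazetteer().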
import csv
import os
import random

import dedupe


# Gazetteer ---------------------------------------------------------------------------------------------------------
def load_deduped_output_for_gazetteer(filename):
    """
    Parse the output from dedupe into:
        1. a canonical dataset (one record per cluster, no dupes)
        2. messy data - all the remaining dupes
        3. labeled pairs for markPairs().
    :return: data_d, messy_d, labeled_examples
    """
    with open(filename) as f_input:
        reader = csv.DictReader(f_input)
        field_names = get_dedupe_field_names()  # a function used by both Dedupe and Gazetteer

        # For markPairs
        dups = {}  # dups[cluster_id] = [a record from the cluster, another record from the cluster]
        singletons = []

        # For Gazetteer.sample()
        data_d = {}
        messy_d = {}
        clusters_used = set()

        # Parse the results output by dedupe.
        for row in reader:
            cluster_id = row['Cluster ID']
            row_id = int(row['id'])

            # The data is already clean; just replace blank fields with None.
            clean_row = {k: row[k] or None for k in field_names}

            # Put one record from each cluster into data_d; the rest go into messy_d.
            if cluster_id in clusters_used:
                messy_d[row_id] = dict(clean_row)
            else:
                clusters_used.add(cluster_id)
                data_d[row_id] = dict(clean_row)

            # Singletons do not have a confidence score.
            if row['confidence_score']:
                # Keep at most two records per cluster for the 'match' pairs.
                if cluster_id not in dups:
                    dups[cluster_id] = []
                if len(dups[cluster_id]) < 2:
                    dups[cluster_id].append(clean_row)
            else:
                singletons.append(clean_row)
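    # Design note (added commentary): the labeled pairs built below are bootstrapped
    # from dedupe's own clustering instead of hand labeling -- two records that were
    # clustered together become a 'match' pair, and two randomly chosen singletons
    # are assumed to be 'distinct'. That assumption can mislabel a pair if two true
    # duplicates were never clustered, so treat these labels as noisy.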
    # markPairs wants this dict.
    labeled_examples = {'match': [], 'distinct': []}

    # Load matches ------------------------------------------------------------------------------------------
    for pair in dups.values():
        labeled_examples['match'].append((pair[0], pair[1]))

    # Make a set of distinct pairs --------------------------------------------------------------------------
    # Make a shuffled list of indices into singletons, then pop pairs off the end.
    ss = list(range(len(singletons)))
    random.shuffle(ss)  # shuffle is in-place

    n_distinct_pairs = 50 * len(dups)  # cap on the number of distinct pairs
    while True:
        try:
            i1 = ss.pop()
            i2 = ss.pop()
        except IndexError:
            break
        labeled_examples['distinct'].append((singletons[i1], singletons[i2]))
        if len(labeled_examples['distinct']) > n_distinct_pairs:
            break

    return data_d, messy_d, labeled_examples
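
# A minimal sketch of the input CSV this loader expects -- an assumption inferred
# from the columns read above (dedupe's csv_example writes a file of this shape).
# Rows that share a Cluster ID are duplicates; singletons get their own cluster id
# and a blank confidence_score:
#
#   Cluster ID,confidence_score,id,first_name,last_name,...
#   0,0.97,1,Groucho,Marx,...
#   0,0.97,2,Groucho,Marks,...
#   1,,3,Harpo,Marx,...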
# https://www.snip2code.com/Snippet/460447/address_matcher-py
def my_gazetteer():
    filename = 'dedupe_params/test_output.csv'  # output file from a previous dedupe run
    g_settings_file = 'dedupe_params/g_learned_settings.dat'
    g_training_file = 'dedupe_params/g_training.json'

    data_d, messy_d, labeled_examples = load_deduped_output_for_gazetteer(filename)

    # If a settings file already exists, just load that and skip training.
    if os.path.exists(g_settings_file):
        print('reading from', g_settings_file)
        with open(g_settings_file, 'rb') as f:
            linker = dedupe.StaticGazetteer(f)
    else:
        fields = get_dedupe_fields()
        linker = dedupe.Gazetteer(fields)
        linker.sample(messy_d, data_d, 3000)

        # Since we are using markPairs, we do not need to do manual labeling
        # with dedupe.consoleLabel(linker).
        linker.markPairs(labeled_examples)
        linker.train()

        with open(g_training_file, 'w') as tf:
            linker.writeTraining(tf)
        with open(g_settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        linker.cleanupTraining()

    linker.index(data_d)
    threshold = linker.threshold(data_d)
    clustered_dupes = linker.match(messy_d, threshold=threshold)
    print('Found {} duplicate sets out of {} messy records'.format(len(clustered_dupes), len(messy_d)))

    # Try matching a single new record against the index.
    test = {1234: {'id': 1234, 'first_name': 'Groucho', 'last_name': 'Marx', 'account_name': 'ROI Park',
                   'email1': '[email protected]',
                   'phone_work': '', 'phone_mobile': '', 'primary_address_state': 'WI'}}
    c_test = {1234: clean_record(test[1234])}

    # If the possible dup is very different from the training set, match() can raise a
    # ValueError. It turns out this can happen even for records that look similar to
    # the training set.
    try:
        matches = linker.match(c_test, threshold)
    except ValueError:
        return []

    if matches:
        print('{} has matches'.format(test))
    else:
        print('{} does not match any record'.format(test))
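
# ---------------------------------------------------------------------------------------------------------------------
# The helpers below are NOT part of the original gist -- it references them without
# defining them. These are hypothetical sketches of one plausible shape, assuming a
# fixed field list inferred from the `test` record in my_gazetteer(); adjust them to
# match your own schema.

FIELD_NAMES = ['first_name', 'last_name', 'account_name', 'email1',
               'phone_work', 'phone_mobile', 'primary_address_state']


def get_dedupe_field_names():
    """Column names that Dedupe/Gazetteer compare (assumed)."""
    return FIELD_NAMES


def get_dedupe_fields():
    """Dedupe field definitions: one String variable per column (assumed)."""
    return [{'field': name, 'type': 'String', 'has missing': True} for name in FIELD_NAMES]


def clean_record(record):
    """Replace blank values with None, mirroring load_deduped_output_for_gazetteer()."""
    return {k: record[k] or None for k in FIELD_NAMES}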
if __name__ == '__main__':
    my_gazetteer()