Skip to content

Instantly share code, notes, and snippets.

@walkerdb
Forked from eckardm/makenamedictionary.py
Last active August 29, 2015 14:25
Show Gist options
  • Save walkerdb/ca22319840109c817370 to your computer and use it in GitHub Desktop.
Save walkerdb/ca22319840109c817370 to your computer and use it in GitHub Desktop.
Creates a dictionary using the Name and LC Record Link columns from OpenRefine.
# import what we need
import csv
import sys

from fuzzywuzzy import fuzz
# Raise the csv module's per-field size limit so very long cells can be read.
csv.field_size_limit(10 ** 9)
# what's coming from openrefine?
openrefine_persname_1, openrefine_persname_2 = (
    'openrefine_persname_1.csv',
    'openrefine_persname_2.csv',
)
openrefine_corpname = 'openrefine_corpname.csv'
# empty dictionaries, filled further down (original name -> LC record link)
persnames_dictionary = dict()
corpnames_dictionary = dict()
# print 'Creating persname dictionary.'
# persnames
def persnames(openrefine_persname):
    """Add personal names from an OpenRefine CSV export to persnames_dictionary.

    Each data row is read as (original name, authority name, record link).
    A row is kept only when its link starts with the
    http://id.loc.gov/authorities/names/ prefix and the fuzzy ratio between
    the original and authority names is at least 80. Kept rows are stored as
    persnames_dictionary[original] = link, so a later row with the same
    original name overwrites an earlier one.

    Args:
        openrefine_persname: path of the CSV file to read; its first row is
            skipped as a header.
    """
    lc_names_prefix = 'http://id.loc.gov/authorities/names/'
    spinner_frames = '|/-\\'
    # 'rb' is the mode the Python 2 csv module expects for its input file.
    with open(openrefine_persname, 'rb') as csv_file:
        reader = csv.reader(csv_file)
        # skip the first row
        next(reader, None)
        # make dictionary
        for row_number, row in enumerate(reader):
            # One spinner frame per row, so successive rows rotate the
            # spinner instead of writing every frame for every row.
            frame = spinner_frames[row_number % len(spinner_frames)]
            sys.stdout.write('\rWorking on it... ' + frame)
            # csv.reader yields [] for a blank line; skip rows that lack the
            # three expected columns instead of failing with IndexError.
            if len(row) < 3:
                continue
            original, authority, link = row[:3]
            if not link.startswith(lc_names_prefix):
                continue
            # Only trust the link when the two name forms are similar enough.
            if fuzz.ratio(original, authority) >= 80:
                persnames_dictionary[original] = link
# Load both persname exports, in order, into persnames_dictionary.
for persname_csv in (openrefine_persname_1, openrefine_persname_2):
    persnames(persname_csv)
# The leading carriage return overwrites the progress line.
print('\rPersname dictionary created.')
print('Creating corpname dictionary.')
# corpnames
# Only links under this prefix are accepted.
lc_names_prefix = 'http://id.loc.gov/authorities/names/'
spinner_frames = '|/-\\'
# 'rb' is the mode the Python 2 csv module expects for its input file.
with open(openrefine_corpname, 'rb') as corpnames:
    openrefine_corpname_reader = csv.reader(corpnames)
    # skip the first row
    next(openrefine_corpname_reader, None)
    # make dictionary
    for row_number, row in enumerate(openrefine_corpname_reader):
        # One spinner frame per row, so successive rows rotate the spinner
        # instead of writing every frame for every row.
        frame = spinner_frames[row_number % len(spinner_frames)]
        sys.stdout.write('\rWorking on it... ' + frame)
        # csv.reader yields [] for a blank line; skip rows that lack the
        # three expected columns instead of failing with IndexError.
        if len(row) < 3:
            continue
        original, authority, link = row[:3]
        if not link.startswith(lc_names_prefix):
            continue
        # Only trust the link when the two name forms are similar enough.
        if fuzz.ratio(original, authority) >= 80:
            corpnames_dictionary[original] = link
# The leading carriage return overwrites the progress line.
print('\rCorpname dictionary created.')
print('Writing to constants.')
# where is constants.py?
constants = 'constants.py'
# put the dictionaries in constants.py
# Append mode keeps whatever constants.py already contains; the dictionaries
# are written as Python assignments using their str() form.
with open(constants, "a") as txt_file:
    txt_file.write('# dictionary for persnames\npersnames_dictionary = ' + str(persnames_dictionary))
    txt_file.write('\n\n# dictionary for corpnames\ncorpnames_dictionary = ' + str(corpnames_dictionary))
    # Terminate the last line so the file ends with a newline and any later
    # append (for example a re-run of this script) starts on its own line.
    txt_file.write('\n')
print('Written to constants.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment