This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import what we need | |
import csv | |
from fuzzywuzzy import fuzz | |
csv.field_size_limit(1000000000) | |
# what's coming from openrefine? | |
openrefine_persname_1 = 'openrefine_persname_1.csv' | |
openrefine_persname_2 = 'openrefine_persname_2.csv' | |
openrefine_corpname = 'openrefine_corpname.csv' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib2 import urlopen, quote | |
# if you're running python 3, replace the above with the following: | |
# from urllib.request import urlopen | |
# from urllib.parse import quote | |
def retrieve_viaf_search_results(search_index, search_term, auth_source): | |
# url search template formatted to allow easy variable insertion | |
search_url_template = 'http://viaf.org/viaf/search/viaf?query=local.{0}+all+{1}+and+local.sources+any+{2}&sortKeys=holdingscount&httpAccept=application/xml' | |
# since we'll be inserting the three passed variables into the |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
def get_lc_auth_from_viaf_data(response): | |
lc_auth = "" | |
# parse the returned xml into an lxml etree | |
tree = etree.fromstring(response) | |
# extract a list of the VIAF search result nodes using an xpath query | |
results = tree.xpath("//*[local-name()='record']") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
def get_lc_term_name(lc_auth_number): | |
# create the LoC address by inserting the auth id into a template | |
lc_template = "http://id.loc.gov/authorities/names/{0}.html" | |
lc_address = lc_template.format(lc_auth_number) | |
# get the html for that address | |
response = urlopen(lc_address).read() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from fuzzywuzzy import fuzz | |
>>> bentley_term = "Emily Dickinson (1830-1886)" | |
>>> lc_term = "Dickinson, Emily, 1830-1886" | |
>>> fuzz.ratio(bentley_term, lc_term) | |
70 | |
>>> fuzz.ratio("Clark Kent", "Superman") | |
22 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> fuzz.token_sort_ratio(bentley_term, lc_term) | |
100 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def is_same_entity(local_term, lc_term, controlaccess_type): | |
if "geogname" in controlaccess_type: | |
# geognames are a simple check. Returns true if the | |
# similarity is > 95; else false | |
similarity = fuzz.token_sort_ratio(local_term, lc_term) | |
return similarity > 95 | |
elif "corpname" in controlaccess_type: | |
# replace some common abbreviations with their full forms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# we're only going to use the etree module (short for "element tree") | |
from lxml import etree | |
tree = etree.parse("path/to/gargoyle.xml") # replace the path text with your own filesystem path |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extents = tree.xpath("//extent") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# to find all unitid elements whose parent is a did tag: | |
tree.xpath("//did/unitid") | |
# using an absolute path to find exact locations: | |
tree.xpath("/ead/archdesc/did/physdesc/extent") | |
## if there are multiple "extent" tags in the parent physdesc, | |
## you can find specific tags by designating an index | |
## unlike Python lists (and most programming languages), xpath indexes start at 1, not zero |
OlderNewer