walkerdb’s gists

walkerdb / makedictionary.py

Last active August 29, 2015 14:25 — forked from eckardm/makenamedictionary.py

Creates a dictionary using the Name and LC Record Link columns from OpenRefine.

	# import what we need
	import csv
	from fuzzywuzzy import fuzz

	csv.field_size_limit(1000000000)

	# what's coming from OpenRefine?
	openrefine_persname_1 = 'openrefine_persname_1.csv'
	openrefine_persname_2 = 'openrefine_persname_2.csv'
	openrefine_corpname = 'openrefine_corpname.csv'

walkerdb / viaf_snippet_1.py

Last active August 29, 2015 14:25

	from urllib2 import urlopen, quote
	# if you're running python 3, replace the above with the following:
	# from urllib.request import urlopen
	# from urllib.parse import quote

	def retrieve_viaf_search_results(search_index, search_term, auth_source):
	# url search template formatted to allow easy variable insertion
	search_url_template = 'http://viaf.org/viaf/search/viaf?query=local.{0}+all+{1}+and+local.sources+any+{2}&sortKeys=holdingscount&httpAccept=application/xml'

	# since we'll be inserting the three passed variables into the

walkerdb / viaf_snippet_2.py

Last active August 29, 2015 14:25

	from lxml import etree

	def get_lc_auth_from_viaf_data(response):
	lc_auth = ""

	# parse the returned xml into an lxml etree
	tree = etree.fromstring(response)

	# extract a list of the VIAF search result nodes using an xpath query
	results = tree.xpath("//*[local-name()='record']")

walkerdb / viaf_snippet_3.py

Last active August 29, 2015 14:25

	from bs4 import BeautifulSoup

	def get_lc_term_name(lc_auth_number):
	# create the LoC address by inserting the auth id into a template
	lc_template = "http://id.loc.gov/authorities/names/{0}.html"
	lc_address = lc_template.format(lc_auth_number)

	# get the html for that address
	response = urlopen(lc_address).read()

walkerdb / fuzzy_snippet_1.py

Last active August 29, 2015 14:26

	>>> from fuzzywuzzy import fuzz

	>>> bentley_term = "Emily Dickinson (1830-1886)"
	>>> lc_term = "Dickinson, Emily, 1830-1886"

	>>> fuzz.ratio(bentley_term, lc_term)
	70

	>>> fuzz.ratio("Clark Kent", "Superman")
	22

walkerdb / fuzzy_snippet_2.py

Last active August 29, 2015 14:26

	>>> fuzz.token_sort_ratio(bentley_term, lc_term)
	100

walkerdb / false_positive_check.py

Last active August 29, 2015 14:26

	def is_same_entity(local_term, lc_term, controlaccess_type):

	if "geogname" in controlaccess_type:
	# geognames are a simple check. Returns true if the
	# similarity is > 95; else false
	similarity = fuzz.token_sort_ratio(local_term, lc_term)
	return similarity > 95

	elif "corpname" in controlaccess_type:
	# replace some common abbreviations with their full forms

walkerdb / lxml_1.py

Created October 2, 2015 16:37

	# we're only going to use the etree module (short for "element tree")
	from lxml import etree

	tree = etree.parse("path/to/gargoyle.xml") # replace the path text with your own filesystem path

walkerdb / lxml_2.py

Created October 2, 2015 17:04

extents = tree.xpath("//extent")

walkerdb / lxml_3.py

Last active October 2, 2015 17:24

	# to find all unitid elements whose parent is a did tag:
	tree.xpath("//did/unitid")

	# using an absolute path to find exact locations:
	tree.xpath("/ead/archdesc/did/physdesc/extent")

	## if there are multiple "extent" tags in the parent physdesc,
	## you can find specific tags by designating an index
	## note: unlike most programming languages, xpath indexes start at 1, not zero

Walker Boyle walkerdb