#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" | |
Query dbpedia. Input is csv which includes authorized names ... | |
... | |
96,"Abdelamir, Chowki" | |
9,"Abraham, Nicolas" | |
31,"Abraham, Nicolas" | |
... | |
Queries local copy of LCNAF, then viaf for viaf uri, then dbpedia for various values (sparql). | |
Cobbled together for jwb's lmg presentation 20161118 | |
from 20161115 | |
pmg | |
""" | |
import csv
import os
import re
import requests
import sys
import unicodedata

from lxml import etree
from SPARQLWrapper import SPARQLWrapper, JSON
def query_local(label, scheme, thesaurus):
    '''
    Queries lcnaf (or lcsaf) in Fuseki
    '''
    src = 'local'
    # SPARQL endpoint(s), one for each scheme (names, subjects)
    if scheme == 'nam':
        host = "http://localhost:3030/lcnaf2/"
    elif scheme == 'sub':
        host = "http://localhost:3030/lcsaf/"
    try:
        label = label.strip()
        label = re.sub('\s+', ' ', label)
        label = label.replace('"', '\\"')  # escape double quotes for sparql, e.g. bib 568 "Problemna..." (heading with double quotes)
        #label = label.replace('"', '\u0022')  # 4store
        # normalize combined characters (id.loc.gov does this automatically)
        label = unicodedata.normalize('NFC', label.decode('utf8'))
        # Modify the Voyager heading variable. TODO: do we want to change them in the record, if found, or flag as needing edit?
        ## label = re.sub('(\s[A-Z]\.)([A-Z]\.)', r'\g<1> \g<2>', label)  # insert space between initials
        label = re.sub('(\sb\.)([^\s])', r'\g<1> \g<2>', label)  # insert space after ' b.'
        label = re.sub('^([A-Z]\.)\s([A-Z]\.)', r'\g<1>\g<2>', label)  # remove ' ' between initials at start of string
        label = re.sub('\(([A-Z]\.)\s([A-Z]\.)\)', r'(\g<1>\g<2>)', label)  # remove ' ' between initials in parens
        label = re.sub('(\sCo$)', r'\g<1>.', label)  # insert period after " Co"
        label = re.sub('(\s[A-Z]$)', r'\g<1>.', label)  # insert period after concluding initial ' A.'
        label = re.sub('\s(\,)', r'\g<1>', label)  # replace ' ,' with ','
        label = re.sub('([a-z])(\()', r'\g<1> \g<2>', label)  # replace 'a(' with 'a ('
        label = label.replace('-\L', '-L')  # replace '\L', e.g. 509574 Mulher-\Libertação
        label = re.sub(r'\\$', '', label)  # remove trailing backslash, e.g. bib 584803 had ' \' at end of subject
        # query for notes as well, to eliminate headings that are to be used as subdivisions (see e.g. 'Marriage')
        query = '''SELECT ?s ?match ?note
        WHERE
        { ?s ?p "%s"@en .
          OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#note> ?note . FILTER(CONTAINS(?note,"subdivision")) .}
          OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#exactMatch> ?match}
        }''' % label
        # query for variants
        variant_query = '''SELECT distinct ?s WHERE { {?s ?p ?bn . ?bn <http://www.loc.gov/mads/rdf/v1#variantLabel> "%s"@en . }}''' % label
        data = {'query': query}
        headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/sparql-results+xml'}
        #print(query)
        r = requests.post(host + "sparql", data=data, headers=headers)
        if r.status_code != requests.codes.ok:
            msg = '%s, %s' % (label, r.text)
            sys.exit(msg)
        try:
            doc = etree.fromstring(r.text)
        except:
            return None, src
        xpth = "//sparql:binding[@name='s' or @name='match'][not(following-sibling::sparql:binding[@name='note'])]/sparql:uri[. != '(null)']"
        if thesaurus == 1:
            xpth += "[contains(.,'childrensSubjects')]"
        else:
            xpth += "[not(contains(.,'childrensSubjects'))]"
        #print(r.text)
        triples = []
        if len(doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'})) > 0:
            for triple in doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'}):
                triples.append(triple.text)
            return triples, src
        # no hit on the authorized form: fall through to the variant-label query below
        data = {'query': variant_query}
        r = requests.post(host + "sparql/", data=data, headers=headers)
        if r.status_code != requests.codes.ok:
            msg = '%s, %s' % (label, r.text)
            sys.exit(msg)
        try:
            doc = etree.fromstring(r.text)
            if len(doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'})) > 0:
                for triple in doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'}):
                    #print("----", triple.text)
                    return [triple.text], src  # return a list so callers can treat authorized and variant hits alike
            else:
                return None, src
        except:
            return None, src
    except:
        etype, evalue, etraceback = sys.exc_info()
        #print("query_local problem %s %s %s line: %s" % (etype, evalue, etraceback, etraceback.tb_lineno))
        return None, src
def viaf_from_idloc(lccn):
    """
    Query viaf to translate an id.loc.gov lccn into a viaf uri
    """
    resp = requests.get("http://viaf.org/viaf/lccn/" + lccn)
    for r in resp.history:
        print(r.url)  # intermediate redirect hop(s), for debugging
    viafid = resp.url  # final url after viaf's redirect(s), i.e. the viaf cluster uri
    return viafid
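# Illustrative only (uris are schematic, not captured responses): viaf resolves
#   http://viaf.org/viaf/lccn/<lccn>
# via http redirect(s) to a cluster uri of the form
#   http://viaf.org/viaf/<viafid>
# which is what query_dbp() matches against dbpedia's owl:sameAs links.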
def query_dbp(viafid):
    """
    Query dbpedia
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    occupation = ''
    occupation2 = ''
    abstract = ''
    knownfor = ''
    philos = ''
    bdate = ''
    bplace = ''
    ddate = ''
    dplace = ''
    p = ''
    viafid = str(viafid)
    #=====================================
    # SPARQL queries
    #=====================================
    # ?p a <http://dbpedia.org/ontology/Person> .
    # occupation/knownFor query; kept for reference but not executed below, so
    # occupation, occupation2 and knownfor stay "not found" unless it is run instead
    query = """SELECT ?p (xsd:string(?ti)) as ?occupation (xsd:string(?l)) as ?occupation2 ?knownforlabel ?abstract
    WHERE {
        ?p owl:sameAs <%s> .
        OPTIONAL { ?p dbp:occupation ?occupation .
                   ?occupation dbo:title ?ti .
                   FILTER(LANGMATCHES(LANG(?ti), "en")) }
        OPTIONAL { ?p dbo:occupation ?occupation2 .
                   ?occupation2 rdfs:label ?l .
                   FILTER(LANGMATCHES(LANG(?l), "en")) }
        OPTIONAL { ?p dbo:knownFor ?knownFor .
                   ?knownFor rdfs:label ?knownforlabel .
                   FILTER(LANGMATCHES(LANG(?knownforlabel), "en")) } .
        OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
    }""" % viafid
    query_philos = """SELECT ?p xsd:string(?philos) as ?philos ?bdate ?bplacename ?ddate ?dplacename ?abstract
    WHERE {
        ?p owl:sameAs <%s> .
        ?p dbo:birthDate ?bdate .
        ?p dbo:birthPlace ?bplace .
        ?bplace rdfs:label ?bplacename .
        FILTER(LANGMATCHES(LANG(?bplacename), "en")) .
        OPTIONAL { ?p dbo:deathDate ?ddate .
                   ?p dbo:deathPlace ?dplace .
                   ?dplace rdfs:label ?dplacename .
                   FILTER(LANGMATCHES(LANG(?dplacename), "en")) } .
        OPTIONAL { ?p dbo:philosophicalSchool ?philo .
                   ?philo rdfs:label ?philos .
                   FILTER(LANGMATCHES(LANG(?philos), "en")) } .
        OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
    }""" % viafid
    #print(query_philos)
    sparql.setQuery(query_philos)  # only query_philos is executed
    results = sparql.query().convert()
    #=====================================
    # get values from returned triples
    #=====================================
    try:
        for result in results['results']['bindings']:
            p = result['p']['value']
            p = p.encode('utf-8')
    except:
        p = "not found"
    try:
        for result in results['results']['bindings']:
            occupation = result['occupation']['value']
    except:
        occupation = "not found"
    try:
        for result in results['results']['bindings']:
            occupation2 = result['occupation2']['value']
    except:
        occupation2 = "not found"
    try:
        for result in results['results']['bindings']:
            knownfor = result['knownforlabel']['value']
    except:
        knownfor = "not found"
    try:
        for result in results['results']['bindings']:
            abstract = result['abstract']['value']
            abstract = abstract.encode('utf-8')
    except:
        abstract = "not found"
    try:
        for result in results['results']['bindings']:
            philos = result['philos']['value']
    except:
        philos = "not found"
    try:
        for result in results['results']['bindings']:
            bdate = result['bdate']['value']
    except:
        bdate = "not found"
    try:
        for result in results['results']['bindings']:
            bplace = result['bplacename']['value']
            bplace = bplace.encode('utf-8')
            #print(bplace)
    except:
        bplace = "not found"
    try:
        for result in results['results']['bindings']:
            ddate = result['ddate']['value']
    except:
        ddate = "not found"
    try:
        for result in results['results']['bindings']:
            dplace = result['dplacename']['value']
            dplace = dplace.encode('utf-8')
            #print(dplace)
    except:
        dplace = "not found"
    return p, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace
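# For reference, SPARQLWrapper's JSON results are shaped roughly like this
# (schematic, not captured output); each try/except above reads ['value'] and
# falls back to "not found" when the variable is missing from a binding:
#   {"head": {"vars": ["p", "philos", "bdate", "bplacename", ...]},
#    "results": {"bindings": [
#        {"p": {"type": "uri", "value": "http://dbpedia.org/resource/..."},
#         "bdate": {"type": "literal", "value": "..."}}]}}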
if __name__ == "__main__":
    # The first csv is the input; the second is the output, which could be uploaded to Palladio
    with open('../cataloger_review_20170601.csv', 'rb') as legit, open('../deadicators_20170601.csv', 'wb+') as legit_w_uri:
        reader = csv.reader(legit, delimiter=',', quotechar='"')
        writer = csv.writer(legit_w_uri)
        for row in reader:
            localid = row[0]
            auth = row[1]
            iduri = ''
            viafid = ''
            #print('%s' % auth)
            uris, src = query_local(auth, 'nam', 0)
            if uris:
                iduri = uris[0]
                lccn = os.path.basename(os.path.normpath(uris[0]))
                viafid = viaf_from_idloc(lccn)
                print(auth, uris[0], viafid, row[1])
                p, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace = query_dbp(viafid)
                #print(p, occupation, occupation2, knownfor, abstract, philos)
            writer.writerow([auth, iduri, viafid, localid])
            #writer.writerow([auth, p, iduri, viafid, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace])
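# Output rows in deadicators_*.csv are, schematically (uris elided):
#   "Abraham, Nicolas",http://id.loc.gov/authorities/names/n...,http://viaf.org/viaf/...,9
# i.e. auth, id.loc.gov uri, viaf uri, local id; the uri columns are left empty
# when query_local() finds no match.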