Skip to content

Instantly share code, notes, and snippets.

@pmgreen
Created August 31, 2017 17:11
Show Gist options
  • Save pmgreen/8ed4146309607eec2196c5f972cde4ad to your computer and use it in GitHub Desktop.
Save pmgreen/8ed4146309607eec2196c5f972cde4ad to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""
Query dbpedia. Input is a csv of local ids and authorized names, e.g.:
...
96,"Abdelamir, Chowki"
9,"Abraham, Nicolas"
31,"Abraham, Nicolas"
...
Queries a local copy of LCNAF (Fuseki) for the heading's URI, then VIAF for the VIAF URI, then DBpedia (SPARQL) for various values.
Cobbled together for jwb's lmg presentation 20161118
from 20161115
pmg
"""
import csv
import os
import re
import requests
import sys
import unicodedata
from lxml import etree
from SPARQLWrapper import SPARQLWrapper, JSON
# Namespace map for the SPARQL XML results format returned by the endpoint.
_SPARQL_NS = {'sparql': 'http://www.w3.org/2005/sparql-results#'}


def _normalize_label(label):
    """Return ``label`` massaged into the form the heading lookups expect.

    Accepts a UTF-8 byte string (what Python 2's csv module yields) or a
    text string, and always returns text.
    """
    if isinstance(label, bytes):
        label = label.decode('utf8')
    label = label.strip()
    label = re.sub(r'\s+', ' ', label)
    # Escape double quotes so they cannot close the SPARQL string literal.
    # The replacement is the literal text \u005C\u0022 (what the original
    # Python 2 byte-string literal produced); the SPARQL side is expected to
    # resolve it to \" -- e.g. bib 568 "Problemna..." (heading with quotes).
    label = label.replace('"', '\\u005C\\u0022')
    # replace combined characters (id does this automatically)
    label = unicodedata.normalize('NFC', label)
    # Modify the Voyager heading variable.
    # TODO: do we want to change them in the record, if found, or flag them
    # as needing editing?
    label = re.sub(r'(\sb\.)([^\s])', r'\g<1> \g<2>', label)  # insert space after ' b.'
    label = re.sub(r'^([A-Z]\.)\s([A-Z]\.)', r'\g<1>\g<2>', label)  # remove ' ' between initials at start of string
    # NOTE(review): this also drops the surrounding parentheses, because the
    # replacement only re-emits the two initials -- confirm that is intended.
    label = re.sub(r'\(([A-Z]\.)\s([A-Z]\.)\)', r'\g<1>\g<2>', label)
    label = re.sub(r'(\sCo$)', r'\g<1>.', label)  # insert period after " Co"
    label = re.sub(r'(\s[A-Z]$)', r'\g<1>.', label)  # insert period after concluding initial ' A.'
    label = re.sub(r'\s(\,)', r'\g<1>', label)  # replace ' ,'
    label = re.sub(r'([a-z])(\()', r'\g<1> \g<2>', label)  # replace 'a(' with 'a ('
    label = label.replace('-\\L', '-L')  # bib 509574 Mulher-\Libertação
    # bib 584803 had ' \' at end of subject.  The original pattern '\\$' was
    # the regex \$ (a literal dollar sign), so the trailing backslash was
    # never removed; strip it together with the whitespace in front of it.
    label = re.sub(r'\s*\\$', '', label)
    return label


def _sparql_uris(host, query, xpth, label):
    """POST ``query`` to ``host`` + "sparql"; return the URIs ``xpth`` selects.

    Returns a non-empty list of URI strings, or None when nothing matched or
    the response body could not be parsed as XML.  Calls ``sys.exit`` with
    the label and response text when the HTTP status is not OK.
    """
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/sparql-results+xml'}
    r = requests.post(host + "sparql", data={'query': query}, headers=headers)
    if r.status_code != requests.codes.ok:
        sys.exit('%s, %s' % (label, r.text))
    try:
        # Parse the raw bytes: lxml rejects text strings that carry an XML
        # encoding declaration.
        doc = etree.fromstring(r.content)
    except (etree.XMLSyntaxError, ValueError):
        return None
    uris = [node.text for node in doc.xpath(xpth, namespaces=_SPARQL_NS)]
    return uris or None


def query_local(label, scheme, thesaurus, use_variants=False):
    """
    Queries lcnaf in Fuseki

    label -- heading to look up (UTF-8 bytes or text); it is normalized
        before being matched as an "@en" literal.
    scheme -- 'nam' (names endpoint) or 'sub' (subjects endpoint).
    thesaurus -- 1 keeps only URIs containing 'childrensSubjects'; any other
        value keeps only URIs that do not contain it.
    use_variants -- when true and the heading itself matches nothing, also
        look the label up as a MADS variantLabel.  Off by default: in the
        original code this lookup was unreachable.

    Returns (uris, 'local'), where uris is a list of URI strings or None when
    nothing was found, the label is empty after normalization, or the
    endpoint could not be reached.  Always returning a 2-tuple matters: the
    original fell through to an implicit None on any error, which broke the
    caller's tuple unpacking.

    Raises ValueError for an unknown scheme.  Exits via sys.exit when the
    endpoint answers with a non-OK HTTP status (the original bare except
    swallowed that exit).
    """
    src = 'local'
    # SPARQL endpoint(s), one for each scheme (names, subjects)
    hosts = {'nam': "http://localhost:3030/lcnaf2/",
             'sub': "http://localhost:3030/lcsaf/"}
    if scheme not in hosts:
        raise ValueError("unknown scheme %r (expected 'nam' or 'sub')" % (scheme,))
    host = hosts[scheme]

    label = _normalize_label(label)
    if not label:
        # Nothing to look up; skip the round trip to the endpoint.
        return None, src

    # query for notes as well, to eliminate headings that are to be used as
    # subdivisions (see e.g. 'Marriage')
    query = '''SELECT ?s ?match ?note
WHERE
{ ?s ?p "%s"@en .
OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#note> ?note .FILTER(CONTAINS(?note,"subdivision")) .}
OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#exactMatch> ?match}
}''' % label
    # query for variants
    variant_query = '''SELECT distinct ?s WHERE { {?s ?p ?bn . ?bn <http://www.loc.gov/mads/rdf/v1#variantLabel> "%s"@en . }}''' % label

    # Take ?s / ?match URIs, skipping bindings followed by a ?note binding in
    # the same result and the '(null)' placeholder.
    xpth = "//sparql:binding[@name='s' or @name='match'][not(following-sibling::sparql:binding[@name='note'])]/sparql:uri[. != '(null)']"
    if thesaurus == 1:
        xpth += "[contains(.,'childrensSubjects')]"
    else:
        xpth += "[not(contains(.,'childrensSubjects'))]"

    try:
        uris = _sparql_uris(host, query, xpth, label)
        if uris is None and use_variants:
            uris = _sparql_uris(host, variant_query, xpth, label)
    except requests.exceptions.RequestException as err:
        # Best effort, as before: report the problem and let the caller
        # carry on with "not found".
        sys.stderr.write("query_local problem for %r: %s\n" % (label, err))
        return None, src
    return uris, src
def viaf_from_idloc(lccn):
    """
    Query viaf for translation of id.loc uri to viafid

    lccn -- the identifier appended to "http://viaf.org/viaf/lccn/".

    Returns the URL of the last redirect response in the chain VIAF answers
    with, or '' when the request was not redirected at all (the original
    raised UnboundLocalError in that case).  Every hop's URL is printed.
    """
    resp = requests.get("http://viaf.org/viaf/lccn/" + lccn)
    viafid = ''
    # NOTE(review): the value kept is the URL of the last *redirect* response,
    # not resp.url (the final destination) -- presumably deliberate; confirm
    # against what VIAF currently returns.
    for hop in resp.history:
        viafid = hop.url
        print(viafid)
    return viafid
# Occupation / known-for query.  NOTE: kept for reference only -- the original
# code built this string on every call but never sent it.
_DBP_OCCUPATION_QUERY = """SELECT ?p (xsd:string(?ti)) as ?occupation (xsd:string(?l)) as ?occupation2 ?knownforlabel ?abstract
WHERE {
?p owl:sameAs <%s> .
OPTIONAL { ?p dbp:occupation ?occupation .
?occupation dbo:title ?ti .
FILTER(LANGMATCHES(LANG(?ti), "en"))}
OPTIONAL { ?p dbo:occupation ?occupation2 .
?occupation2 rdfs:label ?l .
FILTER(LANGMATCHES(LANG(?l), "en"))}
OPTIONAL { ?p dbo:knownFor ?knownFor.
?knownFor rdfs:label ?knownforlabel .
FILTER(LANGMATCHES(LANG(?knownforlabel),"en"))} .
OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
}"""

# The query that is actually executed: birth/death data, philosophical
# school and abstract.
# NOTE(review): the projection "xsd:string(?philos) as ?philos" is not the
# standard "(expr AS ?var)" form and no PREFIXes are declared; presumably the
# DBpedia endpoint tolerates both -- confirm.
_DBP_BIO_QUERY = """SELECT ?p xsd:string(?philos) as ?philos ?bdate ?bplacename ?ddate ?dplacename ?abstract
WHERE {
?p owl:sameAs <%s> .
?p dbo:birthDate ?bdate .
?p dbo:birthPlace ?bplace .
?bplace rdfs:label ?bplacename .
FILTER(LANGMATCHES(LANG(?bplacename),"en")).
OPTIONAL {?p dbo:deathDate ?ddate .
?p dbo:deathPlace ?dplace .
?dplace rdfs:label ?dplacename .
FILTER(LANGMATCHES(LANG(?dplacename),"en")) }.
OPTIONAL { ?p dbo:philosophicalSchool ?philo.
?philo rdfs:label ?philos .
FILTER(LANGMATCHES(LANG(?philos),"en"))} .
OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
}"""

# Result variables, in the order query_dbp returns them.
_DBP_FIELDS = ('p', 'occupation', 'occupation2', 'knownforlabel', 'abstract',
               'philos', 'bdate', 'bplacename', 'ddate', 'dplacename')


def _dbp_value(results, key):
    """Pull variable ``key`` out of a SPARQL JSON result set.

    Same selection rule as the original per-field loops: '' when there are
    no result rows, "not found" when the result set is malformed or any row
    lacks ``key``, otherwise the value from the last row.
    """
    value = ''
    try:
        for row in results['results']['bindings']:
            value = row[key]['value']
    except (KeyError, TypeError):
        return "not found"
    if sys.version_info[0] < 3 and not isinstance(value, str):
        # Python 2: hand back UTF-8 byte strings, which is what its csv
        # writer needs.  The original did this for only some fields.
        value = value.encode('utf-8')
    return value


def query_dbp(viafid):
    """
    Query dbpedia

    viafid -- VIAF URI to match against owl:sameAs.  An empty or None value
        short-circuits to ten empty strings without contacting the endpoint.

    Returns the 10-tuple (p, occupation, occupation2, knownfor, abstract,
    philos, bdate, bplace, ddate, dplace).  Each item is '' when the query
    returned no rows and "not found" when a row lacks that variable.  The
    executed query does not select occupation, occupation2 or knownforlabel,
    so those three are "not found" whenever any row comes back.

    Errors raised by SPARQLWrapper while querying propagate to the caller,
    as in the original.
    """
    if not viafid:
        return ('',) * len(_DBP_FIELDS)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(_DBP_BIO_QUERY % str(viafid))
    results = sparql.query().convert()
    return tuple(_dbp_value(results, key) for key in _DBP_FIELDS)
def _open_csv(path, write=False):
    """Open ``path`` in the mode the running interpreter's csv module expects."""
    if sys.version_info[0] < 3:
        # Python 2's csv module reads and writes byte streams.
        return open(path, 'wb+' if write else 'rb')
    # Python 3's csv module wants text streams with newline translation off.
    return open(path, 'w+' if write else 'r', newline='', encoding='utf-8')


def main(infile='../cataloger_review_20170601.csv',
         outfile='../deadicators_20170601.csv',
         full_output=False):
    """Look up every heading in ``infile`` and write the URIs to ``outfile``.

    The first csv is the input (local id, authorized name); the second is the
    output, which could be uploaded to Palladio.

    By default each output row is [name, id uri, viaf uri, local id].  With
    ``full_output`` the dbpedia values are fetched as well and each row is
    [name, p, id uri, viaf uri, occupation, occupation2, knownfor, abstract,
    philos, bdate, bplace, ddate, dplace].  dbpedia is only queried in that
    mode; the original queried it for every match and discarded the result.
    """
    with _open_csv(infile) as legit, _open_csv(outfile, write=True) as legit_w_uri:
        reader = csv.reader(legit, delimiter=',', quotechar='"')
        writer = csv.writer(legit_w_uri)  # one writer for the whole run
        for row in reader:
            if len(row) < 2:
                # Blank or truncated line: nothing to look up.
                continue
            localid, auth = row[0], row[1]
            iduri = ''
            viafid = ''
            details = ('',) * 10
            # Tolerate a bare None as well as a (uris, src) pair.
            found = query_local(auth, 'nam', 0)
            uris = found[0] if found else None
            if uris:
                iduri = uris[0]
                lccn = os.path.basename(os.path.normpath(iduri))
                viafid = viaf_from_idloc(lccn)
                print(auth, iduri, viafid, row[1])
                if full_output and viafid:
                    details = query_dbp(viafid)
            if full_output:
                writer.writerow([auth, details[0], iduri, viafid] + list(details[1:]))
            else:
                writer.writerow([auth, iduri, viafid, localid])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment