#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" | |
Query dbpedia. Input is csv which includes authorized names ... | |
... | |
96,"Abdelamir, Chowki" | |
9,"Abraham, Nicolas" | |
31,"Abraham, Nicolas" | |
... | |
Queries local copy of LCNAF, then viaf for viaf uri, then dbpedia for various values (sparql). | |
Cobbled together for jwb's lmg presentation 20161118 | |
from 20161115 | |
pmg | |
""" | |
import csv
import os
import re
import requests
import sys
import unicodedata

from lxml import etree
from SPARQLWrapper import SPARQLWrapper, JSON
def query_local(label, scheme, thesaurus):
    '''
    Queries lcnaf (or lcsaf) in Fuseki
    '''
    src = 'local'
    # SPARQL endpoint(s), one for each scheme (names, subjects)
    if scheme == 'nam':
        host = "http://localhost:3030/lcnaf2/"
    elif scheme == 'sub':
        host = "http://localhost:3030/lcsaf/"
    try:
        label = label.strip()
        label = re.sub('\s+', ' ', label)
        label = label.replace('"', '\\"')  # escape double quotes for sparql, e.g. bib 568 "Problemna..." (heading with double quotes)
        #label = label.replace('"', '\u0022')  # 4store
        # normalize combined characters (id.loc.gov does this automatically)
        label = unicodedata.normalize('NFC', label.decode('utf8'))
        # Modify the Voyager heading variable. TODO: do we want to change them in the record, if found, or flag as needing edit?
        ## label = re.sub('(\s[A-Z]\.)([A-Z]\.)', r'\g<1> \g<2>', label)  # insert space between initials
        label = re.sub('(\sb\.)([^\s])', r'\g<1> \g<2>', label)  # insert space after ' b.'
        label = re.sub('^([A-Z]\.)\s([A-Z]\.)', r'\g<1>\g<2>', label)  # remove ' ' between initials at start of string
        label = re.sub('\(([A-Z]\.)\s([A-Z]\.)\)', r'(\g<1>\g<2>)', label)  # remove ' ' between initials in parens
        label = re.sub('(\sCo$)', r'\g<1>.', label)  # insert period after " Co"
        label = re.sub('(\s[A-Z]$)', r'\g<1>.', label)  # insert period after concluding initial ' A.'
        label = re.sub('\s(\,)', r'\g<1>', label)  # replace ' ,' with ','
        label = re.sub('([a-z])(\()', r'\g<1> \g<2>', label)  # replace 'a(' with 'a ('
        label = label.replace('-\L', '-L')  # replace '\L', e.g. 509574 Mulher-\Libertação
        label = re.sub(r'\\$', '', label)  # remove trailing backslash, e.g. bib 584803 had ' \' at end of subject
        # query for notes as well, to eliminate headings that are to be used as subdivisions (see e.g. 'Marriage')
        query = '''SELECT ?s ?match ?note
        WHERE
        { ?s ?p "%s"@en .
          OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#note> ?note . FILTER(CONTAINS(?note,"subdivision")) .}
          OPTIONAL {?s <http://www.w3.org/2004/02/skos/core#exactMatch> ?match}
        }''' % label
        # query for variants
        variant_query = '''SELECT distinct ?s WHERE { {?s ?p ?bn . ?bn <http://www.loc.gov/mads/rdf/v1#variantLabel> "%s"@en . }}''' % label
        data = {'query': query}
        headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/sparql-results+xml'}
        #print(query)
        r = requests.post(host + "sparql", data=data, headers=headers)
        if r.status_code != requests.codes.ok:
            msg = '%s, %s' % (label, r.text)
            sys.exit(msg)
        try:
            doc = etree.fromstring(r.text)
        except:
            return None, src
        xpth = "//sparql:binding[@name='s' or @name='match'][not(following-sibling::sparql:binding[@name='note'])]/sparql:uri[. != '(null)']"
        if thesaurus == 1:
            xpth += "[contains(.,'childrensSubjects')]"
        else:
            xpth += "[not(contains(.,'childrensSubjects'))]"
        #print(r.text)
        triples = []
        if len(doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'})) > 0:
            for triple in doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'}):
                triples.append(triple.text)
            return triples, src
        # no hit on the authorized form: fall through to the variant-label query below
        data = {'query': variant_query}
        r = requests.post(host + "sparql/", data=data, headers=headers)
        if r.status_code != requests.codes.ok:
            msg = '%s, %s' % (label, r.text)
            sys.exit(msg)
        try:
            doc = etree.fromstring(r.text)
            if len(doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'})) > 0:
                for triple in doc.xpath(xpth, namespaces={'sparql': 'http://www.w3.org/2005/sparql-results#'}):
                    #print("----", triple.text)
                    return [triple.text], src  # return a list so callers can treat authorized and variant hits alike
            else:
                return None, src
        except:
            return None, src
    except:
        etype, evalue, etraceback = sys.exc_info()
        #print("query_local problem %s %s %s line: %s" % (etype, evalue, etraceback, etraceback.tb_lineno))
        return None, src
def viaf_from_idloc(lccn):
    """
    Query viaf to translate an id.loc.gov lccn into a viaf uri
    """
    resp = requests.get("http://viaf.org/viaf/lccn/" + lccn)
    for r in resp.history:
        print(r.url)  # intermediate redirect hop(s), for debugging
    viafid = resp.url  # final url after viaf's redirect(s), i.e. the viaf cluster uri
    return viafid
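# Illustrative only (uris are schematic, not captured responses): viaf resolves
#   http://viaf.org/viaf/lccn/<lccn>
# via http redirect(s) to a cluster uri of the form
#   http://viaf.org/viaf/<viafid>
# which is what query_dbp() matches against dbpedia's owl:sameAs links.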
def query_dbp(viafid):
    """
    Query dbpedia
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    occupation = ''
    occupation2 = ''
    abstract = ''
    knownfor = ''
    philos = ''
    bdate = ''
    bplace = ''
    ddate = ''
    dplace = ''
    p = ''
    viafid = str(viafid)
    #=====================================
    # SPARQL queries
    #=====================================
    # ?p a <http://dbpedia.org/ontology/Person> .
    # occupation/knownFor query; kept for reference but not executed below, so
    # occupation, occupation2 and knownfor stay "not found" unless it is run instead
    query = """SELECT ?p (xsd:string(?ti)) as ?occupation (xsd:string(?l)) as ?occupation2 ?knownforlabel ?abstract
    WHERE {
        ?p owl:sameAs <%s> .
        OPTIONAL { ?p dbp:occupation ?occupation .
                   ?occupation dbo:title ?ti .
                   FILTER(LANGMATCHES(LANG(?ti), "en")) }
        OPTIONAL { ?p dbo:occupation ?occupation2 .
                   ?occupation2 rdfs:label ?l .
                   FILTER(LANGMATCHES(LANG(?l), "en")) }
        OPTIONAL { ?p dbo:knownFor ?knownFor .
                   ?knownFor rdfs:label ?knownforlabel .
                   FILTER(LANGMATCHES(LANG(?knownforlabel), "en")) } .
        OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
    }""" % viafid
    query_philos = """SELECT ?p xsd:string(?philos) as ?philos ?bdate ?bplacename ?ddate ?dplacename ?abstract
    WHERE {
        ?p owl:sameAs <%s> .
        ?p dbo:birthDate ?bdate .
        ?p dbo:birthPlace ?bplace .
        ?bplace rdfs:label ?bplacename .
        FILTER(LANGMATCHES(LANG(?bplacename), "en")) .
        OPTIONAL { ?p dbo:deathDate ?ddate .
                   ?p dbo:deathPlace ?dplace .
                   ?dplace rdfs:label ?dplacename .
                   FILTER(LANGMATCHES(LANG(?dplacename), "en")) } .
        OPTIONAL { ?p dbo:philosophicalSchool ?philo .
                   ?philo rdfs:label ?philos .
                   FILTER(LANGMATCHES(LANG(?philos), "en")) } .
        OPTIONAL { ?p dbo:abstract ?abstract . FILTER(LANGMATCHES(LANG(?abstract), "en")) } .
    }""" % viafid
    #print(query_philos)
    sparql.setQuery(query_philos)  # only query_philos is executed
    results = sparql.query().convert()
    #=====================================
    # get values from returned triples
    #=====================================
    try:
        for result in results['results']['bindings']:
            p = result['p']['value']
            p = p.encode('utf-8')
    except:
        p = "not found"
    try:
        for result in results['results']['bindings']:
            occupation = result['occupation']['value']
    except:
        occupation = "not found"
    try:
        for result in results['results']['bindings']:
            occupation2 = result['occupation2']['value']
    except:
        occupation2 = "not found"
    try:
        for result in results['results']['bindings']:
            knownfor = result['knownforlabel']['value']
    except:
        knownfor = "not found"
    try:
        for result in results['results']['bindings']:
            abstract = result['abstract']['value']
            abstract = abstract.encode('utf-8')
    except:
        abstract = "not found"
    try:
        for result in results['results']['bindings']:
            philos = result['philos']['value']
    except:
        philos = "not found"
    try:
        for result in results['results']['bindings']:
            bdate = result['bdate']['value']
    except:
        bdate = "not found"
    try:
        for result in results['results']['bindings']:
            bplace = result['bplacename']['value']
            bplace = bplace.encode('utf-8')
            #print(bplace)
    except:
        bplace = "not found"
    try:
        for result in results['results']['bindings']:
            ddate = result['ddate']['value']
    except:
        ddate = "not found"
    try:
        for result in results['results']['bindings']:
            dplace = result['dplacename']['value']
            dplace = dplace.encode('utf-8')
            #print(dplace)
    except:
        dplace = "not found"
    return p, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace
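# For reference, SPARQLWrapper's JSON results are shaped roughly like this
# (schematic, not captured output); each try/except above reads ['value'] and
# falls back to "not found" when the variable is missing from a binding:
#   {"head": {"vars": ["p", "philos", "bdate", "bplacename", ...]},
#    "results": {"bindings": [
#        {"p": {"type": "uri", "value": "http://dbpedia.org/resource/..."},
#         "bdate": {"type": "literal", "value": "..."}}]}}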
if __name__ == "__main__":
    # The first csv is the input; the second is the output, which could be uploaded to Palladio
    with open('../cataloger_review_20170601.csv', 'rb') as legit, open('../deadicators_20170601.csv', 'wb+') as legit_w_uri:
        reader = csv.reader(legit, delimiter=',', quotechar='"')
        writer = csv.writer(legit_w_uri)
        for row in reader:
            localid = row[0]
            auth = row[1]
            iduri = ''
            viafid = ''
            #print('%s' % auth)
            uris, src = query_local(auth, 'nam', 0)
            if uris:
                iduri = uris[0]
                lccn = os.path.basename(os.path.normpath(uris[0]))
                viafid = viaf_from_idloc(lccn)
                print(auth, uris[0], viafid, row[1])
                p, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace = query_dbp(viafid)
                #print(p, occupation, occupation2, knownfor, abstract, philos)
            writer.writerow([auth, iduri, viafid, localid])
            #writer.writerow([auth, p, iduri, viafid, occupation, occupation2, knownfor, abstract, philos, bdate, bplace, ddate, dplace])
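# Output rows in deadicators_*.csv are, schematically (uris elided):
#   "Abraham, Nicolas",http://id.loc.gov/authorities/names/n...,http://viaf.org/viaf/...,9
# i.e. auth, id.loc.gov uri, viaf uri, local id; the uri columns are left empty
# when query_local() finds no match.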