Convert BNB from XML files into BibJSON files
# Used to convert the data at http://thedatahub.org/dataset/jiscopenbib-bl_bnb-1
# to a JSON format suitable for importing into BibServer.
# NOTE - there will be an error in the output files (see the note after this
# script). I noticed this after running, so I used another script to fix it -
# see the attached file.
# Also, the final file on this gist shows how to upload to an elasticsearch
# index; I made some further changes to the JSON in that file before indexing it.
# These bits and pieces could be combined into one file and run without the
# intermediate writes to disk, but that was fine for the way I was doing this
# (intermittently).
# This is a conversion of https://gist.github.com/1634031 that reads from and
# writes to files. Put the BNB data in a folder called 'xml', then run this;
# the files will be converted to BibJSON in the 'json' folder.
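#
# For orientation, a single rdf:Description in the source looks roughly like
# the hypothetical, heavily trimmed snippet below (element names taken from the
# tags matched in the code; the values are made up):
#
#   <rdf:Description rdf:about="http://example.org/resource/1">
#     <dcterms:title>An example title</dcterms:title>
#     <dcterms:contributor>Smith, John, 1900-1980</dcterms:contributor>
#     <dcterms:identifier>URN:ISBN:9780000000000</dcterms:identifier>
#   </rdf:Description>
#
# The loop below flattens each such element into a BibJSON record along the
# lines of:
#
#   {"title": "An example title",
#    "author": [{"name": "Smith, John, 1900-1980", "birth": "1900", "death": "1980"}],
#    "identifier": [{"id": "URN:ISBN:9780000000000", "type": "ISBN"}]}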
import os, sys, json, re
from xml.etree.ElementTree import iterparse

targetdir = 'json/'
storedir = 'xml/'
dirList = os.listdir(storedir)
count = 0
for filename in dirList:
    count += 1
    print 'conversion ' + str(count) + ' - doing file ' + filename
    outfile = open(targetdir + str(count) + '.json', 'w')
    source = storedir + filename
    # stream-parse so large input files never have to fit in memory
    context = iter(iterparse(source, events=("start", "end")))
    event, root = context.next()
    idx = 0
    desc_buf = []            # stack tracking nested rdf:Description elements
    chunk = {}               # the record currently being assembled
    text_collect = []
    collecting_text = False
    subject_in_scheme = None
    COLL_TYPES = ('type', 'contributor', 'publisher', 'issued', 'coverage',
                  'language', 'extent', 'description', 'subject', 'identifier')
    COMP_TYPES = {}
    outfile.write('[')
    for event, elem in context:
        root.clear()
        if event == 'start':
            if elem.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description':
                desc_buf.append(True)
            if elem.tag == '{http://purl.org/dc/terms/}title':
                chunk['title'] = [elem.text]
            for t in COLL_TYPES:
                if elem.tag == '{http://purl.org/dc/terms/}%s' % t:
                    collecting_text = True
            if collecting_text and elem.text:
                t = elem.text.strip()
                if t: text_collect.append(t)
        if event == 'end':
            if elem.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description':
                x = desc_buf.pop()
            if elem.tag == '{http://www.w3.org/2004/02/skos/core#}inScheme':
                tmp = elem.attrib.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource")
                if tmp: subject_in_scheme = tmp
            for t in COLL_TYPES:
                if elem.tag == '{http://purl.org/dc/terms/}%s' % t:
                    if subject_in_scheme:
                        chunk.setdefault(t, []).append({'about': ' '.join(text_collect), 'type': subject_in_scheme})
                    else:
                        chunk.setdefault(t, []).append(' '.join(text_collect))
                    collecting_text, text_collect, subject_in_scheme = False, [], None
            if not desc_buf:
                # the outermost rdf:Description has closed - build and write the record
                idx += 1
                new_chunk = {}
                for k, v in chunk.items():
                    if k == 'identifier':
                        tmp = []
                        for vv in v:
                            if vv.startswith('GB'):
                                tmp.append({'id': vv, 'type': 'BNB'})
                            elif vv.startswith('URN:ISBN:'):
                                tmp.append({'id': vv, 'type': 'ISBN'})
                            else:
                                tmp.append({'id': vv})
                        new_chunk[k] = tmp
                    elif k == 'contributor':
                        # check for birth/death dates in the string, e.g. "Smith, John, 1900-1980"
                        for vv in v:
                            birth = death = None
                            bd_re = re.search(r'(\d\d\d\d)-(\d\d\d\d)', vv)
                            if bd_re:
                                birth, death = bd_re.groups()
                            else:
                                bd_re = re.search(r'(\d\d\d\d)', vv)
                                if bd_re:
                                    birth = bd_re.groups()[0]
                            tmp = {'name': vv}
                            if birth: tmp['birth'] = birth
                            if death: tmp['death'] = death
                            new_chunk.setdefault('author', []).append(tmp)
                    elif k == 'subject':
                        tmp = []
                        for vv in v:
                            if type(vv) is dict:
                                tmp.append(vv)
                            else:
                                tmp.append({'about': vv})
                        new_chunk[k] = tmp
                    elif k not in ('subject', 'coverage', 'type'):
                        # everything else collapses to a single newline-joined string
                        new_chunk[k] = '\n'.join(x for x in v if x)
                    else:
                        new_chunk[k] = v
                    if len(v) > 1:
                        COMP_TYPES[k] = True
                outfile.write(json.dumps(new_chunk, indent=2))
                outfile.write(',')
                chunk = {}
    outfile.write(']')
    outfile.close()
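# The error mentioned at the top: a ',' is written after every record, so each
# output file ends with a trailing comma before the closing ']' (a tail looking
# something like "...},]"), which is not valid JSON. The last script on this
# gist repairs the files by dropping that final line and re-closing the array.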
# takes files converted from BNB to bibjson and sends them to an elasticsearch index
# this also performs some transforms before sending the data
# remember to set your ES params correctly
# this version uses ijson, so: pip install ijson, and apt-get install yajl-tools
import os, json, ijson, httplib, uuid

es_url = "localhost:9200"
es_path = "/bibserver_bnb/record"
storedir = 'json/'
dirList = os.listdir(storedir)
count = 0
for filename in dirList:
    #if count > 0: break
    count += 1
    print 'round ' + str(count) + ' doing file ' + filename
    infile = open(storedir + filename, 'r')
    # ijson streams one array item at a time, so whole files never sit in memory
    for record in ijson.items(infile, 'item'):
        if 'issued' in record:
            record['year'] = record['issued']
            del record['issued']
        if 'description' in record:
            record['description'] = record['description'].replace('\n', ' ')
        if 'identifier' in record:
            for index, val in enumerate(record['identifier']):
                if val.get('type', '') == 'ISBN':
                    #record['identifier'][index]['type'] = 'isbn'
                    # strip() removes a *set of characters*, not a prefix, so use
                    # replace() to drop the URN:ISBN: prefix safely
                    record['identifier'][index]['id'] = val['id'].replace('URN:ISBN:', '')
                if val.get('type', '') == 'BNB':
                    record['cid'] = val['id']
        if record:
            if 'collection' not in record:
                record['collection'] = 'bnb'
            if 'owner' not in record:
                record['owner'] = 'bibsoup'
            record['id'] = uuid.uuid4().hex
            c = httplib.HTTPConnection(es_url)
            c.request('PUT', es_path + '/' + record['id'], json.dumps(record))
            result = c.getresponse()
            if result.status != 201:
                print result.status, result.reason, record
    infile.close()
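# Optional sanity check after a run - a sketch, assuming the same es_url and
# es_path settings as above: ask elasticsearch how many records the index now
# holds via its _count endpoint.
c = httplib.HTTPConnection(es_url)
c.request('GET', es_path + '/_count')
print 'records indexed:', json.loads(c.getresponse().read())['count']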
# fixes the JSON output of bnb2bibjson.py
# NOTE: this was done inefficiently to fix the problem after the fact...
import os

targetdir = 'json2/'
storedir = 'json/'
dirList = os.listdir(storedir)
count = 0
for filename in dirList:
    count += 1
    print str(count)
    infile = open(storedir + filename, 'r')
    lines = infile.readlines()
    infile.close()
    outfile = open(targetdir + filename, 'w')
    # drop the final line (which holds the stray trailing comma) and close the
    # array properly
    outfile.writelines(lines[:-1])
    outfile.write('}]')
    outfile.close()
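# Note: the elasticsearch uploader above reads from 'json/', while this script
# writes the repaired files to 'json2/', so point the uploader's storedir at
# 'json2/' (or move the fixed files back) before indexing.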