Created February 3, 2012 18:24
Convert BNB XML files into BibJSON files
# Used to convert the data at http://thedatahub.org/dataset/jiscopenbib-bl_bnb-1
# into a JSON format suitable for importing into BibServer.
# NOTE - there is an error in the output files (a trailing comma before the closing
# bracket). I noticed this after running, so used another script to fix it - see the
# attached file, and the sketch after this script for how to avoid it altogether.
# The final file in this gist shows how to upload the output to an Elasticsearch index;
# it also makes some changes to the JSON before indexing it.
# These bits and pieces could be combined into one file without the intermediate writes
# to disk, but this worked fine for the way I was running it (intermittently).
# This is a conversion of https://gist.github.com/1634031 that reads from and writes to files.
# Put the BNB data in a folder called xml, then run this script;
# the files will be converted to BibJSON in the json folder.
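# For orientation, a converted record comes out of this script roughly like the
# following (illustrative values only, not real BNB data):
#
# {
#   "title": "An example title",
#   "author": [{"name": "Surname, Forename, 1901-1985", "birth": "1901", "death": "1985"}],
#   "publisher": "Example Publisher",
#   "issued": "2001",
#   "identifier": [{"id": "GB0123456", "type": "BNB"},
#                  {"id": "URN:ISBN:9780000000000", "type": "ISBN"}],
#   "subject": [{"about": "Example subject", "type": "http://example.org/some-scheme"}]
# }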
import os, sys, json, re
from xml.etree.ElementTree import iterparse

targetdir = 'json/'
storedir = 'xml/'
dirList = os.listdir(storedir)
count = 0

for filename in dirList:
    count += 1
    print 'conversion ' + str(count) + ' - doing file ' + filename
    outfile = open(targetdir + str(count) + '.json', 'w')
    source = storedir + filename
    context = iter(iterparse(source, events=("start", "end")))
    event, root = context.next()
    idx = 0
    desc_buf = []
    chunk = {}
    text_collect = []
    collecting_text = False
    subject_in_scheme = None
    COLL_TYPES = ('type', 'contributor', 'publisher', 'issued', 'coverage',
                  'language', 'extent', 'description', 'subject', 'identifier')
    COMP_TYPES = {}
    outfile.write('[')
    for event, elem in context:
        root.clear()
        if event == 'start':
            if elem.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description':
                desc_buf.append(True)
            if elem.tag == '{http://purl.org/dc/terms/}title':
                chunk['title'] = [elem.text]
            for t in COLL_TYPES:
                if elem.tag == '{http://purl.org/dc/terms/}%s' % t:
                    collecting_text = True
            if collecting_text and elem.text:
                t = elem.text.strip()
                if t: text_collect.append(t)
        if event == 'end':
            if elem.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description':
                x = desc_buf.pop()
            if elem.tag == '{http://www.w3.org/2004/02/skos/core#}inScheme':
                tmp = elem.attrib.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource")
                if tmp: subject_in_scheme = tmp
            for t in COLL_TYPES:
                if elem.tag == '{http://purl.org/dc/terms/}%s' % t:
                    if subject_in_scheme:
                        chunk.setdefault(t, []).append({'about': ' '.join(text_collect), 'type': subject_in_scheme})
                    else:
                        chunk.setdefault(t, []).append(' '.join(text_collect))
                    collecting_text, text_collect, subject_in_scheme = False, [], None
            if not desc_buf:
                idx += 1
                new_chunk = {}
                for k, v in chunk.items():
                    if k == 'identifier':
                        tmp = []
                        for vv in v:
                            if vv.startswith('GB'):
                                tmp.append({'id': vv, 'type': 'BNB'})
                            elif vv.startswith('URN:ISBN:'):
                                tmp.append({'id': vv, 'type': 'ISBN'})
                            else:
                                tmp.append({'id': vv})
                        new_chunk[k] = tmp
                    elif k == 'contributor':
                        # Check for birth/death dates in the string
                        for vv in v:
                            birth = death = None
                            bd_re = re.search('(\d\d\d\d)-(\d\d\d\d)', vv)
                            if bd_re:
                                birth, death = bd_re.groups()
                            else:
                                bd_re = re.search('(\d\d\d\d)', vv)
                                if bd_re:
                                    birth = bd_re.groups()[0]
                            tmp = {'name': vv}
                            if birth: tmp['birth'] = birth
                            if death: tmp['death'] = death
                            new_chunk.setdefault('author', []).append(tmp)
                    elif k == 'subject':
                        tmp = []
                        for vv in v:
                            if type(vv) is dict:
                                tmp.append(vv)
                            else:
                                tmp.append({'about': vv})
                        new_chunk[k] = tmp
                    elif k not in ('subject', 'coverage', 'type'):
                        new_chunk[k] = '\n'.join(x for x in v if x)
                    else:
                        new_chunk[k] = v
                    if len(v) > 1:
                        COMP_TYPES[k] = True
                outfile.write(json.dumps(new_chunk, indent=2))
                outfile.write(',')
                chunk = {}
    outfile.write(']')
    outfile.close()
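The error mentioned at the top of this script comes from the comma written after every record, including the last one, which leaves an invalid `},]` before the closing bracket. A minimal sketch of a writer that avoids it, assuming `records` is a list of the `new_chunk` dicts built above (the function and variable names here are illustrative):

import json

def write_records(records, path):
    # Write a valid JSON array: the separator goes before every record except the first,
    # so no trailing comma is left before the closing bracket.
    outfile = open(path, 'w')
    outfile.write('[')
    for i, record in enumerate(records):
        if i:
            outfile.write(',')
        outfile.write(json.dumps(record, indent=2))
    outfile.write(']')
    outfile.close()

# e.g. write_records(converted_chunks, 'json/1.json')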
# Takes the files converted from BNB to BibJSON and sends them to an Elasticsearch index.
# It also performs some transforms on each record before sending the data.
# Remember to set your ES params (es_url, es_path) correctly.
# This version streams records with ijson, so pip install ijson and apt-get install yajl-tools.
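# Roughly what the transforms below do to a record (illustrative values, not real BNB data):
#   {"issued": "2001", "description": "line one\nline two",
#    "identifier": [{"id": "URN:ISBN:9780000000000", "type": "ISBN"}, {"id": "GB0123456", "type": "BNB"}]}
# becomes
#   {"year": "2001", "description": "line one line two",
#    "identifier": [{"id": "9780000000000", "type": "ISBN"}, {"id": "GB0123456", "type": "BNB"}],
#    "cid": "GB0123456", "collection": "bnb", "owner": "bibsoup", "id": "<generated uuid>"}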
import os, json, ijson, httplib, uuid

es_url = "localhost:9200"
es_path = "/bibserver_bnb/record"

storedir = 'json/'
dirList = os.listdir(storedir)
count = 0
for filename in dirList:
    #if count > 0: break
    count += 1
    print 'round ' + str(count) + ' doing file ' + filename
    infile = open(storedir + filename, 'r')
    for record in ijson.items(infile, 'item'):
        if 'issued' in record:
            record['year'] = record['issued']
            del record['issued']
        if 'description' in record:
            record['description'] = record['description'].replace('\n', ' ')
        if 'identifier' in record:
            for index, val in enumerate(record['identifier']):
                if val.get('type', '') == 'ISBN':
                    #record['identifier'][index]['type'] = 'isbn'
                    record['identifier'][index]['id'] = val['id'][len('URN:ISBN:'):]
                if val.get('type', '') == 'BNB':
                    record['cid'] = val['id']
        if record:
            if 'collection' not in record:
                record['collection'] = 'bnb'
            if 'owner' not in record:
                record['owner'] = 'bibsoup'
            record['id'] = uuid.uuid4().hex
            c = httplib.HTTPConnection(es_url)
            c.request('PUT', es_path + '/' + record['id'], json.dumps(record))
            result = c.getresponse()
            if result.status != 201:
                print result.status, result.reason, record
    infile.close()
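The loop above makes one HTTP request per record, which is slow for a dataset of this size. A minimal sketch of doing the same indexing through the Elasticsearch bulk endpoint instead, assuming the same index and type names and that `records` is a list of already-transformed dicts (the function name and batch size are illustrative):

import json, httplib, uuid

def bulk_index(records, es_url="localhost:9200", index="bibserver_bnb", doctype="record", batch_size=1000):
    # Send records in batches: the bulk body is newline-delimited, one action line
    # followed by one source line per record, POSTed to /_bulk.
    for start in range(0, len(records), batch_size):
        lines = []
        for record in records[start:start + batch_size]:
            record.setdefault('id', uuid.uuid4().hex)
            lines.append(json.dumps({"index": {"_index": index, "_type": doctype, "_id": record['id']}}))
            lines.append(json.dumps(record))
        body = '\n'.join(lines) + '\n'
        c = httplib.HTTPConnection(es_url)
        c.request('POST', '/_bulk', body)
        result = c.getresponse()
        if result.status != 200:
            print result.status, result.reason
        result.read()
        c.close()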
# Fixes the JSON output of bnb2bibjson.py by dropping the final line (which carries the
# stray trailing comma) and closing the array properly.
# NOTE: this is inefficient - it was written to fix the problem after the fact.
import os

targetdir = 'json2/'
storedir = 'json/'
dirList = os.listdir(storedir)
count = 0
for filename in dirList:
    count += 1
    print str(count)
    infile = open(storedir + filename, 'r')
    lines = infile.readlines()
    infile.close()
    outfile = open(targetdir + filename, 'w')
    outfile.writelines(lines[:-1])
    outfile.write('}]')
    outfile.close()
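A quick check before indexing, to confirm the repaired files now parse as valid JSON: load each one and print its record count (a small sketch, assuming the repaired files are in json2/ as above):

import os, json

checkdir = 'json2/'
for filename in os.listdir(checkdir):
    # json.load raises ValueError if a file is still not valid JSON
    f = open(checkdir + filename, 'r')
    records = json.load(f)
    f.close()
    print filename, len(records)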