Last active
September 21, 2016 20:39
-
-
Save adjam/792dc25456e2fde73503d08017700ef1 to your computer and use it in GitHub Desktop.
Extract TOCs from Syndetics ICE XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from lxml import etree | |
import glob | |
import json | |
import re | |
class Cleaner(object):
    """Callable class to extract valid-ish ISBNs from text.

    Calling an instance with a string returns the first run of 13 digits,
    or failing that the first run of 9 digits followed by a digit or ``X``
    (matching is done on the upper-cased text). Returns ``None`` when the
    text contains no such run. No checksum validation is performed.
    """

    def __init__(self):
        # The 13-digit alternative must come first: otherwise the
        # 10-character alternative matches the first ten digits of every
        # ISBN-13 and the result is silently truncated.
        self.r = re.compile(r"(\d{13}|\d{9}[0-9X])")

    def __call__(self, value):
        m = self.r.search(value.upper())
        if m:
            return m.group(1)
        return None
# Driver: stream the first *.xml file found in the current directory and
# print one JSON object per USMARC record with its ISBNs, title and chapters.
xml_paths = glob.glob("*.xml")
if not xml_paths:
    raise SystemExit("No *.xml file found in the current directory")

# Binary mode lets the XML parser handle the document encoding itself.
with open(xml_paths[0], "rb") as f:
    isbncleaner = Cleaner()
    elements = etree.iterparse(f, events=('end',), tag="USMARC")
    for action, element in elements:
        dfld = element.xpath("VarFlds/VarDFlds[1]")[0]
        ssifld = dfld.xpath("SSIFlds[1]")[0]
        raw_isbns = dfld.xpath("NumbCode/Fld020/a/text()")
        # Keep only the values the cleaner could extract an ISBN from.
        isbns = [isbn for isbn in (isbncleaner(raw) for raw in raw_isbns) if isbn]
        titles = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")
        if not titles:
            # A record without a title is treated as fatal: dump it so it
            # can be inspected, then stop with a descriptive error.
            print(etree.tostring(element))
            raise ValueError("USMARC record has no Fld245 a/b title text")
        title = titles[0]
        chapters = []
        for chapter in ssifld.xpath("Fld970[@I1 != '0']"):
            d = {}
            # In XPath, "|" binds more loosely than "/", so "e|f/text()"
            # selects the <e> *elements* (which are not JSON serialisable)
            # plus the text of <f>. Select the text of both explicitly.
            authors = chapter.xpath("e/text()|f/text()")
            chapter_titles = chapter.xpath("t/text()")
            if authors:
                d['authors'] = authors
            if chapter_titles:
                d['title'] = chapter_titles[0]
            if d:
                chapters.append(d)
        rec = {"isbn": isbns,
               "title": title,
               "chapters": chapters
               }
        print(json.dumps(rec))
        # Release the processed record so memory use does not grow with
        # the size of the input file.
        element.clear()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# NOTE: this script requires the nokogiri (XML processing) and lisbn gems.
# Nokogiri builds against the native libxml2 and libxslt libraries, so on
# Linux make sure their -devel packages are installed, along with
# development tools such as a C/C++ compiler.
# | |
require 'nokogiri' | |
require 'lisbn' | |
require 'json' | |
module Argot
  module XML
    # Implementation of a SAX document handler that yields an element
    # as its end tag is encountered in the source document.
    #
    # Allows memory-efficient iteration over large XML files
    # while still allowing use of Nokogiri's API to query
    # and update the structure of returned elements.
    #
    # If the `tag` parameter is supplied, only elements with that name
    # are yielded (together with their attributes, text and descendants).
    # With the default tag `*` every element is yielded as it closes; in
    # that mode the elements stay attached to their parents, so the whole
    # tree under the root is kept until the root element closes.
    #
    # Usage:
    #   handler = Argot::XML::EventParser.new("record") { |rec|
    #     # ... process each `record` element
    #   }
    #   Nokogiri::XML::SAX::Parser.new(handler).parse(io)
    class EventParser < Nokogiri::XML::SAX::Document
      # Tag value meaning "match every element".
      WILDCARD = '*'

      # Creates a new instance
      # Parameters:
      #
      # +tag+ the tag to look for (`*` matches every element)
      # +block+ the block to be executed as each matching element's end
      #         tag is encountered; it receives the rebuilt element
      def initialize(tag = '*', &block)
        @context = []
        @tag = tag
        @block = block
        # Elements currently being rebuilt, outermost first.
        @open_elements = []
        @current_doc = nil
      end

      def start_element(name, attributes = []) #:nodoc:
        @context.push(name)
        parent = @open_elements.last
        # Outside of a record only a matching element starts a new one;
        # nothing is allocated for elements that will never be yielded.
        return unless parent || wanted?(name)

        @current_doc ||= Nokogiri::XML::Document.new
        el = Nokogiri::XML::Element.new(name, @current_doc)
        attributes.each do |attr|
          el.set_attribute(attr[0], attr[1])
        end
        # Inside a record every element, including one that has the same
        # name as the tag, is attached to the element that encloses it.
        el.parent = parent if parent
        @open_elements.push(el)
      end

      # Builds a text node owned by the document of the record currently
      # being rebuilt.
      def text_node(string)
        Nokogiri::XML::Text.new(string, @current_doc)
      end

      def characters(string) #:nodoc:
        current = @open_elements.last
        current.add_child(text_node(string)) if current
      end

      # get the current context of the document
      # from the root down to the current element, with each
      # element name separated by a `/`
      def show_context
        @context.join("/")
      end

      def end_element(name) #:nodoc:
        finished = @open_elements.pop
        if finished
          @block.call(finished) if @block && wanted?(name)
          # Once the outermost element has closed, drop the document so
          # the next record starts from a fresh one.
          @current_doc = nil if @open_elements.empty?
        end
        @context.pop
      end

      private

      # True when an element called +name+ should be yielded.
      def wanted?(name)
        @tag == WILDCARD || @tag == name
      end
    end
  end
end
# Checks whether +value+ contains a valid ISBN.
#
# Two candidates are tried in order:
# 1. the upper-cased text with everything except digits and `X` removed;
# 2. the first run of 13 digits, or of 9 digits followed by a digit or
#    `X`, found in the upper-cased text after removing hyphens. This
#    rescues values whose qualifier contains an `x`, e.g.
#    "9780306406157 (box)", which candidate 1 turns into "9780306406157X".
#
# Parameters:
# +value+ the raw text (nil is treated as an empty string)
#
# Returns a two-element array `[valid, isbn]`: `valid` is true when one of
# the candidates passes `Lisbn#valid?`, and `isbn` is that candidate (or
# candidate 1 when neither is valid). Note that the array itself is always
# truthy; callers must inspect its first element.
def good_isbn?(value)
  text = value.to_s.upcase
  stripped = text.gsub(/[^0-9X]/, '')
  isbn_run = text.delete('-')[/\d{13}|\d{9}[\dX]/]
  candidates = [stripped, isbn_run].compact.uniq
  chosen = candidates.find { |candidate| Lisbn.new(candidate).valid? }
  [!chosen.nil?, chosen || stripped]
end
# Driver: stream the first *.xml file found in the current directory and
# print one JSON object per USMARC record with its ISBNs, title and chapters.
ice_data = Dir['*.xml'].first
abort 'No *.xml file found in the current directory' unless ice_data

handler = Argot::XML::EventParser.new("USMARC") do |el|
  dfld = el.xpath("VarFlds/VarDFlds[1]").first
  if dfld.nil?
    # Report on stderr so the JSON stream on stdout stays clean.
    warn 'Skipping USMARC record without VarFlds/VarDFlds'
    next
  end
  ssifld = dfld.xpath("SSIFlds[1]").first

  isbns = dfld.xpath("NumbCode/Fld020/a/text()")
              .map { |node| good_isbn?(node.text) }
              .select { |valid, _isbn| valid }
              .map { |_valid, isbn| isbn }

  # Use the node's text rather than the node itself: serialising a node to
  # JSON goes through its XML form, which escapes characters such as "&".
  title_node = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()").first
  title = title_node && title_node.text

  chapter_fields = ssifld ? ssifld.xpath("Fld970[@I1 != '0']") : []
  chapters = chapter_fields.map do |field|
    d = {}
    # In XPath, "|" binds more loosely than "/", so "e|f/text()" selects
    # the <e> *elements* plus the text of <f>. Select the text of both.
    authors = field.xpath("e/text()|f/text()").map(&:text)
    chapter_title = field.xpath("t/text()").first
    # A NodeSet is always truthy, so test for content explicitly.
    d[:authors] = authors unless authors.empty?
    d[:title] = chapter_title.text if chapter_title
    d
  end
  # Drop chapter fields that yielded neither authors nor a title.
  chapters = chapters.reject(&:empty?)

  rec = { :isbn => isbns,
          :title => title,
          :chapters => chapters
        }
  puts rec.to_json
end

parser = Nokogiri::XML::SAX::Parser.new(handler)
# The block form closes the file even if parsing raises.
File.open(ice_data) { |io| parser.parse(io) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment