Skip to content

Instantly share code, notes, and snippets.

@adjam
Last active September 21, 2016 20:39
Show Gist options
  • Save adjam/792dc25456e2fde73503d08017700ef1 to your computer and use it in GitHub Desktop.
Save adjam/792dc25456e2fde73503d08017700ef1 to your computer and use it in GitHub Desktop.
Extract TOCs from Syndetics ICE XML
#!/usr/bin/python
from lxml import etree
import glob
import json
import re
class Cleaner(object):
    """Callable that extracts the first ISBN-shaped token from a string.

    A token is either 13 consecutive digits, or 9 digits followed by a
    digit or ``X``. Only the shape is checked; check digits are not
    validated.
    """

    def __init__(self):
        # The 13-digit alternative must come first: regex alternation takes
        # the first branch that matches, so with the 10-character branch in
        # front a 13-digit ISBN would be cut down to its first 10 digits.
        self.r = re.compile(r"(\d{13}|\d{9}[0-9X])")

    def __call__(self, value):
        """Return the first ISBN-shaped substring of ``value``, or None.

        ``value`` is upper-cased before matching so that a lower-case
        ``x`` check character is accepted (and returned as ``X``).
        """
        m = self.r.search(value.upper())
        return m.group(1) if m else None
# Read the first *.xml file in the working directory and print one JSON
# object per USMARC record: its ISBNs, its title and its chapter list.
xml_paths = glob.glob("*.xml")
if not xml_paths:
    raise SystemExit("no *.xml file found in the current directory")

# Binary mode lets the XML parser handle the document's declared encoding.
with open(xml_paths[0], "rb") as f:
    isbncleaner = Cleaner()
    elements = etree.iterparse(f, events=('end',), tag="USMARC")
    for action, element in elements:
        dfld = element.xpath("VarFlds/VarDFlds[1]")[0]
        ssifld = dfld.xpath("SSIFlds[1]")[0]

        # Keep only the values in which the cleaner found an ISBN.
        raw_isbns = dfld.xpath("NumbCode/Fld020/a/text()")
        isbns = [x for x in (isbncleaner(y) for y in raw_isbns) if x]

        titles = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")
        if not titles:
            # Dump the offending record before stopping so it can be inspected.
            print(etree.tostring(element))
            raise ValueError("USMARC record has no Fld245 a/b title text")
        title = titles[0]

        chapters = []
        for chapter in ssifld.xpath("Fld970[@I1 != '0']"):
            d = {}
            # In XPath, "|" binds looser than "/", so "e|f/text()" would
            # select the <e> *elements* (not JSON-serializable) plus the
            # text of <f>. Ask for the text of both explicitly.
            authors = chapter.xpath("e/text()|f/text()")
            chapter_titles = chapter.xpath("t/text()")
            if authors:
                d['authors'] = authors
            if chapter_titles:
                d['title'] = chapter_titles[0]
            # Fields that yielded neither authors nor a title are dropped.
            if d:
                chapters.append(d)

        rec = {"isbn": isbns,
               "title": title,
               "chapters": chapters
               }
        print(json.dumps(rec))

        # Release the processed record's subtree so memory use does not
        # grow with the size of the input file.
        element.clear()
#!/usr/bin/env ruby
# Note: this script requires the nokogiri (XML processing) and lisbn gems.
# Nokogiri builds against the libxml2 and libxslt native libraries, so on
# Linux make sure their -devel packages are installed, along with
# development tools such as a C/C++ compiler.
#
require 'nokogiri'
require 'lisbn'
require 'json'
module Argot
  module XML
    # Implementation of a SAX Parser that yields an element
    # as its end tag is encountered in the source document.
    #
    # Allows memory-efficient iteration over large XML files
    # while still allowing use of Nokogiri's API to query
    # and update the structure of returned elements
    #
    # If the `tag` parameter is supplied, will only yield
    # elements matching the supplied name. With the default
    # tag (`*`) every element is yielded as it closes; note that
    # in that mode the tree under the root element is kept until
    # the root itself closes.
    #
    # Usage:
    #   handler = Argot::XML::EventParser.new("record") { |rec|
    #     ... process each `record` element
    #   }
    #   Nokogiri::XML::SAX::Parser.new(handler).parse(io)
    class EventParser < Nokogiri::XML::SAX::Document
      # Creates a new instance
      # Parameters:
      #
      # +tag+ the tag to look for (`*` matches every element)
      # +block+ the block to be executed as each record's end element
      #         is encountered; it receives the rebuilt element
      def initialize(tag = '*', &block)
        @context = []
        @tag = tag
        @block = block
        @current = nil
        @current_doc = nil
      end

      # SAX callback: records the element name in the context stack and,
      # when inside (or at the start of) a wanted element, appends a new
      # element carrying the given attributes to the tree being built.
      def start_element(name, attributes = []) #:nodoc:
        @context.push(name)
        # Nothing is built for elements outside of a wanted subtree.
        return unless @current || wanted?(name)

        @current_doc ||= Nokogiri::XML::Document.new
        el = Nokogiri::XML::Element.new(name, @current_doc)
        attributes.each do |attr_name, attr_value|
          el.set_attribute(attr_name, attr_value)
        end
        # A wanted element nested inside the one being built is attached
        # like any other child instead of replacing its ancestor.
        @current.add_child(el) if @current
        @current = el
      end

      # Builds a text node owned by the document currently being assembled.
      def text_node(string)
        Nokogiri::XML::Text.new(string, @current_doc)
      end

      # SAX callback: appends character data to the element being built;
      # text outside of a wanted subtree is ignored.
      def characters(string)
        @current.add_child(text_node(string)) if @current
      end

      # get the current context of the document
      # from the root down to the current element, with each
      # element name separated by a `/`
      def show_context
        @context.join('/')
      end

      # SAX callback: yields the element that just closed when it is a
      # wanted one, then steps back to its parent. Once the outermost
      # built element closes, the working document is dropped.
      def end_element(name) #:nodoc:
        if @current
          finished = @current
          # Read the parent first: the block is free to move the element.
          enclosing = finished.parent
          @block.call(finished) if @block && wanted?(name)
          @current = enclosing
          @current_doc = nil if @current.nil?
        end
        @context.pop
      end

      private

      # True when elements called +name+ should be yielded to the block.
      def wanted?(name)
        @tag == '*' || @tag == name
      end
    end
  end
end
# Normalizes +value+ and checks it with Lisbn.
#
# The value is upper-cased and every character other than the digits
# 0-9 and the letter X is removed before validation.
#
# Returns a two-element array: the result of Lisbn#valid? for the
# normalized string, followed by the normalized string itself.
def good_isbn?(value)
  normalized = value.upcase.delete('^0-9X')
  validity = Lisbn.new(normalized).valid?
  [validity, normalized]
end
# Read the first *.xml file in the working directory and print one JSON
# object per USMARC record: its valid ISBNs, its title and its chapters.
ice_data = Dir['*.xml'][0]
abort 'no *.xml file found in the current directory' if ice_data.nil?

handler = Argot::XML::EventParser.new('USMARC') do |el|
  dfld = el.at_xpath('VarFlds/VarDFlds[1]')
  if dfld.nil?
    warn 'skipping USMARC record without VarFlds/VarDFlds'
    next
  end
  ssifld = dfld.at_xpath('SSIFlds[1]')

  # good_isbn? returns [valid, normalized]; keep the normalized form of
  # the values that validated.
  isbns = dfld.xpath('NumbCode/Fld020/a/text()')
              .map { |node| good_isbn?(node.text) }
              .select { |valid, _| valid }
              .map { |_, isbn| isbn }

  # Serialize the node's text rather than the node object itself.
  title_node = dfld.at_xpath('Titles/Fld245/*[self::a or self::b][1]/text()')
  title = title_node && title_node.text

  # A record without an SSIFlds section simply has no chapters.
  chapter_fields = ssifld ? ssifld.xpath("Fld970[@I1 != '0']") : []
  chapter_entries = chapter_fields.map do |field|
    entry = {}
    # In XPath, "|" binds looser than "/", so "e|f/text()" would select
    # the <e> elements themselves plus the text of <f>. Ask for the text
    # of both explicitly.
    authors = field.xpath('e/text()|f/text()').map(&:text)
    chapter_title = field.at_xpath('t/text()')
    # An empty node set is truthy in Ruby, so test for content explicitly.
    entry[:authors] = authors unless authors.empty?
    entry[:title] = chapter_title.text if chapter_title
    entry
  end
  # Drop fields that produced neither authors nor a title.
  chapters = chapter_entries.reject(&:empty?)

  rec = { :isbn => isbns,
          :title => title,
          :chapters => chapters }
  puts rec.to_json
end

# The block form closes the input file once parsing finishes or fails.
File.open(ice_data) do |io|
  Nokogiri::XML::SAX::Parser.new(handler).parse(io)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment