adjam · September 21, 2016 20:39
diff --git a/extract_ice.py b/extract_ice.py
 #!/usr/bin/python

 from lxml import etree
 import glob
 import json
 import re


 class Cleaner(object):
     """Callable class to extract valid-ish ISBNs from text.

     Calling an instance with a string returns the first ISBN-shaped token
     found in it (upper-cased), or ``None`` when there is none.  A token is
     either 13 consecutive digits, or 9 digits followed by a digit or ``X``.
     No check-digit validation is performed and hyphenated ISBNs are not
     recognised.
     """

     def __init__(self):
         # The 13-digit alternative must come first: regex alternation is
         # ordered, so trying the 10-character form first would match only
         # the leading 10 digits of an ISBN-13 and return a truncated value.
         self.r = re.compile(r"(\d{13}|\d{9}[0-9X])")

     def __call__(self, value):
         """Return the first ISBN-like token in ``value``, or ``None``."""
         # Upper-case first so a lower-case check character ("x") matches.
         m = self.r.search(value.upper())
         if m:
             return m.group(1)
         return None

 # Stream every <USMARC> record from the first *.xml file in the current
 # directory and print one JSON object per record holding its ISBNs, title
 # and chapter listing.
 with open(glob.glob("*.xml")[0]) as f:
     isbncleaner = Cleaner()
     # iterparse hands back each USMARC element once its end tag is read.
     elements = etree.iterparse(f, events=('end',), tag="USMARC")
     for action, element in elements:
         dfld = element.xpath("VarFlds/VarDFlds[1]")[0]
         ssifld = dfld.xpath("SSIFlds[1]")[0]

         # Keep only the Fld020/a values that contain an ISBN-shaped token.
         raw_isbns = dfld.xpath("NumbCode/Fld020/a/text()")
         isbns = [isbn for isbn in map(isbncleaner, raw_isbns) if isbn]

         titles = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")
         if not titles:
             # Dump the offending record, then abort with an explicit error
             # (previously a deliberate ``1 / 0``).
             print(etree.tostring(element))
             raise ValueError("USMARC record has no Fld245 a/b title text")
         title = titles[0]

         chapters = []
         for chapter in ssifld.xpath("Fld970[@I1 != '0']"):
             d = {}
             # "e|f/text()" parses as (e) | (f/text()) and would return <e>
             # *elements*, which json.dumps cannot serialise; take the text
             # of both subfields instead.
             authors = chapter.xpath("e/text()|f/text()")
             chapter_titles = chapter.xpath("t/text()")
             if authors:
                 d['authors'] = authors
             if chapter_titles:
                 d['title'] = chapter_titles[0]
             if d:
                 chapters.append(d)

         rec = {"isbn": isbns,
                "title": title,
                "chapters": chapters
                }
         print(json.dumps(rec))
         # The record has been emitted; drop its subtree so memory use does
         # not grow with the size of the input file.
         element.clear()
diff --git a/extract_ice.rb b/extract_ice.rb
 #!/usr/bin/env ruby

 # Note: this script requires the nokogiri (XML processing) and lisbn gems.
 # nokogiri builds against the native libxml2 and libxslt libraries, so on
 # Linux make sure their -devel packages are installed, along with
 # development tools such as a C/C++ compiler.
 #
 require 'nokogiri'
 require 'lisbn'
 require 'json'

 module Argot
     module XML
         # Implementation of a SAX handler that rebuilds each element named
         # +tag+ as a Nokogiri element and yields it to a block when its end
         # tag is encountered in the source document.
         #
         # Allows memory-efficient iteration over large XML files
         # while still allowing use of Nokogiri's API to query
         # and update the structure of the yielded elements.
         #
         # NOTE: with the default +tag+ of '*' no element is ever built and
         # the block is never called; pass the record element's name.
         #
         # Usage:
         #   handler = Argot::XML::EventParser.new("record") { |rec|
         #     ... process each `record` element
         #   }
         #   Nokogiri::XML::SAX::Parser.new(handler).parse(io)
         class EventParser < Nokogiri::XML::SAX::Document

             # Creates a new instance
             # Parameters:
             #
             # +tag+ the name of the element to yield
             # +block+ the block to be executed as each record's end element
             # is encountered; it receives the rebuilt element
             def initialize(tag='*', &block)
                 @context = []
                 @tag = tag
                 @block = block
                 # Element currently being filled in, and the document that
                 # owns the nodes of the record under construction.
                 @current = nil
                 @current_doc = nil
             end

             def start_element(name, attributes=[])#:nodoc
                 @context.push(name)
                 if name == @tag
                     # A record starts here: it becomes the root of the
                     # subtree handed to the block.
                     @current_doc ||= Nokogiri::XML::Document.new
                     @current = build_element(name, attributes)
                 elsif @current
                     # Inside a record: attach the new element under the
                     # one being built and descend into it.
                     el = build_element(name, attributes)
                     el.parent = @current
                     @current = el
                 end
                 # Elements outside any record are ignored, so nothing is
                 # allocated for them.
             end

             # Creates a text node owned by the current record's document.
             def text_node(string)
                 Nokogiri::XML::Text.new(string, @current_doc)
             end

             # Appends character data to the element being built; text
             # outside a record is discarded.
             def characters(string)
                 unless @current.nil?
                     @current.add_child(text_node(string))
                 end
             end

             # CDATA sections carry character data as well; keep their
             # content as ordinary text instead of silently dropping it.
             def cdata_block(string)
                 characters(string)
             end

             # get the current context of the document
             # from the root down to the current element, with each
             # element name separated by a `/`
             def show_context()
                 @context.join("/")
             end

             def end_element(name)#:nodoc
                 if name == @tag
                     # Record complete: hand it over and reset so the next
                     # record starts from a fresh document.
                     @block.call @current
                     @current_doc = nil
                     @current = nil
                 elsif @current
                     # Step back up to the enclosing element.
                     @current = @current.parent
                 end
                 @context.pop
             end

             private

             # Builds an element owned by the current record's document and
             # copies the SAX [name, value] attribute pairs onto it.
             def build_element(name, attributes)
                 el = Nokogiri::XML::Element.new(name, @current_doc)
                 attributes.each do |attr|
                     el.set_attribute(attr[0], attr[1])
                 end
                 el
             end
         end
     end
 end

 # Reduces +value+ to its digits and 'X' characters (after upper-casing)
 # and asks Lisbn whether the result is a valid ISBN.
 #
 # Returns a two-element array: [validity flag, reduced string].
 def good_isbn?(value)
     upper = value.upcase
     candidate = upper.gsub(/[^0-9X]/, '')
     is_valid = Lisbn.new(candidate).valid?
     return [is_valid, candidate]
 end

 # Input: the first *.xml file found in the current directory.
 ice_data = Dir['*.xml'][0]

 # For every USMARC record, print one JSON object holding its valid ISBNs,
 # its title and its chapter listing.
 p = Argot::XML::EventParser.new("USMARC") { |el|
     dfld = el.xpath("VarFlds/VarDFlds[1]")[0]
     ssifld = dfld.xpath("SSIFlds[1]")[0]

     isbns = dfld.xpath("NumbCode/Fld020/a/text()")
         .map { |node| good_isbn?(node.text) }
         .select { |valid, _| valid }
         .map { |_, isbn| isbn }

     # Use the node's text rather than the node itself so that the JSON
     # holds the plain string; a record without a title yields null.
     title_node = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")[0]
     title = title_node && title_node.text

     chapters = ssifld.xpath("Fld970[@I1 != '0']").map { |field|
         d = {}
         # "e|f/text()" parses as (e) | (f/text()) and would return whole
         # <e> elements; take the text of both subfields instead.
         authors = field.xpath("e/text()|f/text()").map { |n| n.text }
         titles = field.xpath("t/text()")
         # An empty NodeSet/Array is still truthy in Ruby, so test for
         # emptiness explicitly.
         d[:authors] = authors unless authors.empty?
         d[:title] = titles[0].text unless titles.empty?
         d
     }.reject { |d| d.empty? }
     rec = { :isbn => isbns,
             :title => title,
             :chapters => chapters
     }
     puts rec.to_json
 }

 parser = Nokogiri::XML::SAX::Parser.new(p)

 # The block form closes the file once parsing finishes or fails.
 File.open(ice_data) { |io| parser.parse(io) }
	#!/usr/bin/python

	from lxml import etree
	import glob
	import json
	import re


	class Cleaner(object):
	    """Callable class to extract valid-ish ISBNs from text.

	    Calling an instance with a string returns the first ISBN-shaped token
	    found in it (upper-cased), or ``None`` when there is none.  A token is
	    either 13 consecutive digits, or 9 digits followed by a digit or ``X``.
	    No check-digit validation is performed and hyphenated ISBNs are not
	    recognised.
	    """

	    def __init__(self):
	        # Plain "|" is regex alternation ("\|" would match a literal
	        # pipe).  The 13-digit alternative comes first because
	        # alternation is ordered: trying the 10-character form first
	        # would return only the leading 10 digits of an ISBN-13.
	        self.r = re.compile(r"(\d{13}|\d{9}[0-9X])")

	    def __call__(self, value):
	        """Return the first ISBN-like token in ``value``, or ``None``."""
	        # Upper-case first so a lower-case check character ("x") matches.
	        m = self.r.search(value.upper())
	        if m:
	            return m.group(1)
	        return None

	# Stream every <USMARC> record from the first *.xml file in the current
	# directory and print one JSON object per record holding its ISBNs, title
	# and chapter listing.
	with open(glob.glob("*.xml")[0]) as f:
	    isbncleaner = Cleaner()
	    # iterparse hands back each USMARC element once its end tag is read.
	    elements = etree.iterparse(f, events=('end',), tag="USMARC")
	    for action, element in elements:
	        dfld = element.xpath("VarFlds/VarDFlds[1]")[0]
	        ssifld = dfld.xpath("SSIFlds[1]")[0]

	        # Keep only the Fld020/a values that contain an ISBN-shaped token.
	        raw_isbns = dfld.xpath("NumbCode/Fld020/a/text()")
	        isbns = [isbn for isbn in map(isbncleaner, raw_isbns) if isbn]

	        titles = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")
	        if not titles:
	            # Dump the offending record, then abort with an explicit
	            # error (previously a deliberate ``1 / 0``).
	            print(etree.tostring(element))
	            raise ValueError("USMARC record has no Fld245 a/b title text")
	        title = titles[0]

	        chapters = []
	        for chapter in ssifld.xpath("Fld970[@I1 != '0']"):
	            d = {}
	            # "e|f/text()" parses as (e) | (f/text()) and would return
	            # <e> *elements*, which json.dumps cannot serialise; take the
	            # text of both subfields instead.
	            authors = chapter.xpath("e/text()|f/text()")
	            chapter_titles = chapter.xpath("t/text()")
	            if authors:
	                d['authors'] = authors
	            if chapter_titles:
	                d['title'] = chapter_titles[0]
	            if d:
	                chapters.append(d)

	        rec = {"isbn": isbns,
	               "title": title,
	               "chapters": chapters
	               }
	        print(json.dumps(rec))
	        # The record has been emitted; drop its subtree so memory use
	        # does not grow with the size of the input file.
	        element.clear()
	#!/usr/bin/env ruby

	# Note: this script requires the nokogiri (XML processing) and lisbn gems.
	# nokogiri builds against the native libxml2 and libxslt libraries, so on
	# Linux make sure their -devel packages are installed, along with
	# development tools such as a C/C++ compiler.
	#
	require 'nokogiri'
	require 'lisbn'
	require 'json'

	module Argot
	    module XML
	        # Implementation of a SAX handler that rebuilds each element named
	        # +tag+ as a Nokogiri element and yields it to a block when its end
	        # tag is encountered in the source document.
	        #
	        # Allows memory-efficient iteration over large XML files
	        # while still allowing use of Nokogiri's API to query
	        # and update the structure of the yielded elements.
	        #
	        # NOTE: with the default +tag+ of '*' no element is ever built and
	        # the block is never called; pass the record element's name.
	        #
	        # Usage:
	        #   handler = Argot::XML::EventParser.new("record") { |rec|
	        #     ... process each `record` element
	        #   }
	        #   Nokogiri::XML::SAX::Parser.new(handler).parse(io)
	        class EventParser < Nokogiri::XML::SAX::Document

	            # Creates a new instance
	            # Parameters:
	            #
	            # +tag+ the name of the element to yield
	            # +block+ the block to be executed as each record's end element
	            # is encountered; it receives the rebuilt element
	            def initialize(tag='*', &block)
	                @context = []
	                @tag = tag
	                @block = block
	                # Element currently being filled in, and the document that
	                # owns the nodes of the record under construction.
	                @current = nil
	                @current_doc = nil
	            end

	            def start_element(name, attributes=[])#:nodoc
	                @context.push(name)
	                if name == @tag
	                    # A record starts here: it becomes the root of the
	                    # subtree handed to the block.
	                    @current_doc ||= Nokogiri::XML::Document.new
	                    @current = build_element(name, attributes)
	                elsif @current
	                    # Inside a record: attach the new element under the
	                    # one being built and descend into it.
	                    el = build_element(name, attributes)
	                    el.parent = @current
	                    @current = el
	                end
	                # Elements outside any record are ignored, so nothing is
	                # allocated for them.
	            end

	            # Creates a text node owned by the current record's document.
	            def text_node(string)
	                Nokogiri::XML::Text.new(string, @current_doc)
	            end

	            # Appends character data to the element being built; text
	            # outside a record is discarded.
	            def characters(string)
	                unless @current.nil?
	                    @current.add_child(text_node(string))
	                end
	            end

	            # CDATA sections carry character data as well; keep their
	            # content as ordinary text instead of silently dropping it.
	            def cdata_block(string)
	                characters(string)
	            end

	            # get the current context of the document
	            # from the root down to the current element, with each
	            # element name separated by a `/`
	            def show_context()
	                @context.join("/")
	            end

	            def end_element(name)#:nodoc
	                if name == @tag
	                    # Record complete: hand it over and reset so the next
	                    # record starts from a fresh document.
	                    @block.call @current
	                    @current_doc = nil
	                    @current = nil
	                elsif @current
	                    # Step back up to the enclosing element.
	                    @current = @current.parent
	                end
	                @context.pop
	            end

	            private

	            # Builds an element owned by the current record's document and
	            # copies the SAX [name, value] attribute pairs onto it.
	            def build_element(name, attributes)
	                el = Nokogiri::XML::Element.new(name, @current_doc)
	                attributes.each do |attr|
	                    el.set_attribute(attr[0], attr[1])
	                end
	                el
	            end
	        end
	    end
	end

	# Reduces +value+ to its digits and 'X' characters (after upper-casing)
	# and asks Lisbn whether the result is a valid ISBN.
	#
	# Returns a two-element array: [validity flag, reduced string].
	def good_isbn?(value)
	    upper = value.upcase
	    candidate = upper.gsub(/[^0-9X]/, '')
	    is_valid = Lisbn.new(candidate).valid?
	    return [is_valid, candidate]
	end

	# Input: the first *.xml file found in the current directory.
	ice_data = Dir['*.xml'][0]

	# For every USMARC record, print one JSON object holding its valid ISBNs,
	# its title and its chapter listing.
	p = Argot::XML::EventParser.new("USMARC") { |el|
	    dfld = el.xpath("VarFlds/VarDFlds[1]")[0]
	    ssifld = dfld.xpath("SSIFlds[1]")[0]

	    isbns = dfld.xpath("NumbCode/Fld020/a/text()")
	        .map { |node| good_isbn?(node.text) }
	        .select { |valid, _| valid }
	        .map { |_, isbn| isbn }

	    # Use the node's text rather than the node itself so that the JSON
	    # holds the plain string; a record without a title yields null.
	    title_node = dfld.xpath("Titles/Fld245/*[self::a or self::b][1]/text()")[0]
	    title = title_node && title_node.text

	    chapters = ssifld.xpath("Fld970[@I1 != '0']").map { |field|
	        d = {}
	        # "e|f/text()" parses as (e) | (f/text()) and would return whole
	        # <e> elements; take the text of both subfields instead.
	        authors = field.xpath("e/text()|f/text()").map { |n| n.text }
	        titles = field.xpath("t/text()")
	        # An empty NodeSet/Array is still truthy in Ruby, so test for
	        # emptiness explicitly.
	        d[:authors] = authors unless authors.empty?
	        d[:title] = titles[0].text unless titles.empty?
	        d
	    }.reject { |d| d.empty? }
	    rec = { :isbn => isbns,
	            :title => title,
	            :chapters => chapters
	    }
	    puts rec.to_json
	}

	parser = Nokogiri::XML::SAX::Parser.new(p)

	# The block form closes the file once parsing finishes or fails.
	File.open(ice_data) { |io| parser.parse(io) }