mwmitchell · June 2, 2009 20:21
diff --git a/gistfile1.rb b/gistfile1.rb
 # This is for breaking up XML docs
 # into multiple chunks by a given node type.
 # For example, in TEI the "pb" tag exists to signify page breaks...
 # Using this code, you could break up the doc into multiple documents
 # based on the position/depth of the individual "pb" tags.
 #
 # Think of this as an xml splitter, like String#split
 #

 require 'rubygems'
 require 'nokogiri'

 module NokogiriElementHelpers

   # Returns a flat Array containing this node followed by all of its
   # descendants in document (depth-first, pre-order) order.
   #
   # Children that do not respond to #flatten (node classes that were never
   # given this mixin) are listed as leaves instead of raising NoMethodError;
   # their own children are not expanded.
   def flatten
     children.inject([self]) do |nodes, child|
       nodes.concat(child.respond_to?(:flatten) ? child.flatten : [child])
     end
   end

 end

 module NokogiriNodeHelpers

   # Returns every sibling that precedes this node, earliest first.
   #
   # The previous_sibling chain is walked iteratively, so the siblings
   # themselves only need to respond to #previous_sibling.
   def previous_siblings
     siblings = []
     current = previous_sibling
     while current
       siblings << current
       current = current.previous_sibling
     end
     siblings.reverse
   end

   # Returns the previous siblings of this node together with the previous
   # siblings of each of its ancestors, outermost ancestor first. Each node
   # appears exactly once.
   #
   # A node whose parent is the document contributes only its immediate
   # previous sibling (if there is one).
   def previous_nodes_recursive
     return [previous_sibling].compact if parent.is_a?(Nokogiri::XML::Document)

     # The parent's result already covers everything above this level, so
     # only this node's own preceding siblings need to be appended.
     parent.previous_nodes_recursive + previous_siblings
   end

 end

 # Mix the traversal helpers into the Nokogiri node classes.
 # The document only gets the descendant walker; elements, comments and
 # text nodes additionally get the previous-sibling helpers.
 [
   [Nokogiri::XML::Document, [NokogiriElementHelpers]],
   [Nokogiri::XML::Element,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
   [Nokogiri::XML::Comment,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
   [Nokogiri::XML::Text,     [NokogiriElementHelpers, NokogiriNodeHelpers]]
 ].each do |klass, helpers|
   helpers.each { |helper| klass.send :include, helper }
 end

 class NokogiriFragmenter

   class << self

     # Splits an XML document into "pages" delimited by break elements.
     #
     # source  - an XML String, or an already parsed document. It is only
     #           searched and re-serialized here; every page is built from
     #           a fresh parse of that serialization.
     # pattern - name of the break element, e.g. 'pb'. It is handed to
     #           #search to locate the breaks and compared against node
     #           names to find where a page ends, so it should be a bare
     #           element name.
     # blk     - accepted for backward compatibility; not used.
     #
     # Returns an Array of documents: first the content that precedes the
     # first break, then one document per break holding that break and
     # everything up to (but excluding) the next node with the same name.
     # Ancestors of a break are never removed from its page.
     def fragment(source, pattern, &blk)
       source = Nokogiri::XML(source) if source.is_a?(String)

       # Serialize once; the source is never mutated, so every page can be
       # parsed from the same string.
       xml = source.to_xml

       pages = [leading_page(xml, pattern)]
       source.search(pattern).each do |match|
         pages << page_for(xml, match.path)
       end
       pages
     end

     private

     # Builds the page holding everything before the first node named
     # +pattern+: that node and every node after it (document order) is
     # removed from a fresh copy.
     def leading_page(xml, pattern)
       page = Nokogiri::XML(xml)
       page.flatten.drop_while { |e| e.name != pattern }.each { |e| e.remove }
       page
     end

     # Builds the page for the break located at +path+: removes what comes
     # before the break (its previous siblings and those of its ancestors)
     # and everything from the next same-named node onwards.
     def page_for(xml, path)
       page = Nokogiri::XML(xml)
       breaker = page.at(path)

       # If the path cannot be resolved nothing is removed.
       doomed = breaker ? breaker.previous_nodes_recursive : []

       reached = false    # have we passed the current break?
       past_next = false  # have we hit the following break?
       page.flatten.each do |e|
         next if e == page # the document itself is never a candidate

         if e == breaker
           reached = true
         elsif reached && e.name == breaker.name
           past_next = true
         end

         doomed << e if past_next
       end

       doomed.each { |e| e.remove }
       page
     end

   end

 end


 # Sample document for the demo run below: "pb" break elements (pb0..pb6)
 # placed at several nesting depths. The literal's whitespace is part of
 # the parsed XML (it becomes text nodes), so it is left exactly as is.
 data = %Q(
 <root>
  <p>START</p>
  <pb id="pb0"/>
  <p>Testing</p>
  <div>
    <pre>
      <span>should not be in the pb#pb0</span>
      <pb id="pb1"/>
      <span>should only be for pb1</span>
    	<div id="one">
    		<p>1 some text</p>
    		<p>2 some text</p>
    		<p>3 some text</p>
    		<p id="prePb2">4 some text</p>
    		<pb id="pb2"/>
    		<p>5 some text</p>
    		<p>6 some text</p>
    		<div id="one-one">
    			<p>1-1 some text</p>
    			<p>1-2 some text</p>
    			<p>1-3 some text</p>
    			<pb id="pb3"/>
    			<p>1-4 some text</p>
    			<p>1-5 some text</p>
    		</div>
    	</div>
    	<pb id="pb4"/>
    	<div id="two">
    		<p>1 some text</p>
    		<p>2 some text</p>
    		<p>3 some text</p>
    		<p>4 some text</p>
    		<pb id="pb5"/>
    		<p>5 some text</p>
    		<p>6 some text</p>
    		<div id="two-one">
    			<pb id="pb6"/>
    			<p>2-1 some text</p>
    			<p>2-2 some text</p>
    			<p>2-3 some text</p>
    			<p>2-4 some text</p>
    			<p>3-5 some text</p>
    		</div>
    		<p id="ending1">ENDING TEXT 1</p>
    	</div>
    </pre>
  	<p id="ending2">ENDING TEXT 2</p>
  </div>
 </root>
 )


 # Demo run: time the fragmenter on the sample document, then dump the
 # resulting pages.
 require 'benchmark'
 iterations = 1
 pages = ''
 Benchmark.bmbm do |bench|
   bench.report('fragmenter') do
     iterations.times { pages = NokogiriFragmenter.fragment(data, 'pb') }
   end
 end
 puts pages.inspect
	# This is for breaking up XML docs
	# into multiple chunks by a given node type.
	# For example, in TEI the "pb" tag exists to signify page breaks...
	# Using this code, you could break up the doc into multiple documents
	# based on the position/depth of the individual "pb" tags.
	#
	# Think of this as an xml splitter, like String#split
	#

	require 'rubygems'
	require 'nokogiri'

	module NokogiriElementHelpers

	  # Returns a flat Array containing this node followed by all of its
	  # descendants in document (depth-first, pre-order) order.
	  #
	  # Children that do not respond to #flatten (node classes that were never
	  # given this mixin) are listed as leaves instead of raising NoMethodError;
	  # their own children are not expanded.
	  def flatten
	    children.inject([self]) do |nodes, child|
	      nodes.concat(child.respond_to?(:flatten) ? child.flatten : [child])
	    end
	  end

	end

	module NokogiriNodeHelpers

	  # Returns every sibling that precedes this node, earliest first.
	  #
	  # The previous_sibling chain is walked iteratively, so the siblings
	  # themselves only need to respond to #previous_sibling.
	  def previous_siblings
	    siblings = []
	    current = previous_sibling
	    while current
	      siblings << current
	      current = current.previous_sibling
	    end
	    siblings.reverse
	  end

	  # Returns the previous siblings of this node together with the previous
	  # siblings of each of its ancestors, outermost ancestor first. Each node
	  # appears exactly once.
	  #
	  # A node whose parent is the document contributes only its immediate
	  # previous sibling (if there is one).
	  def previous_nodes_recursive
	    return [previous_sibling].compact if parent.is_a?(Nokogiri::XML::Document)

	    # The parent's result already covers everything above this level, so
	    # only this node's own preceding siblings need to be appended.
	    parent.previous_nodes_recursive + previous_siblings
	  end

	end

	# Mix the traversal helpers into the Nokogiri node classes.
	# The document only gets the descendant walker; elements, comments and
	# text nodes additionally get the previous-sibling helpers.
	[
	  [Nokogiri::XML::Document, [NokogiriElementHelpers]],
	  [Nokogiri::XML::Element,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
	  [Nokogiri::XML::Comment,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
	  [Nokogiri::XML::Text,     [NokogiriElementHelpers, NokogiriNodeHelpers]]
	].each do |klass, helpers|
	  helpers.each { |helper| klass.send :include, helper }
	end

	class NokogiriFragmenter

	  class << self

	    # Splits an XML document into "pages" delimited by break elements.
	    #
	    # source  - an XML String, or an already parsed document. It is only
	    #           searched and re-serialized here; every page is built from
	    #           a fresh parse of that serialization.
	    # pattern - name of the break element, e.g. 'pb'. It is handed to
	    #           #search to locate the breaks and compared against node
	    #           names to find where a page ends, so it should be a bare
	    #           element name.
	    # blk     - accepted for backward compatibility; not used.
	    #
	    # Returns an Array of documents: first the content that precedes the
	    # first break, then one document per break holding that break and
	    # everything up to (but excluding) the next node with the same name.
	    # Ancestors of a break are never removed from its page.
	    def fragment(source, pattern, &blk)
	      source = Nokogiri::XML(source) if source.is_a?(String)

	      # Serialize once; the source is never mutated, so every page can be
	      # parsed from the same string.
	      xml = source.to_xml

	      pages = [leading_page(xml, pattern)]
	      source.search(pattern).each do |match|
	        pages << page_for(xml, match.path)
	      end
	      pages
	    end

	    private

	    # Builds the page holding everything before the first node named
	    # +pattern+: that node and every node after it (document order) is
	    # removed from a fresh copy.
	    def leading_page(xml, pattern)
	      page = Nokogiri::XML(xml)
	      page.flatten.drop_while { |e| e.name != pattern }.each { |e| e.remove }
	      page
	    end

	    # Builds the page for the break located at +path+: removes what comes
	    # before the break (its previous siblings and those of its ancestors)
	    # and everything from the next same-named node onwards.
	    def page_for(xml, path)
	      page = Nokogiri::XML(xml)
	      breaker = page.at(path)

	      # If the path cannot be resolved nothing is removed.
	      doomed = breaker ? breaker.previous_nodes_recursive : []

	      reached = false    # have we passed the current break?
	      past_next = false  # have we hit the following break?
	      page.flatten.each do |e|
	        next if e == page # the document itself is never a candidate

	        if e == breaker
	          reached = true
	        elsif reached && e.name == breaker.name
	          past_next = true
	        end

	        doomed << e if past_next
	      end

	      doomed.each { |e| e.remove }
	      page
	    end

	  end

	end


	# Sample document for the demo run below: "pb" break elements (pb0..pb6)
	# placed at several nesting depths. The literal's whitespace is part of
	# the parsed XML (it becomes text nodes), so it is left exactly as is.
	data = %Q(
	<root>
	<p>START</p>
	<pb id="pb0"/>
	<p>Testing</p>
	<div>
	<pre>
	<span>should not be in the pb#pb0</span>
	<pb id="pb1"/>
	<span>should only be for pb1</span>
	<div id="one">
	<p>1 some text</p>
	<p>2 some text</p>
	<p>3 some text</p>
	<p id="prePb2">4 some text</p>
	<pb id="pb2"/>
	<p>5 some text</p>
	<p>6 some text</p>
	<div id="one-one">
	<p>1-1 some text</p>
	<p>1-2 some text</p>
	<p>1-3 some text</p>
	<pb id="pb3"/>
	<p>1-4 some text</p>
	<p>1-5 some text</p>
	</div>
	</div>
	<pb id="pb4"/>
	<div id="two">
	<p>1 some text</p>
	<p>2 some text</p>
	<p>3 some text</p>
	<p>4 some text</p>
	<pb id="pb5"/>
	<p>5 some text</p>
	<p>6 some text</p>
	<div id="two-one">
	<pb id="pb6"/>
	<p>2-1 some text</p>
	<p>2-2 some text</p>
	<p>2-3 some text</p>
	<p>2-4 some text</p>
	<p>3-5 some text</p>
	</div>
	<p id="ending1">ENDING TEXT 1</p>
	</div>
	</pre>
	<p id="ending2">ENDING TEXT 2</p>
	</div>
	</root>
	)


	# Demo run: time the fragmenter on the sample document, then dump the
	# resulting pages.
	require 'benchmark'
	upto = 1
	c1 = ''
	Benchmark.bmbm do |x|
	  x.report('fragmenter') { upto.times { c1 = NokogiriFragmenter.fragment(data, 'pb') } }
	end
	puts c1.inspect