mwmitchell · April 27, 2009 21:21
diff --git a/gistfile1.rb b/gistfile1.rb
 # This is for breaking up XML docs
 # into multiple chunks by a given node type.
 # For example, in TEI the "pb" tag exists to signify page breaks...
 # Using this code, you could break up the doc into multiple documents
 # based on the position/depth of the individual "pb" tags.

 require 'rubygems'
 require 'nokogiri'

 # Walks a node tree in document order and yields every node that comes
 # after +start+: first all of its descendants, then each following sibling
 # together with that sibling's descendants. +start+ itself is not yielded.
 #
 # The walk is iterative, so its depth is not limited by the Ruby call stack.
 #
 # start - node to begin from; it and every visited node must respond to
 #         +children+ (a collection answering +first+) and +next+.
 # e     - array that receives +start+ followed by every yielded node, in
 #         visiting order.
 # b     - block invoked with each visited node.
 #
 # Returns nil.
 def next_node(start, e=[], &b)
   e << start
   # Nodes we descended from; their following sibling is examined once the
   # subtree below them has been exhausted.
   open_nodes = []
   node = start
   loop do
     if (child = node.children.first)
       open_nodes.push(node)
       node = child
     else
       # Leaf: move to the nearest following sibling, climbing back through
       # the nodes we descended from. The climb never goes above +start+.
       until (following = node.next)
         return if open_nodes.empty?
         node = open_nodes.pop
       end
       node = following
     end
     yield node
     e << node
   end
 end

 # Mixin for tree nodes that expose +parent+ and +previous_sibling+.
 module XMLMethods

   # Collects the nodes that precede this node at every level of its
   # ancestry: whatever the parent reports from its own +previous_nodes_r+
   # (outermost level first), followed by this node's previous siblings in
   # document order.
   #
   # A parent that does not respond to +previous_nodes_r+ (for example one
   # that was never extended with this module, or a nil parent) contributes
   # nothing. Errors raised while collecting are not swallowed.
   #
   # Returns an Array of nodes without nils or duplicates.
   def previous_nodes_r
     up = parent
     inherited = up.respond_to?(:previous_nodes_r) ? up.previous_nodes_r : []

     # Walk the sibling chain directly instead of recursing through every
     # sibling, which would recompute the parent's list once per sibling.
     nearest_first = []
     sibling = previous_sibling
     while sibling
       nearest_first << sibling
       sibling = sibling.previous_sibling
     end

     (inherited + nearest_first.reverse).compact.uniq
   end

 end

 # Builds one document per node of +source_doc+ matched by +pattern+.
 #
 # For every match the source is serialized and parsed again, the matching
 # node is looked up in that copy by its path, and the copy is trimmed:
 # the nodes reported by +previous_nodes_r+ for the target are removed, and
 # so is every node visited from the next node sharing the target's name
 # onwards (unless it is an ancestor of the target).
 #
 # source_doc - document to split; must respond to +search+ and +to_xml+.
 # pattern    - selector handed to +search+.
 #
 # Returns an Array with one trimmed document per match.
 def fragment_doc(source_doc, pattern)
   source_doc.search(pattern).map do |test_pb|
     page = Nokogiri::XML(source_doc.to_xml)
     pb = page.at(test_pb.path)
     doomed = []
     past_pb = false
     trimming = false

     next_node(page) do |e|
       e.extend(XMLMethods)
       if e == pb
         doomed.concat(e.previous_nodes_r)
         past_pb = true
       end
       # The next node named like the target marks the end of this page.
       trimming = true if past_pb && e != pb && e.name == pb.name
       doomed << e if trimming && !pb.ancestors.include?(e)
     end

     # Removal is deferred so the tree stays intact while it is walked.
     doomed.each(&:remove)
     page
   end
 end

 # Sample input: nested div/pre/p markup containing six "pb" elements
 # (pb1..pb6) placed at different depths.
 data = %Q(
 <div>
  <pre>
  	<pb id="pb1"/>
  	<div id="one">
  		<p>1 some text</p>
  		<p>2 some text</p>
  		<p>3 some text</p>
  		<p id="prePb2">4 some text</p>
  		<pb id="pb2"/>
  		<p>5 some text</p>
  		<p>6 some text</p>
  		<div id="one-one">
  			<p>1-1 some text</p>
  			<p>1-2 some text</p>
  			<p>1-3 some text</p>
  			<pb id="pb3"/>
  			<p>1-4 some text</p>
  			<p>1-5 some text</p>
  		</div>
  	</div>
  	<pb id="pb4"/>
  	<div id="two">
  		<p>1 some text</p>
  		<p>2 some text</p>
  		<p>3 some text</p>
  		<p>4 some text</p>
  		<pb id="pb5"/>
  		<p>5 some text</p>
  		<p>6 some text</p>
  		<div id="two-one">
  			<pb id="pb6"/>
  			<p>2-1 some text</p>
  			<p>2-2 some text</p>
  			<p>2-3 some text</p>
  			<p>2-4 some text</p>
  			<p>3-5 some text</p>
  		</div>
  		<p id="ending1">ENDING TEXT 1</p>
  	</div>
  </pre>
 	<p id="ending2">ENDING TEXT 2</p>
 </div>
 )

 # Parse the sample, split it at every "pb" element and print the
 # inspected list of resulting documents.
 xml = Nokogiri::XML(data)
 pages = fragment_doc(xml, 'pb')
 puts pages.inspect
	# This is for breaking up XML docs
	# into multiple chunks by a given node type.
	# For example, in TEI the "pb" tag exists to signify page breaks...
	# Using this code, you could break up the doc into multiple documents
	# based on the position/depth of the individual "pb" tags.

	require 'rubygems'
	require 'nokogiri'

	# Walks a node tree in document order and yields every node that comes
	# after +start+: first all of its descendants, then each following sibling
	# together with that sibling's descendants. +start+ itself is not yielded.
	#
	# The walk is iterative, so its depth is not limited by the Ruby call stack.
	#
	# start - node to begin from; it and every visited node must respond to
	#         +children+ (a collection answering +first+) and +next+.
	# e     - array that receives +start+ followed by every yielded node, in
	#         visiting order.
	# b     - block invoked with each visited node.
	#
	# Returns nil.
	def next_node(start, e=[], &b)
	  e << start
	  # Nodes we descended from; their following sibling is examined once the
	  # subtree below them has been exhausted.
	  open_nodes = []
	  node = start
	  loop do
	    if (child = node.children.first)
	      open_nodes.push(node)
	      node = child
	    else
	      # Leaf: move to the nearest following sibling, climbing back through
	      # the nodes we descended from. The climb never goes above +start+.
	      until (following = node.next)
	        return if open_nodes.empty?
	        node = open_nodes.pop
	      end
	      node = following
	    end
	    yield node
	    e << node
	  end
	end

	# Mixin for tree nodes that expose +parent+ and +previous_sibling+.
	module XMLMethods

	  # Collects the nodes that precede this node at every level of its
	  # ancestry: whatever the parent reports from its own +previous_nodes_r+
	  # (outermost level first), followed by this node's previous siblings in
	  # document order.
	  #
	  # A parent that does not respond to +previous_nodes_r+ (for example one
	  # that was never extended with this module, or a nil parent) contributes
	  # nothing. Errors raised while collecting are not swallowed.
	  #
	  # Returns an Array of nodes without nils or duplicates.
	  def previous_nodes_r
	    up = parent
	    inherited = up.respond_to?(:previous_nodes_r) ? up.previous_nodes_r : []

	    # Walk the sibling chain directly instead of recursing through every
	    # sibling, which would recompute the parent's list once per sibling.
	    nearest_first = []
	    sibling = previous_sibling
	    while sibling
	      nearest_first << sibling
	      sibling = sibling.previous_sibling
	    end

	    (inherited + nearest_first.reverse).compact.uniq
	  end

	end

	# Builds one document per node of +source_doc+ matched by +pattern+.
	#
	# For every match the source is serialized and parsed again, the matching
	# node is looked up in that copy by its path, and the copy is trimmed:
	# the nodes reported by +previous_nodes_r+ for the target are removed, and
	# so is every node visited from the next node sharing the target's name
	# onwards (unless it is an ancestor of the target).
	#
	# source_doc - document to split; must respond to +search+ and +to_xml+.
	# pattern    - selector handed to +search+.
	#
	# Returns an Array with one trimmed document per match.
	def fragment_doc(source_doc, pattern)
	  source_doc.search(pattern).map do |test_pb|
	    page = Nokogiri::XML(source_doc.to_xml)
	    pb = page.at(test_pb.path)
	    doomed = []
	    past_pb = false
	    trimming = false

	    next_node(page) do |e|
	      e.extend(XMLMethods)
	      if e == pb
	        doomed.concat(e.previous_nodes_r)
	        past_pb = true
	      end
	      # The next node named like the target marks the end of this page.
	      trimming = true if past_pb && e != pb && e.name == pb.name
	      doomed << e if trimming && !pb.ancestors.include?(e)
	    end

	    # Removal is deferred so the tree stays intact while it is walked.
	    doomed.each(&:remove)
	    page
	  end
	end

	# Sample input: nested div/pre/p markup containing six "pb" elements
	# (pb1..pb6).
	data = %Q(
	<div>
	<pre>
	<pb id="pb1"/>
	<div id="one">
	<p>1 some text</p>
	<p>2 some text</p>
	<p>3 some text</p>
	<p id="prePb2">4 some text</p>
	<pb id="pb2"/>
	<p>5 some text</p>
	<p>6 some text</p>
	<div id="one-one">
	<p>1-1 some text</p>
	<p>1-2 some text</p>
	<p>1-3 some text</p>
	<pb id="pb3"/>
	<p>1-4 some text</p>
	<p>1-5 some text</p>
	</div>
	</div>
	<pb id="pb4"/>
	<div id="two">
	<p>1 some text</p>
	<p>2 some text</p>
	<p>3 some text</p>
	<p>4 some text</p>
	<pb id="pb5"/>
	<p>5 some text</p>
	<p>6 some text</p>
	<div id="two-one">
	<pb id="pb6"/>
	<p>2-1 some text</p>
	<p>2-2 some text</p>
	<p>2-3 some text</p>
	<p>2-4 some text</p>
	<p>3-5 some text</p>
	</div>
	<p id="ending1">ENDING TEXT 1</p>
	</div>
	</pre>
	<p id="ending2">ENDING TEXT 2</p>
	</div>
	)

	# Parse the sample, split it at every "pb" element and print the
	# inspected list of resulting documents.
	xml = Nokogiri::XML(data)
	pages = fragment_doc(xml, 'pb')
	puts pages.inspect