Skip to content

Instantly share code, notes, and snippets.

@mwmitchell
Created April 27, 2009 21:21
Show Gist options
  • Save mwmitchell/102747 to your computer and use it in GitHub Desktop.
Save mwmitchell/102747 to your computer and use it in GitHub Desktop.
# This is for breaking up XML docs
# into multiple chunks by a given node type.
# For example, in TEI the "pb" tag exists to signify page breaks...
# Using this code, you could break up the doc into multiple documents
# based on the position/depth of the individual "pb" tags.
require 'rubygems'
require 'nokogiri'
# Walks the tree below +start+ in document order (depth-first: a node's
# children are visited before its following siblings), yielding every node
# that is reached.
#
# +start+ itself is never yielded, but its following siblings and their
# descendants are, because the walk continues through +start.next+.
#
# Nodes only need to respond to +children+ (a collection answering +size+
# and +first+) and +next+ (the following sibling, or nil).
#
# The walk is iterative. The previous recursive form used one stack frame per
# node (siblings included), so a long run of siblings could exhaust the call
# stack. It also carried a branch guarded by "not already in +e+" that could
# never run, because the current node had just been appended to +e+.
#
# start - node to begin from.
# e     - array that receives every visited node (+start+ first) in visit
#         order; callers may pass their own array to collect them.
# b     - block called with each node reached after +start+.
#
# Returns nil.
def next_node(start, e = [], &b)
  resume = [] # ancestors whose following sibling still has to be visited
  node = start
  while node
    e << node
    children = node.children
    if children.size > 0
      resume.push(node)
      node = children.first
    else
      node = node.next
      # Subtree finished: climb until some ancestor has a following sibling.
      node = resume.pop.next until node || resume.empty?
    end
    yield node if node
  end
  nil
end
# Mixin for tree nodes, i.e. objects that respond to +parent+ and
# +previous_sibling+ (it is extended onto parsed XML nodes in this file).
module XMLMethods
  # Collects every node that comes before the receiver without containing it:
  # the previous siblings of each ancestor (outermost ancestor first),
  # followed by the receiver's own previous siblings. Each group is in
  # document order.
  #
  # The tree is walked directly through +parent+ and +previous_sibling+, so
  # the result does not depend on whether neighbouring nodes were extended
  # with this module, and no errors are silently rescued along the way.
  #
  # Returns an Array of nodes; empty when nothing precedes the receiver.
  def previous_nodes_r
    lineage = []
    node = self
    while node
      lineage.unshift(node)
      # A top-level object may not expose +parent+ at all; treat it as the root.
      node = node.respond_to?(:parent) ? node.parent : nil
    end

    lineage.flat_map do |member|
      earlier = []
      sibling = member
      while sibling.respond_to?(:previous_sibling) && (sibling = sibling.previous_sibling)
        earlier.unshift(sibling)
      end
      earlier
    end
  end
end
# Splits +source_doc+ into one document per node matched by +pattern+.
#
# For every match, the source is re-serialised and re-parsed into a fresh
# copy, the matching node is located in that copy by its path, and the copy
# is trimmed so that it keeps only the content from that node up to (but not
# including) the next node with the same name:
#
# * everything returned by +previous_nodes_r+ for the target is removed, and
# * every node visited from the next same-named node onwards is removed,
#   unless it is an ancestor of the target.
#
# source_doc - parsed document to split (must respond to +search+ and +to_xml+).
# pattern    - selector handed to +source_doc.search+, e.g. 'pb'.
#
# Returns an Array of trimmed documents, in the order the matches were found.
def fragment_doc(source_doc, pattern)
  source_doc.search(pattern).map do |test_pb|
    doc = Nokogiri::XML(source_doc.to_xml)
    pb = doc.at(test_pb.path)
    deletions = []
    # Set once the target has been reached. The ancestor list is computed a
    # single time here because the tree is not modified until after the walk.
    pb_ancestors = nil
    deleting = false

    next_node(doc) do |e|
      e.extend XMLMethods
      if e == pb
        deletions.concat(e.previous_nodes_r)
        pb_ancestors = pb.ancestors
      elsif pb_ancestors && !deleting && e.name == pb.name
        # First same-named node after the target: the page ends here.
        deleting = true
      end
      deletions << e if deleting && !pb_ancestors.include?(e)
    end

    # Removal is deferred so the traversal above sees an unmodified tree.
    deletions.each(&:remove)
    doc
  end
end
# Demo fixture: nested <div>/<p> content with six <pb/> markers placed at
# different depths, plus trailing paragraphs inside and outside <pre>.
sample_xml = %Q(
<div>
<pre>
<pb id="pb1"/>
<div id="one">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p id="prePb2">4 some text</p>
<pb id="pb2"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="one-one">
<p>1-1 some text</p>
<p>1-2 some text</p>
<p>1-3 some text</p>
<pb id="pb3"/>
<p>1-4 some text</p>
<p>1-5 some text</p>
</div>
</div>
<pb id="pb4"/>
<div id="two">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p>4 some text</p>
<pb id="pb5"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="two-one">
<pb id="pb6"/>
<p>2-1 some text</p>
<p>2-2 some text</p>
<p>2-3 some text</p>
<p>2-4 some text</p>
<p>3-5 some text</p>
</div>
<p id="ending1">ENDING TEXT 1</p>
</div>
</pre>
<p id="ending2">ENDING TEXT 2</p>
</div>
)
# Parse the fixture, cut one document per <pb/> marker, and dump the result.
source = Nokogiri::XML(sample_xml)
fragments = fragment_doc(source, 'pb')
puts fragments.inspect
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment