Skip to content

Instantly share code, notes, and snippets.

@mwmitchell
Created June 2, 2009 21:01
Show Gist options
  • Save mwmitchell/122578 to your computer and use it in GitHub Desktop.
Save mwmitchell/122578 to your computer and use it in GitHub Desktop.
# This is for breaking up XML docs
# into multiple chunks by a given node type.
# For example, in TEI the "pb" tag exists to signify page breaks...
# Using this code, you could break up the doc into multiple documents
# based on the position/depth of the individual "pb" tags.
#
# Think of this as an xml splitter, like String#split
#
require 'rubygems'
require 'nokogiri'
# Traversal helper mixed into Nokogiri documents and nodes.
module NokogiriElementHelpers
  # Returns a flat array holding this node followed by all of its
  # descendants in pre-order (a node always comes before its children,
  # and children keep their left-to-right order).
  #
  # The list is rebuilt on every call, so it reflects the tree as it is
  # right now, even after nodes have been added or removed. The walk only
  # needs each visited node to respond to +children+; descendants whose
  # class does not include this module are still listed instead of raising.
  def flatten
    ordered = []
    pending = [self]
    until pending.empty?
      current = pending.pop
      ordered << current
      # Push children right-to-left so the leftmost child is popped next.
      pending.concat(current.children.to_a.reverse)
    end
    ordered
  end
end
# Sibling/ancestor lookups mixed into Nokogiri nodes.
module NokogiriNodeHelpers
  # Returns every sibling that precedes this node, furthest first and the
  # immediate previous sibling last.
  #
  # The +previous_sibling+ chain is walked iteratively (no recursion per
  # sibling) and recomputed on every call, so the result stays correct
  # after the tree has been modified.
  def previous_siblings
    collected = []
    cursor = previous_sibling
    while cursor
      collected << cursor
      cursor = cursor.previous_sibling
    end
    collected.reverse
  end

  # Returns the nodes that precede this node at its own level and at every
  # ancestor level: the ancestors' preceding nodes come first (outermost
  # level first), followed by this node's own previous siblings. Each node
  # appears exactly once.
  #
  # Special cases:
  # * a node with no parent (detached) yields an empty array;
  # * a node whose parent is the document yields only its immediate
  #   previous sibling (or an empty array when there is none).
  def previous_nodes_recursive
    return [] if parent.nil?
    # Directly under the document: only the nearest previous sibling is
    # reported; compact drops the nil when there is no such sibling.
    return [previous_sibling].compact if parent.is_a?(Nokogiri::XML::Document)

    parent.previous_nodes_recursive + previous_siblings
  end
end
# Wire the helper modules into Nokogiri's classes.
# The document only needs the downward traversal helper.
Nokogiri::XML::Document.send :include, NokogiriElementHelpers

# Elements, comments and text nodes get both the traversal helper and the
# sibling/ancestor lookups.
[
  Nokogiri::XML::Element,
  Nokogiri::XML::Comment,
  Nokogiri::XML::Text
].each do |node_class|
  node_class.send :include, NokogiriElementHelpers
  node_class.send :include, NokogiriNodeHelpers
end
# Splits an XML document into several documents ("pages"), using the nodes
# matched by a pattern as the break points.
class NokogiriFragmenter
  class << self
    # Breaks +source+ apart at every node matching +pattern+.
    #
    # source  - an XML String (parsed with Nokogiri::XML) or an already
    #           parsed document.
    # pattern - used both as the search expression for the break nodes and
    #           as the node name that marks the end of the leading page, so
    #           it is expected to be a plain element name such as 'pb'.
    # blk     - accepted for interface compatibility; currently unused.
    #
    # Returns an Array of documents. The first entry is a copy of the
    # document with the first break node and everything after it removed.
    # Each following entry corresponds to one matched node, in the order
    # the search returns them. +source+ itself is never modified; every
    # page is built from a fresh copy.
    def fragment(source, pattern, &blk)
      source = Nokogiri::XML(source) if source.is_a?(String)
      pages = [leading_page(source, pattern)]
      source.search(pattern).each do |break_node|
        pages << page_for(source, break_node)
      end
      pages
    end

    private

    # Copy of +source+ holding only what comes before the first node whose
    # name equals +pattern+ (that node and all later nodes are removed).
    def leading_page(source, pattern)
      copy = source.dup
      reached = nil
      # Collect the full removal list first; nodes are only unlinked once
      # the traversal is complete.
      doomed = copy.flatten.select { |e| reached ||= (e.name == pattern) }
      doomed.each { |e| e.remove }
      copy
    end

    # Copy of +source+ trimmed down to the page that starts at +break_node+:
    # the nodes reported by previous_nodes_recursive for the break node are
    # removed, and so is the next node with the same name together with
    # everything that follows it in traversal order.
    def page_for(source, break_node)
      copy = source.dup
      # Locate the counterpart of the break node inside the copy.
      start = copy.at(break_node.path)
      started = false
      past_next_break = false
      doomed = []
      copy.flatten.each do |e|
        # Skip the document object itself.
        next if e == copy

        if e == start
          started = true
          doomed.concat(e.previous_nodes_recursive)
        elsif past_next_break || (started && e.name == start.name)
          # From the next break node onwards everything is dropped.
          past_next_break = true
          doomed << e
        end
      end
      doomed.each { |e| e.remove }
      copy
    end
  end
end
# Sample document fed to the fragmenter by the benchmark at the end of the
# file. It contains seven "pb" markers (pb0 through pb6) placed at different
# nesting depths inside <div>/<pre> containers, with text before the first
# marker ("START") and after the last one ("ENDING TEXT 1/2").
data = %Q(
<root>
<p>START</p>
<pb id="pb0"/>
<p>Testing</p>
<div>
<pre>
<span>should not be in the pb#pb0</span>
<pb id="pb1"/>
<span>should only be for pb1</span>
<div id="one">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p id="prePb2">4 some text</p>
<pb id="pb2"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="one-one">
<p>1-1 some text</p>
<p>1-2 some text</p>
<p>1-3 some text</p>
<pb id="pb3"/>
<p>1-4 some text</p>
<p>1-5 some text</p>
</div>
</div>
<pb id="pb4"/>
<div id="two">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p>4 some text</p>
<pb id="pb5"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="two-one">
<pb id="pb6"/>
<p>2-1 some text</p>
<p>2-2 some text</p>
<p>2-3 some text</p>
<p>2-4 some text</p>
<p>3-5 some text</p>
</div>
<p id="ending1">ENDING TEXT 1</p>
</div>
</pre>
<p id="ending2">ENDING TEXT 2</p>
</div>
</root>
)
require 'benchmark'

# Times the fragmenter on the sample document, then prints the pages
# produced by the last run.
iterations = 1 # how many times the fragmenter runs inside the measured block
c1 = ''
Benchmark.bmbm do |bench|
  bench.report('fragmenter') do
    iterations.times do
      c1 = NokogiriFragmenter.fragment(data, 'pb')
    end
  end
end
puts c1.inspect
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment