Created
June 2, 2009 20:21
-
-
Save mwmitchell/122552 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is for breaking up XML docs
# into multiple chunks by a given node type.
# For example, in TEI the "pb" tag exists to signify page breaks...
# Using this code, you could break up the doc into multiple documents
# based on the position/depth of the individual "pb" tags.
#
# Think of this as an XML splitter, like String#split
#
# Third-party dependency: Nokogiri provides the XML parser and node classes
# used throughout this file. RubyGems is loaded explicitly for old Rubies
# where it is not loaded by default.
require 'rubygems'
require 'nokogiri'
# Mixin that adds a depth-first "flatten" traversal to any object exposing
# +children+ (an enumerable of child nodes).
module NokogiriElementHelpers
  # Returns a flat list starting with this node, followed by all of its
  # descendants in document (depth-first, pre-order) order.
  #
  # A child that does not respond to +flatten+ (i.e. a node type this helper
  # was not mixed into) is listed as-is and not descended into, instead of
  # aborting the whole traversal with NoMethodError.
  #
  # @return [Array] self followed by every reachable descendant
  def flatten
    descendants = children.flat_map do |child|
      child.respond_to?(:flatten) ? child.flatten : [child]
    end
    [self] + descendants
  end
end
# Mixin with helpers for collecting the nodes that come before a node.
# The including object must expose +parent+ and +previous_sibling+.
module NokogiriNodeHelpers
  # Collects the siblings preceding +node+, in document order (farthest
  # first, nearest last). Only +previous_sibling+ is called on each node, so
  # siblings do not need to have this module mixed in.
  #
  # @param node [#previous_sibling]
  # @return [Array] preceding siblings; empty when there are none
  def self.siblings_before(node)
    nearest_first = []
    cursor = node.previous_sibling
    while cursor
      nearest_first << cursor
      cursor = cursor.previous_sibling
    end
    nearest_first.reverse
  end

  # Returns all previous siblings of this node, in document order.
  #
  # @return [Array]
  def previous_siblings
    NokogiriNodeHelpers.siblings_before(self)
  end

  # Returns all previous siblings of this node plus the previous siblings of
  # each of its ancestors. Outermost ancestor level comes first, this node's
  # own siblings last, and every node is listed exactly once.
  #
  # At the level directly below the document only the immediately preceding
  # sibling is included (never the whole prolog), and the walk stops there.
  # A node with no parent yields an empty list.
  #
  # @return [Array]
  def previous_nodes_recursive
    levels = []
    node = self
    # Walk upwards one ancestor at a time instead of recursing through every
    # sibling, which re-collected the ancestor chain once per sibling.
    while (container = node.parent)
      if container.is_a?(Nokogiri::XML::Document)
        # could be nil, hence compact
        levels << [node.previous_sibling].compact
        break
      end
      levels << NokogiriNodeHelpers.siblings_before(node)
      node = container
    end
    levels.reverse.inject([]) { |all, level| all.concat(level) }
  end
end
# Wire the helper modules into the Nokogiri node classes used by the
# fragmenter. The document only needs the traversal helper; elements,
# comments and text nodes get both helpers.
[
  [Nokogiri::XML::Document, [NokogiriElementHelpers]],
  [Nokogiri::XML::Element,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
  [Nokogiri::XML::Comment,  [NokogiriElementHelpers, NokogiriNodeHelpers]],
  [Nokogiri::XML::Text,     [NokogiriElementHelpers, NokogiriNodeHelpers]]
].each do |node_class, helpers|
  helpers.each { |helper| node_class.send :include, helper }
end
# Splits an XML document into several documents ("pages") at every
# occurrence of a marker element, similar in spirit to String#split.
class NokogiriFragmenter
  class << self
    # Breaks +source+ apart at every element named +pattern+.
    #
    # The first page is a copy of the document with the first marker and
    # every node after it (in document order) removed. Each further page is
    # a copy in which the nodes preceding one marker (its previous siblings
    # and those of its ancestors) and everything from the next same-named
    # element onwards have been removed; ancestors of the marker are kept.
    #
    # @param source [String, Nokogiri::XML::Document] XML text (parsed with
    #   Nokogiri::XML) or an already parsed document; it is not modified
    # @param pattern [String] marker element name, e.g. "pb". It is handed
    #   to +search+ and also compared against node names.
    # @param blk [Proc] accepted for backward compatibility; not used
    # @return [Array<Nokogiri::XML::Document>] leading page, then one page
    #   per marker in the order +search+ returns them
    def fragment(source, pattern, &blk)
      source = Nokogiri::XML(source) if source.is_a?(String)
      # The source is never mutated, so serialise it once and re-parse that
      # text for every page instead of calling to_xml per marker.
      xml = source.to_xml

      pages = [leading_page(xml, pattern)]
      source.search(pattern).each do |marker|
        pages << page_for(xml, marker.path)
      end
      pages
    end

    private

    # Builds the page holding everything before the first node whose name
    # equals +pattern+.
    def leading_page(xml, pattern)
      page = Nokogiri::XML(xml)
      # Collect first, remove afterwards: the node list is materialised by
      # flatten, so removal does not disturb the traversal.
      doomed = page.flatten.drop_while { |candidate| candidate.name != pattern }
      doomed.each { |candidate| candidate.remove }
      page
    end

    # Builds the page that starts at the marker found at +marker_path+ and
    # ends right before the next element with the same name.
    def page_for(xml, marker_path)
      page = Nokogiri::XML(xml)
      marker = page.at(marker_path)

      # skip the document node itself
      nodes = page.flatten.reject { |candidate| candidate == page }
      position = marker && nodes.index { |candidate| candidate == marker }
      # Marker not locatable in the copy: nothing is trimmed.
      return page unless position

      # everything before the marker, at its own level and every level above
      doomed = marker.previous_nodes_recursive

      # everything from the next marker onwards
      later = nodes[(position + 1)..-1]
      cutoff = later.index { |candidate| candidate.name == marker.name }
      doomed += later[cutoff..-1] if cutoff

      doomed.each { |candidate| candidate.remove }
      page
    end
  end
end
# Sample document: nested markup with "pb" page-break markers at several
# depths, used to exercise the fragmenter below.
data = %Q(
<root>
<p>START</p>
<pb id="pb0"/>
<p>Testing</p>
<div>
<pre>
<span>should not be in the pb#pb0</span>
<pb id="pb1"/>
<span>should only be for pb1</span>
<div id="one">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p id="prePb2">4 some text</p>
<pb id="pb2"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="one-one">
<p>1-1 some text</p>
<p>1-2 some text</p>
<p>1-3 some text</p>
<pb id="pb3"/>
<p>1-4 some text</p>
<p>1-5 some text</p>
</div>
</div>
<pb id="pb4"/>
<div id="two">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p>4 some text</p>
<pb id="pb5"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="two-one">
<pb id="pb6"/>
<p>2-1 some text</p>
<p>2-2 some text</p>
<p>2-3 some text</p>
<p>2-4 some text</p>
<p>3-5 some text</p>
</div>
<p id="ending1">ENDING TEXT 1</p>
</div>
</pre>
<p id="ending2">ENDING TEXT 2</p>
</div>
</root>
)

require 'benchmark'

# Time the fragmenter over the sample document, then dump the pages from
# the last run.
iterations = 1
pages = ''
Benchmark.bmbm do |bench|
  bench.report('fragmenter') do
    iterations.times { pages = NokogiriFragmenter.fragment(data, 'pb') }
  end
end
puts pages.inspect
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment