Created
June 2, 2009 21:01
-
-
Save mwmitchell/122578 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Breaks an XML document up into multiple chunks, using a given node type
# as the split marker.
# For example, TEI uses the "pb" tag to signify page breaks.
# Using this code, you can break the doc up into multiple documents
# based on the position/depth of the individual "pb" tags.
#
# Think of this as an XML splitter, analogous to String#split.
#
require 'benchmark'

require 'rubygems'
require 'nokogiri'
# Adds a depth-first traversal helper to any object that exposes +children+
# (in this file: Nokogiri documents and nodes).
module NokogiriElementHelpers
  # Returns this node followed by all of its descendants, in pre-order
  # (parent before children, children in their listed order).
  #
  # The list is rebuilt on every call instead of being memoized in an
  # instance variable: the tree is mutable (callers remove nodes from it),
  # so a cached list would keep reporting nodes that are no longer there.
  #
  # Every child must itself respond to +flatten+.
  #
  # @return [Array] self plus every descendant
  def flatten
    [self] + children.flat_map { |child| child.flatten }
  end
end
# Sibling/ancestor navigation helpers for node-like objects that expose
# +parent+ and +previous_sibling+ (in this file: Nokogiri nodes).
#
# Results are computed fresh on every call. The tree is mutable, so caching
# them in instance variables would return nodes that have since been removed
# or re-parented.
module NokogiriNodeHelpers
  # Collects every sibling that precedes +node+, earliest first.
  #
  # Written as a loop over +previous_sibling+ (not recursion) so that long
  # sibling runs cannot exhaust the stack, and as a module-level function so
  # that it works for siblings that do not include this module.
  #
  # @param node [#previous_sibling]
  # @return [Array] preceding siblings in document order
  def self.siblings_before(node)
    found = []
    cursor = node.previous_sibling
    while cursor
      found << cursor
      cursor = cursor.previous_sibling
    end
    found.reverse
  end

  # Returns all previous siblings of this node, earliest first.
  #
  # @return [Array]
  def previous_siblings
    NokogiriNodeHelpers.siblings_before(self)
  end

  # Returns all previous siblings of this node plus the previous siblings of
  # each ancestor, outermost ancestor first, without duplicates.
  #
  # For the node whose parent is the document itself, only its immediate
  # previous sibling (if any) is included, not the whole preceding run.
  # A node with no parent contributes nothing further.
  #
  # @return [Array]
  def previous_nodes_recursive
    groups = []
    cursor = self
    while (owner = cursor.parent)
      if owner.is_a?(Nokogiri::XML::Document)
        # Document level: only the nearest preceding node (could be nil).
        groups.unshift([cursor.previous_sibling].compact)
        break
      end
      groups.unshift(NokogiriNodeHelpers.siblings_before(cursor))
      cursor = owner
    end
    groups.flatten(1)
  end
end
# Mix the helpers into Nokogiri::XML::Node, the common base class of
# documents, elements, text, comments, CDATA, processing instructions, DTDs
# and entity references. Including them class-by-class left some node types
# without +flatten+, so traversing a document that contained e.g. a
# processing instruction or a DOCTYPE raised NoMethodError.
Nokogiri::XML::Node.send :include, NokogiriElementHelpers
Nokogiri::XML::Node.send :include, NokogiriNodeHelpers
# Splits an XML document into a list of documents ("pages"), cutting at every
# node that matches a pattern.
class NokogiriFragmenter
  class << self
    # Splits +source+ at each node matching +pattern+.
    #
    # The first page holds everything that precedes the first match. Each
    # following page is a full copy of the document reduced to one match:
    # the nodes preceding it (its previous siblings and those of its
    # ancestors) are removed, as is everything from the next node with the
    # same name onward. Ancestors of the match are kept, so every page stays
    # a well-formed document.
    #
    # @param source [String, Nokogiri::XML::Document] XML text (parsed with
    #   Nokogiri::XML) or an already parsed document; it is not modified,
    #   each page is built from a +dup+
    # @param pattern [String] passed to +search+ (e.g. "pb")
    # @param blk accepted for interface compatibility; currently unused
    # @return [Array<Nokogiri::XML::Document>] leading page followed by one
    #   page per match
    def fragment(source, pattern, &blk)
      source = Nokogiri::XML(source) if source.is_a?(String)
      break_nodes = source.search(pattern)

      pages = [leading_page(source, break_nodes.first)]
      break_nodes.each { |break_node| pages << page_for(source, break_node) }
      pages
    end

    private

    # Builds the page that precedes the first break: a copy of the document
    # with the first break node and every node after it (in document order)
    # removed. With no break at all the copy is returned untouched.
    def leading_page(source, first_break)
      copy = source.dup
      return copy unless first_break

      # Locate the same node inside the copy via its path in the original.
      start = copy.at(first_break.path)
      copy.flatten.drop_while { |e| e != start }.each { |e| e.remove }
      copy
    end

    # Builds the page belonging to +break_node+: a copy of the document
    # without the nodes preceding the break and without anything from the
    # next same-named node onward.
    def page_for(source, break_node)
      copy = source.dup
      current = copy.at(break_node.path)

      # Nodes following the current break, in document order.
      following = copy.flatten.drop_while { |e| e != current }.drop(1)
      # Everything from the next break onward belongs to later pages.
      beyond_next = following.drop_while { |e| e.name != current.name }

      # `+` builds a new array, so the helper's return value is never mutated.
      doomed = current.previous_nodes_recursive + beyond_next
      doomed.each { |e| e.remove }
      copy
    end
  end
end
# Sample document: "pb" page-break markers scattered at different depths.
data = %Q(
<root>
<p>START</p>
<pb id="pb0"/>
<p>Testing</p>
<div>
<pre>
<span>should not be in the pb#pb0</span>
<pb id="pb1"/>
<span>should only be for pb1</span>
<div id="one">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p id="prePb2">4 some text</p>
<pb id="pb2"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="one-one">
<p>1-1 some text</p>
<p>1-2 some text</p>
<p>1-3 some text</p>
<pb id="pb3"/>
<p>1-4 some text</p>
<p>1-5 some text</p>
</div>
</div>
<pb id="pb4"/>
<div id="two">
<p>1 some text</p>
<p>2 some text</p>
<p>3 some text</p>
<p>4 some text</p>
<pb id="pb5"/>
<p>5 some text</p>
<p>6 some text</p>
<div id="two-one">
<pb id="pb6"/>
<p>2-1 some text</p>
<p>2-2 some text</p>
<p>2-3 some text</p>
<p>2-4 some text</p>
<p>3-5 some text</p>
</div>
<p id="ending1">ENDING TEXT 1</p>
</div>
</pre>
<p id="ending2">ENDING TEXT 2</p>
</div>
</root>
)

# Demo driver: time the fragmenter on the sample document, then dump the
# resulting pages. Benchmark comes from the top-of-file require block.
runs = 1
pages = nil # filled in by the benchmarked block
Benchmark.bmbm do |bench|
  bench.report('fragmenter') { runs.times { pages = NokogiriFragmenter.fragment(data, 'pb') } }
end
puts pages.inspect
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment