Skip to content

Instantly share code, notes, and snippets.

@epitron
Created April 7, 2014 19:15
Show Gist options
  • Save epitron/10033479 to your computer and use it in GitHub Desktop.
Save epitron/10033479 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'pp'
# Sample HTML fragments of increasing size, walked one after another below.
# The blank first line inside each heredoc is deliberate: every fixture
# string begins with a newline, and that newline shows up when it is printed.

# Smallest case: one paragraph containing a single inline element.
doc0 = <<HTML

<p>Hello <b>DOM!</b></p>
HTML

# Two sibling paragraphs inside a wrapping div.
doc1 = <<HTML

<div>
<p>Hello <b>DOM!</b></p>
<p>How are you doing?</p>
</div>
HTML

# Three sibling paragraphs inside a wrapping div.
doc2 = <<HTML

<div>
<p>Hello <b>DOM!</b></p>
<p>How are you doing?</p>
<p>Lovely day, isn't it?</p>
</div>
HTML
# Stress fixture: a long, deeply nested run of spans and links taken from
# real-looking markup. This is the input on which walking with nested_text
# was observed to create an immense amount of repetition.
# As with the smaller fixtures, the blank first line keeps the string's
# leading newline.
crazy_doc = <<HTML

<div id="entryContent" class="entrybox">
<div xmlns:h="http://www.w3.org/1999/xhtml" class="entry" xml:lang="en">
<span class="form_hwd"><h1><span class="seo">motte-and-bailey</span></h1></span><span class="hom"><span class="sense"><a name="motte-and-bailey__1"> </a><span class="span_sensenum">1</span><span class="gramGrp"><span class="pos" xml:lang="en"> noun</span></span><a href="http://www.focloir.ie/en/domain/ei/archit" onclick="javascript:openDomain(this, 'archit');return false;"><span title="ARCHITECTURE" class="lbl_purple_sc_i"> ARCHIT</span></a><a href="http://www.focloir.ie/en/domain/ei/hist" onclick="javascript:openDomain(this, 'hist');return false;"><span title="HISTORY" class="lbl_purple_sc_i"><span class="span_neutral">, </span>HIST</span></a><span class="cit_translation" xml:lang="ga"><span class="quote">móta agus bábhún</span></span></span></span><span class="hom"><span class="sense"><a name="motte-and-bailey__2"> </a><span class="span_sensenum">2</span><span class="gramGrp"><span class="pos" xml:lang="en"> adjective</span></span><a href="http://www.focloir.ie/en/domain/ei/archit" onclick="javascript:openDomain(this, 'archit');return false;"><span title="ARCHITECTURE" class="lbl_purple_sc_i"> ARCHIT</span></a><a href="http://www.focloir.ie/en/domain/ei/hist" onclick="javascript:openDomain(this, 'hist');return false;"><span title="HISTORY" class="lbl_purple_sc_i"><span class="span_neutral">, </span>HIST</span></a><span class="cit_translation" xml:lang="ga"><span class="quote">móta agus bábhúin</span></span><span class="cit_example"><span class="quote">a motte-and-bailey castle</span><span class="cit_translation_noline" xml:lang="ga"> <span class="quote">caisleán móta agus bábhúin</span></span></span></span></span>
</div>
<!-- End of DIV entry-->
</div>
HTML
class Nokogiri::XML::NodeSet
  # Collects the text nodes reachable from this set while keeping the
  # document's nesting.
  #
  # Every text node in the set is returned as-is. Every other node is replaced
  # by the nested_text of that node's own children, so the result mirrors the
  # tree shape. A "." is printed to stdout for each node visited, as a progress
  # indicator for the caller.
  #
  # @return [Array] one entry per node in this set: the node itself when it is
  #   a text node, otherwise the nested result for its children
  def nested_text
    map do |node|
      print "."
      if node.text?
        node
      else
        # Recurse into THIS node's children. A bare `children` here would call
        # NodeSet#children on self, i.e. the children of every node in the
        # set, so each element would re-walk all of its siblings' subtrees and
        # the output (and work) would balloon with repeated content.
        node.children.nested_text
      end
    end
  end
end
# Interactive walkthrough: for each sample, show the markup, wait for ENTER,
# parse it as an HTML fragment, walk it with nested_text, pretty-print the
# result, then wait for ENTER again before moving on.

# Shows a prompt, blocks until a line is read from stdin, then ends the
# prompt line with a newline.
pause = lambda do |prompt|
  print prompt
  $stdin.gets
  puts
end

samples = [doc0, doc1, doc2, crazy_doc]
samples.each do |markup|
  puts "========== About to walk: ===============", markup, ""
  pause.call("Press ENTER to continue...")

  fragment = Nokogiri::HTML.fragment(markup)
  print "Walking"
  texts = fragment.children.nested_text
  puts "", ""
  puts "---------- Result: ------------------", ""
  pp texts
  puts
  pause.call("Press ENTER for next example...")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment