Created
June 4, 2018 18:09
-
-
Save baweaver/3cbfd144b525ff32c1f013774a5bfac7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The goal of this problem is to extract headers from a block of text,
# and arrange them hierarchically.
#
# See the specs for more detail on the output
#
# @param html [String] markup to scan for header tags
# @return [Array<String>] one "[tag] text" entry per header, per the specs
# @raise [RuntimeError] always ("TODO") until the exercise is implemented
def header_hierarchy(html)
  raise "TODO"
end
# Original specification for #header_hierarchy (no synthetic root element).
#
# NOTE(review): the leading whitespace inside the expected strings was
# reconstructed as two spaces per nesting level — confirm against the
# original specification.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq(['[h1] Foo'])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar'
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '      [h4] Bam'
      ])
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '  [h2] Bam',
        '    [h3] Ba'
      ])
    end
  end
end
My solution:
EDIT: Modified the tests to reflect the 'ROOT' element; I'm going to leave the original specification alone for a bit.
require 'nokogiri'
require 'rspec/autorun'
# A single header in the extracted hierarchy.
#
# Each node stores its display text, its tag name (e.g. 'h2'), a link to
# the node it was created under, and the nodes created under it.
class HeaderNode
  attr_reader :parent, :children

  # @param name [String] text shown for this header
  # @param tag_name [String] tag name such as 'h1'
  # @param parent [HeaderNode, nil] owning node; nil when none is given
  def initialize(name, tag_name, parent = nil)
    @parent   = parent
    @tag_name = tag_name
    @name     = name
    @children = []
  end

  # Whether +tag_name+ sorts strictly after this node's own tag name
  # (plain String comparison, so 'h1' precedes 'h2').
  #
  # @param tag_name [String]
  # @return [Boolean]
  def descendant?(tag_name)
    @tag_name < tag_name
  end

  # Creates a node parented to this one, records it in +children+,
  # and returns the new node.
  #
  # @param node_name [String]
  # @param tag_name [String]
  # @return [HeaderNode] the freshly created child
  def add_child(node_name, tag_name)
    fresh = HeaderNode.new(node_name, tag_name, self)
    @children << fresh
    fresh
  end

  # @return [Hash] nested 'name' / 'children' representation of the subtree
  def to_h
    nested = @children.map { |node| node.to_h }
    { 'name' => @name, 'children' => nested }
  end

  # Renders this node and its subtree, one "[tag] name" line per node.
  # Every level below this one is indented by two more spaces.
  #
  # @param indent_level [Integer] number of spaces placed before this node's line
  # @return [String] newline-terminated lines for the whole subtree
  def to_s(indent_level = 0)
    padding = ' ' * indent_level
    own_line = "#{padding}[#{@tag_name}] #{@name}\n"
    nested_lines = @children.map { |node| node.to_s(indent_level + 2) }
    [own_line, *nested_lines].join
  end
end
# Builds a tree of HeaderNode objects from the header tags found in a
# piece of HTML.
#
# The markup is wrapped in <html>...</html>, parsed with Nokogiri, and every
# element matching one of +header_levels+ is attached beneath the nearest
# previously added node whose tag name sorts before its own.
#
# @param html_partial [String] HTML markup to scan
# @param header_levels [Array<String>] tag names to collect; they are joined
#   with ', ' to form a single CSS selector
# @return [HeaderNode] the synthetic 'ROOT' node (tag 'h0') owning the tree
def header_extractor(html_partial, header_levels: %w(h1 h2 h3 h4 h5 h6))
  root_node = HeaderNode.new('ROOT', 'h0')

  Nokogiri("<html>#{html_partial}</html>")
    .css(header_levels.join(', '))
    .reduce(root_node) do |current_tree, tag|
      # Climb towards the root until we find a node this tag may nest under.
      # Stop at the root (the only node without a parent) even if the
      # comparison fails: with custom header_levels a tag name may not sort
      # after 'h0', and climbing further would call #parent on nil.
      until current_tree.parent.nil? || current_tree.descendant?(tag.name)
        current_tree = current_tree.parent
      end

      # The new child becomes the accumulator, i.e. the attachment point
      # considered first for the next tag.
      current_tree.add_child(tag.text, tag.name)
    end

  root_node
end
# Renders the header tree extracted from +html_partial+ as an array of lines.
#
# The first entry is the synthetic "[h0] ROOT" line produced by the tree's
# root node; the remaining entries are the indented header lines.
#
# @param html_partial [String] HTML markup to scan for headers
# @return [Array<String>] the tree's text rendering split on "\n"
def header_hierarchy(html_partial)
  tree = header_extractor(html_partial)
  rendered = tree.to_s
  rendered.split("\n")
end
# Specs for the solution above. Every expectation starts with the synthetic
# "[h0] ROOT" line, and each nesting level below it is indented by two more
# spaces, matching HeaderNode#to_s.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq([
        "[h0] ROOT",
        "  [h1] Foo"
      ])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        "[h0] ROOT",
        "  [h1] Foo",
        "    [h2] Bar"
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        "[h0] ROOT",
        "  [h1] Foo",
        "    [h2] Bar",
        "      [h3] Baz",
        "        [h4] Bam"
      ])
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        "[h0] ROOT",
        "  [h1] Foo",
        "    [h2] Bar",
        "      [h3] Baz",
        "    [h2] Bam",
        "      [h3] Ba"
      ])
    end
  end

  context 'LIVE' do
    # NOTE(review): this example fetches a live page over the network, so it
    # presumably breaks whenever that page's headings change — confirm
    # whether it should be stubbed or tagged.
    it 'can parse an entire document' do
      # Loaded here so only this example pulls in the HTTP client.
      require 'net/http'

      html = Net::HTTP.get(URI("https://jquery.com/"))

      expect(header_hierarchy(html)).to eq([
        "[h0] ROOT",
        "  [h2] jQuery",
        "    [h3] Lightweight Footprint",
        "    [h3] CSS3 Compliant",
        "    [h3] Cross-Browser",
        "  [h2] What is jQuery?",
        "  [h2] Other Related Projects",
        "    [h3] Resources",
        "  [h2] A Brief Look",
        "    [h3] DOM Traversal and Manipulation",
        "    [h3] Event Handling",
        "    [h3] Ajax",
        "    [h3] Books"
      ])
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Confused by the bonus example. Why does it insert ROOT, an element that does not exist? And why is it an "h0" and not an "h1"? E.g., what should it return for this?
"<h3>whatevz</h3>"
or for "<h1>a</h1><h5>b</h5>"
?