Skip to content

Instantly share code, notes, and snippets.

@baweaver
Created June 4, 2018 18:09
Show Gist options
  • Save baweaver/3cbfd144b525ff32c1f013774a5bfac7 to your computer and use it in GitHub Desktop.
Save baweaver/3cbfd144b525ff32c1f013774a5bfac7 to your computer and use it in GitHub Desktop.
# Exercise: pull the header tags out of a block of HTML text and list
# them hierarchically.
#
# The specs describe the expected output in more detail.
#
# @param html [String] the HTML text to scan for headers
# @raise [RuntimeError] always ("TODO") until the exercise is implemented
def header_hierarchy(html)
  raise RuntimeError, 'TODO'
end
# Specs for the exercise version of #header_hierarchy (no synthetic ROOT
# node). Each expected line is "[tag] text", indented two spaces for every
# level of nesting below the outermost header, so the indentation is what
# encodes the hierarchy.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq(['[h1] Foo'])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar'
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '      [h4] Bam'
      ])
    end
  end

  context 'HARD' do
    # A second <h2> closes the first branch: the following <h3> must be
    # nested under "Bam", not under "Bar".
    it 'can extract multiple nested headers in multiple branches' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '  [h2] Bam',
        '    [h3] Ba'
      ])
    end
  end
end
@baweaver
Copy link
Author

baweaver commented Jun 4, 2018

Bonus:

require 'net/http'
# Bonus spec: runs #header_hierarchy against a live page.
describe 'True parsing' do
  # Fetched lazily inside the example (and memoized for it) so that merely
  # loading this file performs no network I/O; a network failure then shows
  # up as a failure of this one example instead of aborting the load.
  let(:html) { Net::HTTP.get(URI("https://jquery.com/")) }

  # NOTE(review): the expected outline mirrors whatever jquery.com served
  # when the spec was written; it will need updating if the page changes.
  it 'can parse an entire document' do
    expect(header_hierarchy(html)).to eq([
      "[h0] ROOT",
      "  [h2] jQuery",
      "    [h3] Lightweight Footprint",
      "    [h3] CSS3 Compliant",
      "    [h3] Cross-Browser",
      "  [h2] What is jQuery?",
      "  [h2] Other Related Projects",
      "    [h3] Resources",
      "  [h2] A Brief Look",
      "    [h3] DOM Traversal and Manipulation",
      "    [h3] Event Handling",
      "    [h3] Ajax",
      "    [h3] Books"
    ])
  end
end

@JoshCheek
Copy link

I'm confused by the bonus example. Why does it insert ROOT, an element that does not exist in the document? And why is it an "h0" rather than an "h1"? For example, what should it return for "<h3>whatevz</h3>", or for "<h1>a</h1><h5>b</h5>"?

@baweaver
Copy link
Author

baweaver commented Jun 6, 2018

My solution:

EDIT: I modified the tests to reflect the 'ROOT' element; I'm going to leave the original specification alone for a bit.

require 'nokogiri'
require 'rspec/autorun'

# One header in the outline tree: its text, its tag name, a link to the
# node it hangs under, and the nodes that hang under it.
class HeaderNode
  attr_reader :parent, :children

  # @param name [String] the header's text
  # @param tag_name [String] the header's tag, e.g. "h2"
  # @param parent [HeaderNode, nil] the enclosing node; nil when omitted
  def initialize(name, tag_name, parent = nil)
    @parent   = parent
    @children = []
    @tag_name = tag_name
    @name     = name
  end

  # Whether a header with +tag_name+ belongs underneath this node.
  # Tag names are compared as plain strings, so "h1" < "h2" < "h3".
  #
  # @param tag_name [String]
  # @return [Boolean] true when this node's tag sorts before +tag_name+
  def descendant?(tag_name)
    @tag_name < tag_name
  end

  # Creates a node parented to this one, appends it to the children list,
  # and hands it back.
  #
  # @param node_name [String] text of the new header
  # @param tag_name [String] tag of the new header
  # @return [HeaderNode] the freshly created child
  def add_child(node_name, tag_name)
    child = HeaderNode.new(node_name, tag_name, self)
    @children << child
    child
  end

  # @return [Hash] this node's name plus the same structure for each child,
  #   recursively
  def to_h
    child_hashes = @children.map { |child| child.to_h }
    { 'name' => @name, 'children' => child_hashes }
  end

  # Renders this node and everything below it, one newline-terminated
  # "[tag] name" line per node, with each level indented two spaces more
  # than its parent.
  #
  # @param indent_level [Integer] number of spaces in front of this node's line
  # @return [String]
  def to_s(indent_level = 0)
    padding  = ' ' * indent_level
    rendered = "#{padding}[#{@tag_name}] #{@name}\n"

    @children.each { |child| rendered += child.to_s(indent_level + 2) }
    rendered
  end
end

# Builds a tree of HeaderNode objects from the header tags found in an HTML
# fragment.
#
# The fragment is wrapped in <html>...</html> and parsed with Nokogiri; the
# tags matching +header_levels+ are then folded into the tree in the order
# the CSS query returns them. A synthetic 'ROOT' node tagged 'h0' sits at
# the top so that every real header has a parent.
#
# @param html_partial [String] HTML text to scan
# @param header_levels [Array<String>] tag names (CSS selectors) to collect
# @return [HeaderNode] the synthetic root of the tree
def header_extractor(html_partial, header_levels: %w(h1 h2 h3 h4 h5 h6))
  root_node = HeaderNode.new('ROOT', 'h0')

  Nokogiri("<html>#{html_partial}</html>")
    .css(header_levels.join(', '))
    .reduce(root_node) do |current_tree, tag|
      # Climb until we reach a node this tag may nest under. Stop at the
      # root (the only node without a parent) so that a tag no node accepts
      # is attached to the root instead of calling #parent on nil.
      until current_tree.parent.nil? || current_tree.descendant?(tag.name)
        current_tree = current_tree.parent
      end

      # Collapse whitespace runs (including newlines) to single spaces and
      # trim the ends: HeaderNode#to_s emits one line per node, so a newline
      # inside the header text would corrupt the line-based outline.
      current_tree.add_child(tag.text.split.join(' '), tag.name)
    end

  root_node
end

# Renders the header outline of an HTML fragment as an array of lines.
#
# @param html_partial [String] HTML text to scan
# @return [Array<String>] the text produced by header_extractor's tree,
#   split on newlines
def header_hierarchy(html_partial)
  rendered_tree = header_extractor(html_partial).to_s
  rendered_tree.split("\n")
end

# Specs for the solution's #header_hierarchy. Every outline starts with the
# synthetic "[h0] ROOT" line; each expected outline is written as a heredoc
# and split into lines, with two spaces of indentation per nesting level.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      outline = header_hierarchy('<h1>Foo</h1>')

      expect(outline).to eq(<<~OUTLINE.split("\n"))
        [h0] ROOT
          [h1] Foo
      OUTLINE
    end

    it 'can extract one nested level of header' do
      outline = header_hierarchy('<h1>Foo</h1><h2>Bar</h2>')

      expect(outline).to eq(<<~OUTLINE.split("\n"))
        [h0] ROOT
          [h1] Foo
            [h2] Bar
      OUTLINE
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      outline = header_hierarchy('<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>')

      expect(outline).to eq(<<~OUTLINE.split("\n"))
        [h0] ROOT
          [h1] Foo
            [h2] Bar
              [h3] Baz
                [h4] Bam
      OUTLINE
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      outline = header_hierarchy('<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>')

      expect(outline).to eq(<<~OUTLINE.split("\n"))
        [h0] ROOT
          [h1] Foo
            [h2] Bar
              [h3] Baz
            [h2] Bam
              [h3] Ba
      OUTLINE
    end
  end

  context 'LIVE' do
    # Hits the network: the page is downloaded when this example runs.
    it 'can parse an entire document' do
      require 'net/http'
      live_page = Net::HTTP.get(URI('https://jquery.com/'))

      expect(header_hierarchy(live_page)).to eq(<<~OUTLINE.split("\n"))
        [h0] ROOT
          [h2] jQuery
            [h3] Lightweight Footprint
            [h3] CSS3 Compliant
            [h3] Cross-Browser
          [h2] What is jQuery?
          [h2] Other Related Projects
            [h3] Resources
          [h2] A Brief Look
            [h3] DOM Traversal and Manipulation
            [h3] Event Handling
            [h3] Ajax
            [h3] Books
      OUTLINE
    end
  end
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment