Skip to content

Instantly share code, notes, and snippets.

@jimweirich
Created February 9, 2012 12:54
Show Gist options
  • Save jimweirich/1779811 to your computer and use it in GitHub Desktop.
Save jimweirich/1779811 to your computer and use it in GitHub Desktop.
Vital Ruby Advance Lab 2
require 'nokogiri'
# Extracts the hyperlinks of an HTML page and resolves them into absolute
# URL strings.
class HtmlParser
  # Schemes an href may explicitly name and still be returned.
  WEB_SCHEMES = %w[http https].freeze

  # Returns the absolute URLs of the anchors found in +body+.
  #
  # @param source_url [String] URL of the page; relative hrefs are resolved
  #   against it
  # @param body [String] HTML text to scan
  # @return [Array<String>] resolved URLs in document order. Anchors without
  #   an href, fragment-only hrefs ("#..."), hrefs naming a non-web scheme
  #   (mailto:, javascript:, ...) and hrefs that cannot be parsed are omitted.
  def parse(source_url, body)
    doc = Nokogiri::HTML(body)
    hrefs = doc.css('a').map do |node|
      att = node.attribute("href")
      att && att.text
    end
    normalize(source_url, hrefs.compact)
  end

  private

  # Resolves every entry of +urls+ against +source_url+, dropping the ones
  # that normalize_url rejects.
  def normalize(source_url, urls)
    urls.map { |u| normalize_url(source_url, u) }.compact
  end

  # Resolves a single href against +source_url+.
  #
  # Returns the absolute URL as a String, or nil when the href is a pure
  # fragment, names a scheme other than http/https, or cannot be
  # parsed/merged.
  def normalize_url(source_url, url)
    return nil if url.start_with?('#')

    link = URI.parse(url)
    # URI#merge hands back an absolute reference untouched, so a link such as
    # "mailto:..." would otherwise be reported as a page URL. Relative links
    # carry no scheme and inherit the one of source_url.
    return nil if link.scheme && !WEB_SCHEMES.include?(link.scheme)

    URI.parse(source_url).merge(link).to_s
  rescue StandardError
    # Best effort: a link that cannot be parsed or merged is skipped rather
    # than aborting the whole page.
    nil
  end
end
require 'rspec/given'
require './html_parser'
# Specification for HtmlParser#parse, written with the rspec-given DSL.
#
# The outer Givens describe a default page: one anchor per entry of +urls+
# wrapped in a minimal HTML document. Each context overrides only the
# Given it cares about (+url+, +urls+, +source+ or +html+).
describe HtmlParser do
  Given(:parser) { HtmlParser.new }
  Given(:source) { "http://x.com/" }
  Given(:url) { "http://x.com/" }
  Given(:urls) { [url] }
  # One <a> element per URL under test.
  Given(:anchors) { urls.map { |u| %(<a href="#{u}">Link</a>) } }
  Given(:html) {
    "<html><body><h1>HI</h1>" +
      anchors.join +
      "</body></html>"
  }

  When(:result) { parser.parse(source, html) }

  context "with an empty body" do
    Given(:html) { "" }
    Then { result.should == [] }
  end

  context "with no links" do
    Given(:html) { "<html><body><h1>HI</h1></body></html>" }
    Then { result.should == [] }
  end

  context "with a single link" do
    Then { result.should == ["http://x.com/"] }
  end

  context "with multiple links" do
    Given(:urls) { ["http://x.com", "http://x.com/b"] }
    Then { result.should == urls }
  end

  context "with path-link links" do
    Given(:urls) { ["/a", "b", "./c"] }
    Then { result.should == ["http://x.com/a", "http://x.com/b", "http://x.com/c"] }
  end

  context "with off-site links" do
    Given(:source) { "http://x.com/page" }
    Given(:url) { "http://z.com/index.html" }
    Then { result.should == ["http://z.com/index.html"] }
  end

  context "with non-html links" do
    Given(:urls) { ["mailto://[email protected]", "http://x.com/"] }
    Then { result.should == ["http://x.com/"] }
  end

  context "with hash pieces links" do
    Given(:urls) { ["#hash", "http://x.com/"] }
    Then { result.should == ["http://x.com/"] }
  end

  context "with a bad URI" do
    Given(:source) { "http://x.com/page" }
    # The embedded newline makes the first href unparseable.
    Given(:urls) { [ "a\nb", "b" ] }
    Then { result.should == ["http://x.com/b"] }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment