Created
February 9, 2012 12:54
-
-
Save jimweirich/1779811 to your computer and use it in GitHub Desktop.
Vital Ruby Advance Lab 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'uri'
require 'nokogiri'
# Extracts the hyperlinks found in an HTML document and resolves each one
# to an absolute URL relative to the page it appeared on.
class HtmlParser
  # Only links that resolve to one of these schemes are reported; anything
  # else (mailto:, javascript:, ftp:, ...) is not a fetchable web page.
  WEB_SCHEMES = %w[http https].freeze

  # Collects the +href+ of every +<a>+ element in +body+.
  #
  # @param source_url [String] URL of the page +body+ came from; relative
  #   links are resolved against it
  # @param body [String] HTML text to scan
  # @return [Array<String>] absolute http(s) URLs in document order;
  #   fragment-only links, non-web links and unparsable hrefs are omitted
  def parse(source_url, body)
    doc = Nokogiri::HTML(body)
    # Node#[] yields nil for anchors without an href; compact drops those.
    hrefs = doc.css('a').map { |node| node['href'] }
    normalize(source_url, hrefs.compact)
  end

  private

  # Resolves every entry of +urls+ against +source_url+, discarding the
  # ones normalize_url rejects.
  def normalize(source_url, urls)
    urls.map { |u| normalize_url(source_url, u) }.compact
  end

  # Resolves a single href against +source_url+.
  #
  # @return [String, nil] the absolute URL, or nil when the href is a
  #   same-page fragment, resolves to a non-web scheme, or cannot be parsed
  def normalize_url(source_url, url)
    return nil if url.start_with?('#')

    absolute = URI.parse(source_url).merge(URI.parse(url))
    # An absolute href keeps its own scheme after merge, so check the result
    # rather than the source to keep mailto:/javascript: links out.
    WEB_SCHEMES.include?(absolute.scheme.to_s.downcase) ? absolute.to_s : nil
  rescue StandardError
    # Best effort: one malformed link must not abort the whole page.
    nil
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rspec/given' | |
require './html_parser' | |
# Specs for HtmlParser#parse, written with the rspec-given Given/When/Then DSL.
describe HtmlParser do
  Given(:parser) { HtmlParser.new }

  # Defaults shared by every context; each context overrides only the
  # pieces it cares about (source, url, urls or the whole html).
  Given(:source) { "http://x.com/" }
  Given(:url) { "http://x.com/" }
  Given(:urls) { [url] }
  Given(:anchors) { urls.map { |u| %(<a href="#{u}">Link</a>) } }
  Given(:html) {
    "<html><body><h1>HI</h1>" +
      anchors.join +
      "</body></html>"
  }

  When(:result) { parser.parse(source, html) }

  context "with an empty body" do
    Given(:html) { "" }
    Then { result.should == [] }
  end

  context "with no links" do
    Given(:html) { "<html><body><h1>HI</h1></body></html>" }
    Then { result.should == [] }
  end

  context "with a single link" do
    Then { result.should == ["http://x.com/"] }
  end

  context "with multiple link" do
    Given(:urls) { ["http://x.com", "http://x.com/b"] }
    Then { result.should == urls }
  end

  context "with path-link links" do
    Given(:urls) { ["/a", "b", "./c"] }
    Then { result.should == ["http://x.com/a", "http://x.com/b", "http://x.com/c"] }
  end

  context "with off-site links" do
    Given(:source) { "http://x.com/page" }
    Given(:url) { "http://z.com/index.html" }
    Then { result.should == ["http://z.com/index.html"] }
  end

  context "with non-html links" do
    # NOTE(review): the mailto address looks like it was replaced by an
    # email-obfuscation filter; the literal is kept as found -- confirm the
    # intended address.
    Given(:urls) { ["mailto://[email protected]", "http://x.com/"] }
    Then { result.should == ["http://x.com/"] }
  end

  context "with hash pieces links" do
    Given(:urls) { ["#hash", "http://x.com/"] }
    Then { result.should == ["http://x.com/"] }
  end

  context "with a bad URI" do
    Given(:source) { "http://x.com/page" }
    # The embedded newline makes the first href unparsable.
    Given(:urls) { ["a\nb", "b"] }
    Then { result.should == ["http://x.com/b"] }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment