Created
March 6, 2012 03:56
-
-
Save mweppler/1983320 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'uri'
require 'open-uri'
require 'nokogiri'
# Collects every <a> element of a parsed document.
#
# @param html_doc [#css] parsed document; anything responding to +css+ works
# @return whatever +html_doc.css('a')+ returns (a Nokogiri::XML::NodeSet when
#   +html_doc+ is a Nokogiri document)
def get_anchors_from_html_doc(html_doc)
  html_doc.css('a')
end
# Turns a list of anchors into a sorted, de-duplicated list of absolute URLs
# that live under +base_url+.
#
# Relative hrefs are made absolute by appending them to the base URL (one
# leading "/" is dropped first). Anchors without an href, mailto links and
# links pointing outside +base_url+ are discarded.
#
# @param anchors [Enumerable<#attribute>] anchors; each must answer
#   +attribute('href')+ with something whose +to_s+ is the href (nil is fine)
# @param base_url [String] site root the links are resolved against and
#   restricted to; defaults to the script-level +@url+
# @return [Array<String>] sorted unique absolute URLs under +base_url+
# @raise [ArgumentError] if no base URL is available
def get_hrefs_from_anchors(anchors, base_url = @url)
  base = base_url.to_s
  raise ArgumentError, 'base_url must not be empty' if base.empty?

  # The join below is plain concatenation, so the base must end with a slash.
  base = "#{base}/" unless base.end_with?('/')

  raw_hrefs = anchors.map { |anchor| anchor.attribute('href').to_s.strip }

  # Drop empties *before* prefixing; otherwise they turn into the base URL
  # and can no longer be recognised as empty.
  absolute = raw_hrefs.reject(&:empty?).map do |href|
    if href.start_with?('http://', 'https://')
      href
    else
      base + href.sub(%r{\A/}, '')
    end
  end

  # Literal prefix comparison: the base URL must not be interpreted as a
  # regexp (where "." matches anything) nor be allowed to match mid-string.
  same_site = absolute.select do |href|
    href.start_with?(base) && !href.include?('mailto:')
  end

  same_site.uniq.sort
end
# Downloads +url+ and parses the response body as HTML.
#
# The request goes through URI#open (mixed into HTTP/HTTPS/FTP URIs by
# open-uri) rather than Kernel#open: open-uri stopped patching Kernel#open in
# Ruby 3.0, and Kernel#open would run a shell command for a string starting
# with "|". The block form closes the response handle once parsing is done.
#
# @param url [String] absolute URL of the page to fetch
# @return [Nokogiri::HTML::Document]
# @raise [URI::InvalidURIError] if +url+ cannot be parsed
# @raise [OpenURI::HTTPError] if the server answers with an error status
def get_html_doc(url)
  URI.parse(url).open { |io| Nokogiri::HTML(io) }
end
# Crawl the site one level deep: collect the links on the start page, then
# the links on each of those pages, and print every same-site URL found.
@url = "http://www.site.com/"

# Links found on the start page; a failure here is fatal on purpose.
hrefs = get_hrefs_from_anchors(get_anchors_from_html_doc(get_html_doc(@url)))
urls = hrefs.dup

hrefs.each do |href|
  begin
    page = get_html_doc href
    urls.concat(get_hrefs_from_anchors(get_anchors_from_html_doc(page)))
  rescue StandardError => e
    # A dead or slow link must not abort the whole crawl; report and move on.
    warn "skipping #{href}: #{e.class}: #{e.message}"
  end
end

# Normalise once at the end instead of after every page.
urls = urls.uniq.sort

puts urls
puts "total urls:#{urls.size}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment