Created
January 7, 2010 22:22
-
-
Save tobinharris/271647 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'hpricot' | |
require 'open-uri' | |
require 'pp' | |
# Breadth-first crawler that walks the pages of one site and records them as
# a tree of Node objects.
#
# Only URLs that contain the start URL as a substring are fetched, each URL
# is considered at most once, and the crawl stops after +max_pages+ pages
# have been indexed.
class Spider
  # max_pages - maximum number of pages to fetch and index in one crawl.
  def initialize(max_pages = 50)
    @max_pages = max_pages
    @counter = 0
    @crawled_urls = []
    @root_url = nil
    @root_node = nil
    @queue = []
  end

  # Crawls the site starting at +url+. Pages are taken from a FIFO queue of
  # [url, parent_node] pairs until the queue is empty or the page limit is
  # reached.
  def start(url)
    @root_url = url
    @queue << [url, nil]
    until @queue.empty? || @counter >= @max_pages
      page_url, parent = @queue.shift
      index(page_url, parent)
    end
  end

  # Fetches +url+, records it as a child of +parent+ (nil for the start page)
  # and queues every crawlable link found on it.
  #
  # Returns early, without fetching, when the URL was already considered or
  # does not contain the start URL.
  def index(url, parent)
    puts "Considering #{url}"
    # don't do the same page twice
    return if @crawled_urls.include?(url)

    # remember the url so we don't bother again, even if it is rejected below
    @crawled_urls << url
    # stay on the site: the url must contain the URL of the root page
    return unless url.include?(@root_url)

    doc = fetch_document(url)
    return if doc.nil?

    node = create_node(doc, url, parent)
    links_for(doc).each { |link| @queue << [link, node] }
    puts "Indexed #{url}"
    @counter += 1
    # don't hammer the server
    sleep 1
  end

  # Builds the Node for a fetched page and attaches it to +parent+ when one
  # is given. The first node ever created becomes the root of the tree.
  def create_node(doc, url, parent)
    node = Node.new
    parent.children << node unless parent.nil?
    node.url = url
    # the title is only taken when the lookup finds exactly one <title>
    found = doc.search("/html/head/title")
    node.title = found[0].inner_html if found.length == 1
    # page name is the url with the root url removed (the root keeps its url)
    node.page_name = url == @root_url ? url : url.gsub(@root_url, '')
    @root_node = node if @root_node.nil?
    node
  end

  # Returns the absolute URLs (trailing "/" removed) of all links in +doc+
  # that are crawlable and have not been considered yet.
  def links_for(doc)
    links = []
    (doc/"a").each do |link|
      href = link.attributes['href']
      next unless href

      url = absolute_url(href).chomp('/')
      next if @crawled_urls.include?(url)
      next unless is_crawlable_url(url)

      links << url
    end
    links
  end

  # Returns false for mailto: links and for any URL containing a fragment
  # ("#"), true otherwise.
  def is_crawlable_url(url)
    return false if url =~ /^\s*mailto:/
    return false if url.include?('#')

    true
  end

  # Returns +url+ with its fragment ("#" and everything after it) removed.
  # Always returns a String; a URL without a fragment is returned unchanged.
  def unique_url(url)
    url.sub(/#.*\z/m, '')
  end

  # Turns +url+ into an absolute URL string. URLs that already start with
  # http: or https: are returned untouched; anything else is joined onto the
  # root URL. A link that cannot be joined yields the placeholder
  # "http://badurl.me".
  #
  # NOTE: relative links are resolved against the crawl root, not against the
  # page they were found on.
  def absolute_url(url)
    return url if url =~ /^https?:/

    base = URI.parse(@root_url)
    begin
      (base + url).to_s
    rescue StandardError
      "http://badurl.me"
    end
  end

  # The Node of the start page, or nil when nothing has been indexed yet.
  def root
    @root_node
  end

  private

  # Downloads +url+ and parses it with Hpricot.
  #
  # Returns the parsed document, or nil when a page other than the start page
  # could not be fetched (the failure is reported on stderr so one broken
  # link does not abort the whole crawl). A failure on the start page is
  # re-raised because there is nothing to crawl without it.
  def fetch_document(url)
    if URI.respond_to?(:open)
      # open-uri's public entry point; Kernel#open no longer opens URLs on
      # Ruby 3.0+
      URI.open(url) { |io| Hpricot(io) }
    else
      # older Rubies only have open-uri's Kernel#open patch
      open(url) { |io| Hpricot(io) }
    end
  rescue StandardError => e
    raise if @root_node.nil?

    warn "Skipping #{url}: #{e.message}"
    nil
  end
end
# A single page in the crawled site tree.
#
# children  - Array of Nodes for pages linked from this one (starts empty).
# url       - the page's URL.
# title     - the page's title, when one was found.
# page_name - short name for the page, assigned by the crawler.
class Node
  attr_accessor :children, :url, :title, :page_name

  def initialize
    @children = []
  end
end
# Renders a node tree as text, one line per page and one line per
# parent->child link:
#
#   [url|N links]
#   [parent url]->[child url]
#
# (presumably the yUML diagram syntax, going by the class name).
class Yumlify
  # root - the top node of the tree; each node must respond to +url+ and
  #        +children+. A nil root (nothing was crawled) gives empty output.
  def initialize(root)
    # dup so the buffer is mutable regardless of string-literal freezing
    @data = ''.dup
    append(root)
  end

  # Appends the lines for +node+ and, depth-first, for all of its
  # descendants to the output. Does nothing for a nil node.
  def append(node)
    return if node.nil?

    # append in place; rebuilding the string with += copies it on every line
    @data << "[#{node.url}|#{node.children.length} links]\n"
    node.children.each do |child|
      @data << "[#{node.url}]->[#{child.url}]\n"
      append(child)
    end
  end

  # Placeholder; currently does nothing and returns nil.
  def color(node)
  end

  # The text generated so far.
  def data
    @data
  end
end
# TODO: write a converter, along the lines of Yumlify, that turns the node
# tree into JIT js formatted nodes. Until then this is an empty placeholder.
class Jitify; end
# Crawl the site, then print the resulting page tree in Yumlify's text format.
spider = Spider.new
spider.start('http://engineroomapps.com')
diagram = Yumlify.new(spider.root)
puts diagram.data
# Disabled until Jitify is implemented:
# puts Jitify.new(spider.root).data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment