Created
November 5, 2010 10:56
-
-
Save jcf/663971 to your computer and use it in GitHub Desktop.
Hacky crawler using Mechanize
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'uri' | |
require 'nokogiri' | |
require 'mechanize' | |
require 'logger' | |
# On SIGINT (Ctrl-C), print the crawl summary gathered so far, then exit.
Signal.trap('INT') do
  @crawler.report
  exit
end
# Crawls a site with Mechanize, starting from a single URL and following
# every link it finds. Keeps track of the addresses it has attempted and of
# the URLs that failed (grouped by exception class) for the final report.
class Crawler
  # File that Mechanize's request log is written to.
  LOG_PATH = 'log/crawler.log'

  # url      - the starting URL handed to the constructor.
  # failures - Hash of exception class name => Array of URLs that raised it.
  #            Unknown keys read as an empty (frozen) Array.
  attr_reader :url, :failures

  # url - String URL at which the crawl starts.
  def initialize(url)
    @url = url
    # The default is frozen so the single shared default array can never be
    # mutated by accident; entries are added with `+=`, which builds a new array.
    @failures = Hash.new([].freeze)
    @successful_hits = 0
  end

  # Prints a summary to stdout: number of pages fetched successfully, number
  # of errors, then every failed URL listed under its exception class.
  def report
    puts
    puts "Successful hits: #{@successful_hits}"
    puts "Errors: #{failures.values.map(&:length).reduce(0, :+)}"
    puts
    failures.each do |error, urls|
      puts error
      puts urls.map { |url| " #{url}" }
    end
    puts
  end

  # Crawls everything reachable from the starting URL.
  #
  # Uses an explicit stack instead of recursion so that a large site cannot
  # overflow the call stack. Links are pushed in reverse, which keeps the
  # visiting order depth-first in document order.
  def run
    pending = [url]
    until pending.empty?
      discovered = crawl_page(pending.pop)
      pending.concat(discovered.reverse)
    end
  end

  # Addresses (scheme, authority and path) of every page attempted so far,
  # in the order they were first requested. Includes pages whose fetch failed.
  def pages_crawled
    @pages_crawled ||= []
  end

  private

  # Prints a message prefixed with a green arrow.
  def good(message)
    puts "\e[32m==>\e[0m #{message}"
  end

  # Prints a message prefixed with a red arrow.
  def bad(message)
    puts "\e[31m==>\e[0m #{message}"
  end

  # Lazily builds the shared Mechanize agent.
  def agent
    @agent ||= Mechanize.new do |a|
      # Logger.new raises if the directory is missing, which would otherwise
      # make every single request fail.
      log_dir = File.dirname(LOG_PATH)
      Dir.mkdir(log_dir) unless File.directory?(log_dir)
      a.log = Logger.new(LOG_PATH)
      a.user_agent_alias = 'Mac Safari'
    end
  end

  # Fetches one page unless its address was already attempted.
  #
  # url - String absolute URL to fetch.
  #
  # Returns the Array of absolute link URLs found on the page; empty when the
  # page was skipped, had no links, or failed. Any StandardError (including an
  # unparsable url) is recorded in `failures` against this url.
  def crawl_page(url)
    address = simple_address_from_url(url)
    return [] if pages_crawled.include?(address)

    pages_crawled.push(address)
    page = agent.get(url)
    @successful_hits += 1
    good "GET #{url}"
    links_from(page)
  rescue => e
    failures[e.class.to_s] += [url]
    bad "GET #{url}"
    []
  end

  # Returns the absolute, normalised URLs of all links on the page.
  def links_from(page)
    # Non-HTML responses (downloads, images, ...) are not parsed into pages
    # with links; they count as fetched but contribute nothing to follow.
    return [] unless page.respond_to?(:links)

    base = page.uri.to_s
    page.links.map { |link| absolute_url(base, link.href) }.compact
  end

  # Resolves href against the URL of the page it appeared on.
  #
  # The anchor is removed first and the trailing slash second, so that
  # "/foo/#bar" and "/foo" end up as the same URL.
  #
  # Returns the absolute URL String, or nil when the link has no href or
  # cannot be parsed (the latter is recorded in `failures`).
  def absolute_url(base, href)
    return if href.nil?

    target = href.strip.sub(/#.*\z/m, '')
    URI.join(base, target).to_s.sub(%r{/\z}, '')
  rescue URI::Error => e
    failures[e.class.to_s] += [href]
    bad "GET #{href}"
    nil
  end

  # Reduces a URL to "scheme://[userinfo@]host[:port]path", dropping query
  # and fragment, so that variations of one page share a single address.
  # A trailing slash on the path is ignored.
  #
  # Raises URI::InvalidURIError when url cannot be parsed.
  def simple_address_from_url(url)
    scheme, userinfo, host, port, _registry, path = URI.split(url)
    authority = host.to_s
    authority = "#{userinfo}@#{authority}" if userinfo
    authority = "#{authority}:#{port}" if port
    "#{scheme}://#{authority}#{path.to_s.sub(%r{/\z}, '')}"
  end
end
# Entry point: crawl the site given as the first command-line argument and
# print the summary. The crawler is kept in @crawler (an instance variable on
# the top-level object) so the INT trap handler can reach it.
url = ARGV[0]
# Without a URL there is nothing to crawl; fail fast with a usage hint.
abort "Usage: #{File.basename($PROGRAM_NAME)} URL" if url.nil? || url.empty?

@crawler = Crawler.new(url)
@crawler.run
@crawler.report
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment