Script to Crawl a Site for Links/Assets

#!/usr/bin/env ruby

# GEMFILE
# source 'https://rubygems.org'
#
# gem 'nokogiri'
# gem 'awesome_print'

require 'open-uri'
require 'optparse'
require 'monitor'
require 'nokogiri'
require 'awesome_print'

# The program takes a domain/site and crawls it.
# It then outputs the resulting hash to the screen.
#
# Author:: Peter Hamilton (mailto:[email protected])

# This class is what's used to crawl a site.
# It stores a hash @pages of url => links and assets (images/js/css), which can
# be retrieved using get_pages (see the usage sketch after the class definition).
#
# Giving it a number of threads > 1 speeds up processing considerably.
# For example:
# $ time ./simple_crawler.rb https://gocardless.com -t 1  => 1.94s user 0.13s system 9% cpu 21.552 total
# $ time ./simple_crawler.rb https://gocardless.com -t 2  => 1.82s user 0.11s system 15% cpu 12.627 total
# $ time ./simple_crawler.rb https://gocardless.com -t 3  => 1.79s user 0.11s system 19% cpu 9.599 total
# $ time ./simple_crawler.rb https://gocardless.com -t 5  => 1.75s user 0.11s system 28% cpu 6.570 total
# $ time ./simple_crawler.rb https://gocardless.com -t 10 => 1.69s user 0.10s system 38% cpu 4.688 total
class SimpleCrawler
  # Set up the crawler.
  # Takes a root url, e.g. https://gocardless.com, and an options hash.
  #
  # Options:
  # - verbose      - show all urls as they're processed
  # - thread_count - use a thread pool of the given size
  def initialize(root_url, options = {})
    @verbose = options[:verbose] || false
    @thread_count = options[:thread_count] || 1

    # Seed @root_url so process_url can reference it while normalising the root url itself
    @root_url = ""
    @root_url = process_url root_url
  end

  # Perform the site crawl
  # 1. Creates a queue of urls to crawl (starting with the root url)
  # 2. Creates a thread pool (of size thread_count, defined at creation)
  # 3. While the queue is not empty, threads will process URLs
  def crawl
    puts "Crawling #{@root_url}" if @verbose

    @pages = {}
    @crawl_queue = Queue.new
    @crawl_queue << @root_url
    @crawl_queue.extend MonitorMixin
    crawl_queue_cond = @crawl_queue.new_cond

    threads = []
    active_threads = 0
    crawl_complete = false

    @thread_count.times do |i|
      # Register/count each active thread
      @crawl_queue.synchronize do
        active_threads += 1
      end

      resources = nil
      url = nil

      threads << Thread.new do
        loop do
          # Synchronize on critical code which adds to the pages and queue
          @crawl_queue.synchronize do
            if resources
              update_pages_and_queue(url, resources)
              print_status(url)
            else
              # URL error, skip. Could add future functionality for n retries?
              @pages.delete url
            end

            # 1. An empty queue + no other threads running implies that we've
            #    completed the site crawl. Can be modified by all threads
            # 2. Wake up other threads, which will either process more urls or
            #    exit depending on 'crawl_complete' and queue state
            # 3. Wait until the queue is not empty or crawling is marked as complete
            # 4. Thread has woken up; exit if we're done crawling
            # 5. If not done, bump the active thread count and re-enter the loop
            crawl_complete = true if @crawl_queue.empty? and active_threads == 1
            crawl_queue_cond.broadcast unless @crawl_queue.empty? and !crawl_complete
            active_threads -= 1
            crawl_queue_cond.wait_while { @crawl_queue.empty? and !crawl_complete }
            Thread.exit if crawl_complete
            active_threads += 1

            url = @crawl_queue.shift
          end

          resources = crawl_url url
        end
      end
    end

    threads.each(&:join)
  end

  # Get the pages hash. Each entry contains a hash for the links and assets
  def get_pages
    @pages
  end

  private

  # Retrieves the HTML for the given url, extracts all links and assets and returns them in a hash
  def crawl_url(url)
    begin
      # URI.open comes from open-uri (Kernel#open no longer handles URLs in Ruby 3+)
      html = Nokogiri::HTML(URI.open(url).read)
    rescue StandardError => e
      puts "Error reading #{url} :: #{e}" if @verbose
      return nil
    end

    links = html.css('a').map { |link| process_url link['href'] }.compact
    assets = html.css('link').map { |link| process_url link['href'] }.compact
    assets += html.css('img').map { |link| process_url link['src'] }.compact
    assets += html.css('script').map { |link| process_url link['src'] }.compact

    return { links: links.uniq, assets: assets.uniq }
  end

  # Given a url, clean it up
  # - Remove any hash fragments or query parameters
  # - Discard external links, mailtos, tels, javascript triggers
  # - Ensure the returned URL is absolute
  def process_url(url)
    return nil if url.nil? or url.empty?

    url = url.gsub(/[#?].*/, '') # Strip hash fragments and query params
    url = url.gsub(/\/$/, '')    # Remove trailing slashes

    bad_matches = [
      /^(http(?!#{Regexp.escape @root_url.gsub("http","")})|\/\/)/, # Discard external links
      /^mailto/,     # Discard mailto links
      /^tel/,        # Discard telephone links
      /^javascript/  # Discard javascript triggers
    ]

    # Case statement keeps the rejection list slightly more open to extension
    case url
    when *bad_matches
      return nil
    else
      return URI.join(@root_url, url).to_s
    end
  end
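
  # Illustrative examples, assuming the root url is https://gocardless.com (as in
  # the timings above); the paths and addresses below are made up for illustration:
  #   process_url "/about?ref=nav#team"           #=> "https://gocardless.com/about"
  #   process_url "mailto:someone@example.com"    #=> nil
  #   process_url "http://external.example.com/"  #=> nil (external link)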

  # Output the current completions/total_queued to the console
  # Defaults to single-line-update but verbose (-v) mode triggers full output
  def print_status(url)
    done = @pages.values.compact.length.to_s.rjust(2, '0')
    total = @pages.length.to_s.rjust(2, '0')
    print "\r#{" "*80}\r" unless @verbose
    print "Crawled #{done}/#{total}: #{url}"
    print "\n" if @verbose
    STDOUT.flush
  end

  # Sets the page resources for the given URL and adds any new links
  # to the crawl queue
  def update_pages_and_queue(url, resources)
    @pages[url] = resources
    resources[:links].each do |link|
      unless @pages.has_key? link
        @crawl_queue.enq(link)
        @pages[link] = nil
      end
    end
  end
end
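
# Illustrative sketch (not executed by this script): the crawler can also be used
# programmatically. "https://example.com" and the option values are placeholders.
#
#   crawler = SimpleCrawler.new("https://example.com", thread_count: 4, verbose: true)
#   crawler.crawl
#   pages = crawler.get_pages # => { url => { links: [...], assets: [...] }, ... }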

# Gather Command Line Options
options = {}
options[:verbose] = false

opt_parser = OptionParser.new do |opt|
  opt.banner = "Usage: simple_crawler URL [OPTIONS]"
  opt.separator ""
  opt.separator "Options"

  opt.on("-t n", "--thread-count=n", OptionParser::DecimalInteger, "Process using a thread pool of size n") do |thread_count|
    options[:thread_count] = thread_count
  end

  opt.on("-v", "--verbose", "show all urls processed") do
    options[:verbose] = true
  end

  opt.on("-h", "--help", "help (show this)") do
    puts opt_parser
    exit
  end
end

# Run crawler
opt_parser.parse!

# Require domain
if ARGV.count < 1
  puts opt_parser
  exit
end

# Crawl domain URL
domain_url = ARGV[0]
c = SimpleCrawler.new(domain_url, options)
c.crawl

# Print pages hash
ap c.get_pages
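
# For reference, the printed hash takes the shape below (the urls are
# placeholders, not real crawl output):
#
#   {
#     "https://example.com"       => { links: [...], assets: [...] },
#     "https://example.com/about" => { links: [...], assets: [...] },
#     ...
#   }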