# A web crawler in Ruby
#
# This script provides a generic Spider class for crawling urls and
# recording data scraped from websites. The Spider is to be used in
# collaboration with a "processor" class that defines which pages to
# visit and how data from those pages should be consumed. In this example
# the processor is ProgrammableWeb.
#
# Usage:
#   spider = ProgrammableWeb.new
#   spider.results.take(10)
#   # => [{...}, {...}, ...]
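#
#   # Results can also be streamed with a block as pages are scraped
#   # (a usage sketch; keys depend on the fields scraped from each page):
#   spider.results { |api| puts api[:name] }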
#
# Requirements:
#   Ruby 2.0+
#
require "mechanize"
require "pry"

class Spider
  REQUEST_INTERVAL = 1
  MAX_URLS = 1000

  attr_reader :handlers

  def initialize(processor, options = {})
    @processor = processor

    @results  = []
    @urls     = []
    @handlers = {}

    @interval = options.fetch(:interval, REQUEST_INTERVAL)
    @max_urls = options.fetch(:max_urls, MAX_URLS)

    enqueue(@processor.root, @processor.handler)
  end
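
  # Add a url to the crawl queue along with the processor method that will
  # handle the fetched page and any data to pass through; urls that already
  # have a handler are skipped.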
  def enqueue(url, method, data = {})
    return if @handlers[url]
    @urls << url
    @handlers[url] ||= { method: method, data: data }
  end
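
  # Called by the processor to store a scraped result.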
  def record(data = {})
    @results << data
  end
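
  # Crawl lazily: without a block, return an Enumerator; with a block, fetch
  # each enqueued url, hand the page to its handler, and yield any result
  # recorded for it, sleeping between requests to stay polite.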
  def results
    return enum_for(:results) unless block_given?

    i = @results.length

    enqueued_urls.each do |url, handler|
      begin
        log "Handling", url.inspect
        @processor.send(handler[:method], agent.get(url), handler[:data])

        if block_given? && @results.length > i
          yield @results.last
          i += 1
        end
      rescue => ex
        log "Error", "#{url.inspect}, #{ex}"
      end

      sleep @interval if @interval > 0
    end
  end

  private
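
  # Enumerate the queue by index so that urls enqueued while crawling are
  # still picked up, stopping at the @max_urls limit.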
  def enqueued_urls
    Enumerator.new do |y|
      index = 0
      while index < @urls.count && index <= @max_urls
        url = @urls[index]
        index += 1
        next unless url
        y.yield url, @handlers[url]
      end
    end
  end

  def log(label, info)
    warn "%-10s: %s" % [label, info]
  end
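
  # Memoized Mechanize agent reused across all requests.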
  def agent
    @agent ||= Mechanize.new
  end
end
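
# ProgrammableWeb is the example processor: it supplies the starting url and
# handler, and its handler methods decide which links to enqueue and which
# data to record.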
class ProgrammableWeb
  attr_reader :root, :handler

  def initialize(root: "https://programmableweb.com/apis/directory", handler: :process_index, **options)
    @root = root
    @handler = handler
    @options = options
  end
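
  # Directory index pages: enqueue the pagination links (handled again by
  # this method) and each API detail page, passing the link text along as
  # the api's name.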
  def process_index(page, data = {})
    page.links_with(href: /\?page=\d+/).each do |link|
      spider.enqueue(link.href, :process_index)
    end

    page.links_with(href: %r{/api/\w+$}).each do |link|
      spider.enqueue(link.href, :process_api, name: link.text)
    end
  end
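
  # API detail pages: scrape the category tags and the labelled fields from
  # the tabbed content, then record them merged with the data passed along
  # from process_index.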
  def process_api(page, data = {})
    categories = page.search("article.node-api .tags").first.text.strip.split(/\s+/)
    fields = page.search("#tabs-content .field").each_with_object({}) do |tag, results|
      key = tag.search("label").text.strip.downcase.gsub(/[^\w]+/, ' ').gsub(/\s+/, "_").to_sym
      val = tag.search("span").text
      results[key] = val
    end

    spider.record data.merge(fields).merge(categories: categories)
  end
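
  # Enumerate scraped results by delegating to the spider.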
  def results(&block)
    spider.results(&block)
  end

  private
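
  # Lazily build the Spider, passing this object in as the processor.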
  def spider
    @spider ||= Spider.new(self, @options)
  end
end
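
# A hypothetical second processor, sketched to show the contract Spider
# expects: a #root url, a #handler symbol, and handler methods that accept
# (page, data) and call spider.enqueue or spider.record. The url and CSS
# selectors below are placeholders rather than a real site's markup.
class ExampleBlog
  attr_reader :root, :handler

  def initialize(root: "https://example.com/articles", handler: :process_index, **options)
    @root = root
    @handler = handler
    @options = options
  end

  def process_index(page, data = {})
    # Follow pagination links back into this handler.
    page.links_with(href: /\?page=\d+/).each do |link|
      spider.enqueue(link.href, :process_index)
    end

    # Enqueue each article page, carrying the link text along as the title.
    page.links_with(href: %r{/articles/[\w-]+$}).each do |link|
      spider.enqueue(link.href, :process_article, title: link.text)
    end
  end

  def process_article(page, data = {})
    # Placeholder selector; record whatever fields the page actually exposes.
    spider.record data.merge(author: page.search(".author").text.strip)
  end

  def results(&block)
    spider.results(&block)
  end

  private

  def spider
    @spider ||= Spider.new(self, @options)
  end
end

# Demo: when run directly, crawl the directory and print the first five results.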
if __FILE__ == $0
  spider = ProgrammableWeb.new
  spider.results.lazy.take(5).each_with_index do |result, i|
    warn "%-2s: %s" % [i, result.inspect]
  end
end