Created
August 13, 2012 05:23
-
-
Save bnagy/3337161 to your computer and use it in GitHub Desktop.
Trying to make a thread-safe Capybara/Poltergeist searcher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'uri'

require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'
# Process-wide Capybara settings: Poltergeist becomes the default driver and
# run_server is switched off (presumably because only remote sites are
# visited, so there is no local app to serve -- confirm against usage).
Capybara.configure do |capybara|
  capybara.default_driver = :poltergeist
  capybara.run_server     = false
end
module Searchers | |
# Signals that the page being scraped shows a CAPTCHA challenge;
# Search#search raises it when captcha? is true.
CaptchaError = Class.new(StandardError)
# Base class for scripted search-engine scrapers driven through Capybara.
#
# A subclass declares its start URL with the class-level +href+ macro and
# overrides +select_results+ (plus +setup+, +extract_url+, +next_page+ or
# +captcha?+ where the engine's markup differs from the defaults below).
class Search
  include Capybara::DSL

  # Fallback options for #search:
  #   lim   - maximum number of URLs to collect/yield.
  #   delay - upper bound, in seconds, of the random pause between pages.
  DEFAULTS = { lim: 100, delay: 3 }

  class << self
    # Class-level macro: with an argument it records the engine's start URL
    # on the receiving class; it always returns the currently recorded value
    # (nil when the class never set one).
    def href(href = nil)
      @href = href if href
      @href
    end
  end

  attr_reader :base_href, :current_session

  # href - absolute http(s) URL to start from; defaults to the URL declared
  #        on the class via the +href+ macro.
  #
  # Raises ArgumentError when the URL's scheme is not http or https.
  # Opens a dedicated Poltergeist session and visits the start page.
  def initialize(href = self.class.href)
    @base_href = URI(href)
    # Capybara requires all absolute URLs to start with http.
    # \A...\z anchors the whole scheme; to_s covers scheme-less (relative) URLs.
    unless @base_href.scheme.to_s =~ /\Ahttps?\z/i
      raise ArgumentError, "base_href must be of http(s) scheme"
    end
    # One session per actor, so instances do not share browser state.
    @current_session = Capybara::Session.new(:poltergeist)
    # URI reports a missing path as "" (truthy), so test for emptiness
    # explicitly to fall back to the root path.
    start_path = base_href.path.to_s
    visit(start_path.empty? ? '/' : start_path)
  end

  # Capybara::DSL sends every DSL call (visit, fill_in, all, ...) through
  # #page, whose stock implementation returns the process-wide
  # Capybara.current_session. Returning our own session here is what makes
  # each instance drive its own browser.
  def page
    current_session
  end

  # Navigates within the base site. Only the path component of +url+ is
  # used: it is written into +base_href+ (which is updated in place) and the
  # resulting absolute URL is handed to Capybara's visit.
  def visit(url)
    base_href.path = URI(url).path
    super(base_href.to_s)
  end

  # True when the current page contains a form field named/labelled "captcha".
  def captcha?
    page.has_field? "captcha"
  end

  # Submits +query+ through the field "q" and the "Search" button.
  def setup(query)
    fill_in "q", :with => query
    click_on "Search"
  end

  # Hook for subclasses: return the result link nodes of the current page.
  # The base implementation returns nil, which #search treats as "no results".
  def select_results; end

  # Maps one result node to the URL reported for it (its href attribute).
  def extract_url(result)
    result[:href]
  end

  # Advances to the following result page via the 'Next' link/button.
  def next_page
    click_on 'Next'
  end

  # Runs +query+ and walks the result pages.
  #
  # opts - overrides for DEFAULTS (:lim, :delay).
  #
  # With a block, each extracted URL is yielded and nil is returned;
  # without one, the collected URLs are returned as an Array.
  #
  # Paging stops when a page has no results, when :lim URLs have been
  # produced, or when any StandardError other than a Poltergeist
  # JavascriptError occurs (including CaptchaError); such errors are
  # reported with +warn+ and whatever was gathered so far is returned.
  def search(query, opts = {})
    setup query
    opts = DEFAULTS.merge(opts)
    urls = []
    url_count = 0
    loop do
      begin
        raise CaptchaError if captcha?
        results = select_results
        break if results.nil? || results.empty?
        results.each do |result|
          # Check the limit per result so a page cannot push us past :lim.
          break if url_count >= opts[:lim]
          url_count += 1
          if block_given?
            yield extract_url(result)
          else
            urls << extract_url(result)
          end
        end
        break if url_count >= opts[:lim]
        # Kernel#rand truncates a Float maximum to an Integer, so scale a
        # plain rand (Float in [0, 1)) to get a fractional pause below :delay.
        sleep(rand * opts[:delay].abs)
        next_page
      rescue Capybara::Poltergeist::JavascriptError
        # Deliberately ignored: the loop just runs again on the current page.
      rescue
        warn "#{self}:#{__method__}: #{$!}"
        break
      end
    end
    urls unless block_given?
  end
end
# Search specialisation for Google.
class Google < Search
  href 'https://www.google.com'

  # Result links: anchors inside <h3 class="r"> headings.
  def select_results
    all "h3.r a"
  end

  # Pulls the target out of a redirect link of the form
  # "/url?q=<target>&sa=...". The target travels as a query-string value,
  # so it is percent-decoded before being returned.
  #
  # Returns nil when the node has no href or the href is not in that form.
  def extract_url(result)
    encoded = result[:href].to_s[%r{/url\?q=(.*?)&sa}, 1]
    encoded && URI.decode_www_form_component(encoded)
  rescue ArgumentError
    # Malformed %-escape: hand back the raw value rather than abort the search.
    encoded
  end
end
# Search specialisation for Bing; everything except the start URL and the
# result selector is inherited from Search.
class Bing < Search
  href('http://www.bing.com')

  # Result links: every anchor nested in an <h3>.
  def select_results
    all('h3 a')
  end
end
# Search specialisation for DuckDuckGo, starting from its /html page;
# everything except the start URL and the result selector is inherited
# from Search.
class DuckDuckGo < Search
  href('http://www.duckduckgo.com/html')

  # Result links: anchors under div.links_deep within div.results_links.
  def select_results
    all('div.results_links div.links_deep a')
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment