Created
April 6, 2014 20:44
-
-
Save epitron/10011260 to your computer and use it in GitHub Desktop.
A wrapper around Mechanize which implements a caching HTTP client that doesn't blow up if it hits an error page (like a 404).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'digest'   # Digest::SHA1 is used for read-cache keys
require 'logger'
require 'mechanize'
require 'uri'      # URI.parse is used for relative_to and cached pages

require_relative 'random_agent'
require_relative 'path'
# Mechanize::Page subclasses Mechanize::File; Mechanize::Download is its own thing.
# module MimeInfo
#   def size
#     header["content-length"].to_i
#   end
#   def mime_type
#     header["content-type"] || "application/octet-stream"
#   end
#   def link_info
#     "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
#   end
# end
# Extensions mixed into every Mechanize parser class, adding cache/save
# bookkeeping plus convenience accessors for status, size, MIME type, and
# redirect information.
module Mechanize::Parser
  # HTTP status codes that indicate a redirect rather than an error.
  REDIRECT_CODES = [301, 302, 303, 307]

  # include MimeInfo

  # cached_response: truthy when this page was served from the local cache.
  # saved_response:  path the raw response was logged to, if any.
  attr_accessor :cached_response, :saved_response

  # Body size in bytes per the Content-Length header (0 when absent).
  def size
    header["content-length"].to_i
  end

  # Server-reported content type, defaulting to a generic binary type.
  def mime_type
    header["content-type"] || "application/octet-stream"
  end

  # One-line human-readable summary of type and (when known) size.
  def link_info
    "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
  end

  # Numeric HTTP status code.
  def code
    @code.to_i
  end

  # The request URI as a string.
  def url
    @uri.to_s
  end

  # True for any non-200 status that is not a redirect.
  def error?
    code != 200 && !redirect?
  end

  # True when the status code is one of REDIRECT_CODES.
  def redirect?
    REDIRECT_CODES.include?(code)
  end

  # Target of the redirect (the Location header), if any.
  def redirect
    header["location"]
  end

  # "addr:port" of the proxy in use, or nil when not proxied.
  def proxy
    "#{mech.proxy_addr}:#{mech.proxy_port}" if mech.proxy_addr && mech.proxy_port
  end

  # Render the response as a pseudo HTTP transcript: request line, blank
  # line, status line, canonicalized headers, blank line, then the body.
  def body_with_headers
    request_line = "GET #{uri}"
    request_line += " (proxy: #{proxy})" if proxy
    request_line += " [cached]" if cached_response?

    lines = [request_line, "", "HTTP/1.1 #{code}"]
    canonical_each { |name, value| lines << "#{name}: #{value}" }
    lines << "" << body

    lines.join("\n")
  end

  # Truthy when this response came from the read cache.
  def cached_response?
    cached_response
  end
  alias_method :cached?, :cached_response?
end
# A caching HTTP client built on Mechanize.
#
# * Every HTML response is logged (headers + body) into a dated directory
#   under CACHEDIR, with a "today" symlink kept pointing at the current day.
# * GETs may be served from a disk read-cache keyed by SHA1(url), with an
#   optional expiry (cache_timeout, in seconds).
# * With raise_exceptions: false, error pages (404, 503, ...) are returned
#   as ordinary responses instead of raising Mechanize::ResponseCodeError.
class HttpClient
  # Root directory for the response log and the read cache.
  CACHEDIR = Path["pagecache/"]

  # @param random_agent     [Boolean]  use a random User-Agent string
  # @param retries          [Integer]  max attempts on 503s (honored only when a proxy is set)
  # @param cookie_file      [String, nil] path for loading/saving cookies
  # @param verbose          [Boolean, nil] accepted for compatibility; currently unused
  # @param proxy            [nil, Object] responds to host/port/username/password
  # @param relative_to      [String, nil] base URL resolved against relative request URLs
  # @param cache_timeout    [Numeric, nil] default read-cache TTL in seconds
  # @param raise_exceptions [Boolean]  raise on HTTP error codes
  # @param follow_redirects [Boolean]  follow 3xx responses automatically
  # @param logger           [Logger, nil] defaults lazily to Rails.logger
  def initialize( random_agent: true,
                  retries: 0,
                  cookie_file: nil,
                  verbose: nil,
                  proxy: nil,
                  relative_to: nil,
                  cache_timeout: nil,
                  raise_exceptions: true,
                  follow_redirects: true,
                  logger: nil )
    @cookie_file      = cookie_file
    @proxy            = proxy
    @relative_to      = URI.parse(relative_to) if relative_to
    @cache_timeout    = cache_timeout
    @raise_exceptions = raise_exceptions
    @follow_redirects = follow_redirects
    @logger           = logger
    @random_agent     = random_agent
    @retries          = @proxy ? retries : 0 # only retry when using a proxy
    @lock             = Mutex.new # reserved for serializing cookie-file writes (see TODO in #get)
  end

  # Lazily-resolved logger. NOTE(review): falls back to Rails.logger, which
  # only exists inside a Rails process -- pass logger: explicitly elsewhere.
  def logger
    @logger ||= Rails.logger #Logger.new $stdout
  end

  # Memoized Mechanize agent.
  def client
    @client ||= new_client
  end

  # Build a fresh Mechanize agent configured from the constructor options.
  def new_client
    Mechanize.new do |a|
      # TODO: Cleanup Proxy model (remove "useragent", rename "url" to "host", replace fields whose values are "NULL" or "" with nil)
      if @proxy
        a.set_proxy(@proxy.host, @proxy.port, @proxy.username, @proxy.password)
      end

      if @random_agent
        a.user_agent = RandomAgent.get
      else
        a.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4"
      end

      a.max_history  = 0
      a.log          = logger
      a.verify_mode  = OpenSSL::SSL::VERIFY_NONE # accepts any TLS cert; deliberate for scraping, but insecure
      a.idle_timeout = 30
      a.read_timeout = 30

      if @follow_redirects
        a.redirect_ok = true
      else
        a.redirect_ok = false
        # Treat redirects as ordinary responses rather than errors.
        a.agent.allowed_error_codes += Mechanize::Parser::REDIRECT_CODES
      end

      # File.exists? was removed in Ruby 3.2 -- use File.exist?.
      a.cookie_jar.load(@cookie_file) if @cookie_file and File.exist?(@cookie_file)
    end
  end

  # Directory for today's response log (CACHEDIR/YYYY-MM-DD), created on demand.
  def ymd_path
    path = CACHEDIR/ymd
    unless path.exists?
      path.mkdir_p
      puts "* Created #{path}"
    end
    path
  end

  # CACHEDIR/"today" symlink, (re)pointed at ymd_path when the day rolls over.
  def today_path
    today = CACHEDIR/"today"
    if not today.exists?
      ymd_path.ln_s today
    elsif ymd_path.dirs.last != today.symlink_target.dirs.last
      today.rm
      ymd_path.ln_s today
      puts "* Symlinked #{today} to #{ymd_path}"
    end
    today
  end

  # Current date, e.g. "2014-04-06".
  def ymd
    Time.now.strftime("%Y-%m-%d")
  end

  # Current time with milliseconds; used as a log filename.
  def timestamp
    Time.now.strftime("%H:%M:%S.%L")
  end

  # Directory holding the read cache, created on demand.
  def read_cache_path
    path = CACHEDIR/"readcache"
    path.mkdir_p unless path.exists?
    path
  end

  # Cache key: SHA1 hex digest of the URL string.
  def url_hash(url)
    Digest::SHA1.hexdigest(url.to_s)
  end

  # Read-cache file location for a URL.
  def cache_file_for(url)
    read_cache_path/"#{url_hash(url)}.html"
  end

  # Return a cached Mechanize::Page for `url`, or nil on a cache miss.
  # Entries older than `cache_timeout` seconds are evicted first.
  def get_cached_response_for(url, cache_timeout)
    url  = @relative_to + url if @relative_to
    file = cache_file_for(url)
    # was a bare `p url:, cache:` -- keep debug noise off stdout
    logger.debug "cache lookup: #{url} => #{file}"

    if file.exists? and cache_timeout
      expiry_date = file.mtime + cache_timeout
      file.rm if Time.now > expiry_date
    end

    return nil unless file.exists?

    # Synthesize a 200 text/html page from the cached body.
    Mechanize::Page.new(
      URI.parse(url),                  # uri
      {'content-type'=>"text/html"},   # response headers
      file.read,                       # body
      200,                             # code
      new_client                       # mechanize instance
    ).tap { |r| r.cached_response = true }
  end

  # Write the full response (headers + body) to today's log directory under a
  # millisecond timestamp, recording where it went on the response itself.
  def log_response(response)
    logpath = nil
    loop do
      logpath = today_path/"#{timestamp}.html"
      break if not logpath.exists? # regenerate the timestamp on collision
    end
    logpath.write response.body_with_headers
    response.saved_response = logpath
    response
  end

  # Persist a successful (200) response body into the read cache.
  def save_to_cache(response, url)
    return unless response.code == 200

    #url = response.uri.to_s
    cachefile = cache_file_for(url)
    cachefile.write response.body
    logger.info "Caching url: #{url}"
    logger.info "Cache file: #{cachefile}"
  end

  # GET `url`, serving from the read cache when `cache_timeout` (or the
  # client-wide default) allows. Non-Page responses (e.g. file downloads)
  # are returned immediately without logging or caching.
  #
  # @param url           [String, URI]
  # @param cache_timeout [Numeric, nil] TTL override for this request
  # @return [Mechanize::Page, Mechanize::File, Mechanize::Download]
  # @raise [Mechanize::ResponseCodeError] on HTTP errors when raise_exceptions is on
  def get(url, cache_timeout: nil)
    # TODO: Ensure that there's no race condition in saving the cookie file
    # TODO: follow_redirects should disable exceptions for 30{1,2,7}
    url = url.to_s
    cache_timeout ||= @cache_timeout

    infoline = "--- HTTP GET: #{url}"
    infoline << " (cache timeout: #{cache_timeout})" if cache_timeout
    infoline << " --------------------------------------------"
    puts infoline

    response = get_cached_response_for(url, cache_timeout) if cache_timeout

    unless response
      # when google blocks us, it raises: Mechanize::ResponseCodeError (503 => Net::HTTPServiceUnavailable for http://scholar.google.com/sorry/... -- unhandled response)
      tries = 0
      response = begin
        tries += 1
        client.get(url)
      rescue Mechanize::ResponseCodeError => e
        # Bounded retry on 503 (rate limiting); @retries is forced to 0
        # unless a proxy is in use, so this is proxy-only.
        retry if @retries && e.page.code == 503 && tries < @retries
        raise e if @raise_exceptions
        e.page # hand the error page back instead of raising
      end

      # Non-HTML payloads (Mechanize::File/Download) are returned untouched.
      return response if not response.is_a? Mechanize::Page

      response = log_response(response)
      save_to_cache(response, url) if cache_timeout
    end

    client.cookie_jar.save(@cookie_file) if @cookie_file

    puts response.link_info
    puts
    response
  end

  # One-shot convenience: build a throwaway client, GET, return the page.
  # Constructor options (:cookie_file, :random_agent, :logger) are split out;
  # everything else (e.g. :cache_timeout) is forwarded to #get.
  #
  # Uses stdlib Hash#slice instead of ActiveSupport's Hash#extract! (which
  # was only loaded by the demo branch), and splats the hashes explicitly so
  # it works under Ruby 3's keyword-argument separation.
  def self.get(url, **opts)
    client_keys  = [:cookie_file, :random_agent, :logger]
    opts_for_new = opts.slice(*client_keys)
    get_opts     = opts.reject { |k, _| client_keys.include?(k) }
    new(**opts_for_new).get(url, **get_opts)
  end
end
# Allow the all-caps spelling as well.
HTTPClient = HttpClient

# Demo / smoke test when executed directly: fetch Slashdot with a 15-second
# read-cache window and dump some stats about the response.
if $0 == __FILE__
  require 'active_support/core_ext' # for 15.seconds

  puts "* Getting page..."

  http = HTTPClient.new(cache_timeout: 15.seconds)
  # doc = HTTPClient.get("http://slashdot.org/", cache_timeout: 15.seconds)
  page = http.get("http://slashdot.org/")

  p size: page.body.size, code: page.code, url: page.uri, cached: page.cached?
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.