# Gist abhisek/e204bfc97b31b0bbce609689708f7480 -- created September 19, 2017 12:11
# Solution for D11 crawler problem
# GitHub notice: this file may contain hidden or bidirectional Unicode text that
# may be interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
require 'rubygems' | |
require 'bundler' | |
Bundler.require(:default) | |
require 'nokogiri' | |
require 'uri' | |
require 'set' | |
# Root URL of the site being crawled; get_uri parses it and swaps in the
# requested path. Frozen so the shared constant cannot be mutated in place.
BASE_URL = 'http://127.0.0.1:8000/'.freeze
# Placeholder rate manager for the crawler.
#
# The constructor records the throttling parameters and a start time, but both
# hooks are deliberately no-ops: the throttling logic that used to live in them
# was disabled by the author. (The disabled code counted responses and, on the
# first throttle signal, slept out the remainder of a 10-second window.)
class RateMan
  # @param wms  stored as @winms
  # @param daf  stored as @delay_after
  # @param dms  stored as @delay_ms
  # @param max  stored as @max
  #
  # None of the stored values is read by any method of this class at present.
  def initialize(wms, daf, dms, max)
    @winms = wms
    @delay_after = daf
    @delay_ms = dms
    @max = max
    @req_count = 0
    @start_t = Time.now
    @in_throttle = false
  end

  # Hook invoked after a successful response. Intentionally does nothing.
  #
  # @return [nil]
  def on_response; end

  # Hook invoked when a response signals throttling (HTTP 429).
  # Intentionally does nothing.
  #
  # @return [nil]
  def on_throttle; end
end

# Shared rate manager consulted by the request callbacks.
$rm = RateMan.new(10_000, 250, 100, 500)
# Debug logger.
#
# Silent by default. When the CRAWLER_DEBUG environment variable is set, the
# message is written to standard error (not standard output, so the final
# answer printed by the script stays clean).
#
# @param msg [#to_s] text to log
# @return [nil]
def log(msg)
  $stderr.puts "[LOG] #{msg}" if ENV['CRAWLER_DEBUG']
end
# Builds the URI for +path+ on the crawl target.
#
# BASE_URL is parsed afresh on every call, so each caller gets its own URI
# object whose path component has been replaced by +path+.
#
# @param path [String] path component to request, e.g. "/" or "/foo"
# @return [URI::Generic] the parsed BASE_URL with its path set to +path+
def get_uri(path)
  ::URI.parse(BASE_URL).tap { |uri| uri.path = path }
end
# Crawl state shared by the top-level helper functions.

# Paths collected by add_visitable.
$visitables ||= Set.new
# Paths that have already been queued for fetching (see add_visited).
$visited ||= Set.new
# Smallest heading text seen so far, by String#<. The initial value is a
# sentinel: add_string only replaces it with a string that sorts strictly
# below it, so any heading sorting at or above the sentinel is ignored.
$lowest_rank_str = "zzzzzzzzzzzzz"
# Number of successful responses processed so far.
$conn_c = 0
# Records +s+ as the current answer when it sorts strictly below the smallest
# string seen so far ($lowest_rank_str).
#
# @param s [String] candidate string
# @return [String, nil] +s+ when it became the new minimum, otherwise nil
def add_string(s)
  $lowest_rank_str = s if s < $lowest_rank_str
end
# Adds +href+ to the $visitables set unless it is already in $visited.
#
# @param href [String] link target
# @return [Set, nil] the $visitables set when +href+ was added, otherwise nil
def add_visitable(href)
  $visitables << href unless $visited.include?(href)
end
# Tells whether +href+ still needs to be crawled.
#
# @param href [String] link target
# @return [Boolean] true when +href+ is not yet in $visited
def is_visitable?(href)
  !$visited.include?(href)
end
# Marks +href+ as visited by adding it to the $visited set.
#
# @param href [String] link target
# @return [Set] the $visited set
def add_visited(href)
  $visited << href
end
# Extracts candidate answers and outgoing links from a fetched page.
#
# Every <h1> text is offered to add_string, and every <a class="link"> whose
# target has not been visited yet is queued through visit.
#
# @param h   the request queue handed on to visit
# @param res response object; only its #body is read here
# @return the result of iterating the matched anchors
def process_response(h, res)
  html = ::Nokogiri::HTML(res.body)

  html.css('h1').each { |heading| add_string(heading.text) }

  html.css('a.link').each do |anchor|
    target = anchor.attr('href')
    # An anchor without an href yields nil (and an empty href points back at
    # the current page); neither is a path worth crawling, and passing nil on
    # would record nil in the visited set and fetch the bare base URL.
    next if target.nil? || target.empty?

    visit(h, target) if is_visitable?(target)
  end
end
# Returns a random dotted-quad string ("a.b.c.d") with every octet drawn from
# the full 0..255 range.
def random_ipv4
  Array.new(4) { rand(256) }.join('.')
end

# Queues a GET for +path+ on +h+ and marks the path as visited.
#
# Each request carries a random X-Forwarded-For header (presumably so the
# server's rate limiter sees every request as a different client -- confirm
# against the target server).
#
# When the request completes:
# * success   -> bump $conn_c, process the page, notify $rm.on_response
# * timeout   -> queue the same path again
# * HTTP 429  -> notify $rm.on_throttle, then queue the same path again
# * otherwise -> log the status code and drop the path
#
# NOTE: timeouts and 429s are requeued without any retry limit.
#
# @param h    request queue; must respond to #queue
# @param path [String] path to fetch, resolved through get_uri
# @return the value of add_visited(path)
def visit(h, path)
  req = Typhoeus::Request.new(
    get_uri(path).to_s,
    verbose: false,
    headers: { 'X-Forwarded-For' => random_ipv4 }
  )

  req.on_complete do |res|
    if res.success?
      $conn_c += 1
      log("Crawled path: #{path} [#{$conn_c}]")
      process_response(h, res)
      $rm.on_response
    elsif res.timed_out?
      log("HTTP timeout! Requeing path: #{path}")
      visit(h, path)
    elsif res.code.to_i == 429
      log("Throttled response. Requeing path: #{path}")
      $rm.on_throttle
      visit(h, path) # Re-visit the node since we were throttled
    else
      log("HTTP error code: #{res.code}")
    end
  end

  h.queue(req)
  add_visited(path)
end
# Script entry point (skipped when the file is loaded rather than executed):
# crawl outwards from the root path with up to 250 concurrent requests, then
# print the smallest <h1> text collected by add_string.
if __FILE__ == $PROGRAM_NAME
  log("Starting crawler.. ")
  hydra = Typhoeus::Hydra.new(max_concurrency: 250)
  visit(hydra, '/')
  hydra.run
  puts $lowest_rank_str
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment