@abhisek
Created September 19, 2017 12:11
Solution for D11 crawler problem
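
# Concurrent crawler: starting from BASE_URL it follows every <a class="link">
# href via Typhoeus::Hydra, collects the text of each <h1> it encounters, and
# prints the lexicographically smallest string once the crawl completes.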
require 'rubygems'
require 'bundler'
Bundler.require(:default)

# Explicit requires for the libraries used below.
require 'typhoeus'
require 'nokogiri'
require 'uri'
require 'set'
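
# Assumption: the Gemfile loaded by Bundler.require(:default) provides at
# least the gems used here, for example:
#
#   source 'https://rubygems.org'
#   gem 'typhoeus'
#   gem 'nokogiri'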
BASE_URL = 'http://127.0.0.1:8000/'

# Rate limiter scaffold: window (ms), delay-after count, per-request delay (ms)
# and a request cap. Its throttle handling is commented out below, so 429s are
# handled by simply re-queueing the request in the crawler.
class RateMan
  def initialize(wms, daf, dms, max)
    @winms       = wms
    @delay_after = daf
    @delay_ms    = dms
    @max         = max
    @req_count   = 0
    @start_t     = Time.now
    @in_throttle = false
  end

  def on_response
    # Disabled: count responses and start pacing near the request cap.
    # @in_throttle = false
    # @req_count += 1
    # sleep 0.1 if @req_count > 498
  end

  def on_throttle
    # Disabled: sleep out the remainder of the rate-limit window after a 429.
    # return if @in_throttle
    # t = 10.to_f - (Time.now - @start_t).to_f
    # if t > 0
    #   log("Sleeping #{t} to cover throttle time")
    #   sleep t
    #   @start_t = Time.now
    #   @in_throttle = true
    # end
  end
end
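
# Global rate limiter; its callbacks are currently no-ops (see above).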
$rm = RateMan.new(10000, 250, 100, 500)

def log(msg)
  # puts "[LOG] #{msg}"
end

# Build an absolute URI for a site-relative path.
def get_uri(path)
  u = ::URI.parse(BASE_URL)
  u.path = path
  u
end
$visitables ||= Set.new
$visited ||= Set.new
$lowest_rank_str = "zzzzzzzzzzzzz"
$conn_c = 0

# Track the lexicographically smallest string seen so far.
def add_string(s)
  $lowest_rank_str = s if s < $lowest_rank_str
  # log("Current sol: #{$lowest_rank_str}")
end

def add_visitable(href)
  $visitables << href unless $visited.include?(href)
end

def is_visitable?(href)
  !$visited.include?(href)
end

def add_visited(href)
  $visited << href
end

# Extract candidate strings from <h1> tags and follow every unvisited
# <a class="link"> href found in the response body.
def process_response(h, res)
  html = ::Nokogiri::HTML(res.body)

  html.css('h1').each do |e|
    add_string(e.text)
  end

  html.css('a.link').each do |e|
    target = e.attr('href')
    visit(h, target) if is_visitable?(target)
  end
end

# Queue an asynchronous GET for `path` on the hydra and mark it visited.
def visit(h, path)
  # log("Visiting path: #{path}")
  req = Typhoeus::Request.new(get_uri(path).to_s,
                              verbose: false,
                              headers: {
                                # Randomise the spoofed client IP per request.
                                'X-Forwarded-For' => "%d.%d.%d.%d" %
                                  [rand(255), rand(255), rand(255), rand(255)]
                              })

  req.on_complete do |res|
    if res.success?
      # puts "Got body of size: #{res.body.size}"
      $conn_c += 1
      log("Crawled path: #{path} [#{$conn_c}]")
      process_response(h, res)
      $rm.on_response
    elsif res.timed_out?
      log("HTTP timeout! Requeueing path: #{path}")
      visit(h, path)
    elsif res.code.to_i == 429
      log("Throttled response. Requeueing path: #{path}")
      $rm.on_throttle
      visit(h, path) # Re-visit the node since we were throttled
    else
      log("HTTP error code: #{res.code}")
    end
  end

  h.queue(req)
  add_visited(path)
end

if __FILE__ == $0
  log("Starting crawler.. ")
  hydra = Typhoeus::Hydra.new(max_concurrency: 250)
  visit(hydra, '/')
  hydra.run
  puts $lowest_rank_str
end
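
# Usage sketch (assuming the script is saved as crawler.rb and the server from
# BASE_URL is running locally):
#   bundle exec ruby crawler.rb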