Skip to content

Instantly share code, notes, and snippets.

@akeyhero
Last active February 28, 2020 10:43
Show Gist options
  • Save akeyhero/1f7104fa1093b2586f42f1434e3fc83f to your computer and use it in GitHub Desktop.
Save akeyhero/1f7104fa1093b2586f42f1434e3fc83f to your computer and use it in GitHub Desktop.
Crawler base class in Ruby
# frozen_string_literal: true
require 'csv'
require 'fileutils'
require 'nkf'
require 'open-uri'

require 'nokogiri'
# Base class for simple two-phase crawlers.
#
# Phase 1 (#download) fetches every URL returned by #fetch_urls and stores the
# raw responses under #cache_dir together with a +manifest.csv+ that maps each
# cached file name to its source URL.
#
# Phase 2 (#run) re-reads the cached files in manifest order, parses them and
# writes the rows produced by #get_lines to standard output as CSV.
#
# Subclasses must implement the private hooks #fetch_urls, #get_header and
# #get_lines.
class CrawlerBase
  # TODO: they should be optional.
  # Maximum number of attempts per URL before the error is re-raised.
  MAX_TRY_COUNT = 3
  # Minimum number of seconds between two successful fetches.
  INTERVAL = 1.0
  # Seconds to wait before retrying after a failed attempt.
  RETRY_WAIT = 10

  # Downloads every URL from #fetch_urls into the cache directory.
  #
  # Writes +manifest.csv+ (rows of +["<n>.html", url]+, n starting at 1) first,
  # then one +<n>.html+ file per URL. Progress is reported on stderr.
  #
  # @return [void]
  def download
    urls = fetch_urls
    # mkdir_p also creates the parent "tmp" directory when it is missing and
    # is a no-op when the directory already exists.
    FileUtils.mkdir_p(cache_dir)

    CSV.open(cache_path('manifest.csv'), 'w') do |csv|
      urls.each.with_index(1) { |url, i| csv << ["#{i}.html", url] }
    end

    urls.each.with_index(1) do |url, i|
      File.write(cache_path("#{i}.html"), fetch_html(url))
      warn "Fetched ##{i}"
    end
  end

  # Converts the cached documents to CSV on standard output.
  #
  # Emits #get_header once, then every row returned by #get_lines for each
  # cached document. Output is flushed after each document.
  #
  # @return [void]
  def run
    csv = CSV.new($stdout)
    csv << get_header
    with_parsed_doc do |doc, i|
      get_lines(doc, i).each { |line| csv << line }
      $stdout.flush
    end
  end

  private

  # Hook: the list of URLs to download.
  #
  # @return [Enumerable<String>]
  # @raise [NotImplementedError] unless overridden by the subclass
  def fetch_urls
    raise NotImplementedError, "#{self.class.name} must implement #{__method__}"
  end

  # Hook: the CSV header row.
  #
  # @return [Array]
  # @raise [NotImplementedError] unless overridden by the subclass
  def get_header
    raise NotImplementedError, "#{self.class.name} must implement #{__method__}"
  end

  # Hook: the CSV rows extracted from one parsed document.
  #
  # @param doc [Object] the value returned by #parse
  # @param i [Integer] 1-based position of the document in the manifest
  # @return [Enumerable<Array>] rows to append to the CSV output
  # @raise [NotImplementedError] unless overridden by the subclass
  def get_lines(doc, i)
    raise NotImplementedError, "#{self.class.name} must implement #{__method__}"
  end

  # Parses an HTML string with Nokogiri.
  #
  # @param html [String]
  # @return [Nokogiri::HTML::Document]
  def parse(html)
    Nokogiri::HTML.parse(html)
  end

  # Fetches +url+ and returns the response body.
  #
  # Waits so that at least INTERVAL seconds separate this request from the
  # previous successful one. A SystemCallError is retried after RETRY_WAIT
  # seconds, up to MAX_TRY_COUNT attempts in total; the last error is re-raised.
  #
  # @param url [String]
  # @return [String] the response body
  # @raise [SystemCallError] when every attempt failed
  def fetch_html(url)
    if @last_fetch_time
      remaining = INTERVAL - (Time.now - @last_fetch_time)
      sleep remaining if remaining.positive?
    end

    attempts_left = MAX_TRY_COUNT
    begin
      # URI.open instead of Kernel#open: Kernel#open no longer handles URLs on
      # Ruby 3.0+ and would run "|command" strings as shell commands.
      body = URI.open(url, &:read)
      @last_fetch_time = Time.now
      body
    rescue SystemCallError => e
      attempts_left -= 1
      raise if attempts_left <= 0

      warn "Error #{e.message}"
      sleep RETRY_WAIT
      retry
    end
  end

  # Reads a cached file and normalizes it with NKF
  # (-w: convert to UTF-8, -Lu: convert line endings to LF).
  #
  # @param file_name [String] name of a file inside #cache_dir
  # @return [String]
  def read_html(file_name)
    NKF.nkf('-wLu', File.read(cache_path(file_name)))
  end

  # Lists the cached file names in manifest order.
  #
  # @return [Array<String>] first column of every +manifest.csv+ row
  def get_cache_file_names
    CSV.foreach(cache_path('manifest.csv')).map(&:first)
  end

  # Yields every cached document, parsed, together with its 1-based position
  # in the manifest (the same number used for its +<n>.html+ file name).
  #
  # @yieldparam doc [Object] the value returned by #parse
  # @yieldparam i [Integer] 1-based index of the document
  # @return [void]
  def with_parsed_doc
    get_cache_file_names.each.with_index(1) do |file_name, i|
      yield parse(read_html(file_name)), i
    end
  end

  # @param file_name [String]
  # @return [String] path of +file_name+ inside #cache_dir
  def cache_path(file_name)
    "#{cache_dir}/#{file_name}"
  end

  # @return [String] per-class cache directory, relative to the working directory
  def cache_dir
    "tmp/#{self.class.name}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment