Last active
February 28, 2020 10:43
-
-
Save akeyhero/1f7104fa1093b2586f42f1434e3fc83f to your computer and use it in GitHub Desktop.
Crawler base class in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # frozen_string_literal: true | |
require 'csv'
require 'fileutils'
require 'nkf'
require 'open-uri'

require 'nokogiri'
# Base class for simple two-phase crawlers.
#
# Phase 1 (#download) fetches every URL returned by +fetch_urls+ and stores
# the raw pages, plus a +manifest.csv+ mapping file names to URLs, under
# +cache_dir+. Phase 2 (#run) parses the cached pages and prints the rows
# produced by +get_lines+ as CSV on standard output.
#
# Subclasses must implement +fetch_urls+, +get_header+ and +get_lines+.
# The tuning constants below are read through +self.class+, so a subclass
# can override any of them simply by redefining the constant.
class CrawlerBase
  # Total number of attempts made for one URL before the error is re-raised.
  MAX_TRY_COUNT = 3
  # Minimum number of seconds between a successful fetch and the next request.
  INTERVAL = 1.0
  # Number of seconds to wait before retrying a failed request.
  RETRY_WAIT = 10

  # Fetches every URL and caches the pages as "1.html", "2.html", ... together
  # with a manifest.csv whose rows are [file_name, url].
  #
  # The manifest is written before any page is fetched. Progress is reported
  # on standard error.
  #
  # @return [Object] the collection returned by +fetch_urls+
  def download
    urls = fetch_urls
    # mkdir_p also creates missing parents (e.g. "tmp") and is a no-op when
    # the directory already exists.
    FileUtils.mkdir_p(cache_dir)

    CSV.open(cache_path('manifest.csv'), 'w') do |csv|
      urls.each.with_index(1) { |url, i| csv << ["#{i}.html", url] }
    end

    urls.each.with_index(1) do |url, i|
      # Store the bytes untouched so that encoding detection in read_html
      # sees exactly what the server sent.
      File.binwrite(cache_path("#{i}.html"), fetch_html(url))
      warn "Fetched ##{i}"
    end
  end

  # Parses every cached page and writes the header row followed by all rows
  # from +get_lines+ to $stdout as CSV, flushing after each page.
  def run
    CSV($stdout) do |csv|
      csv << get_header
      with_parsed_doc do |doc, i|
        get_lines(doc, i).each { |line| csv << line }
        $stdout.flush
      end
    end
  end

  private

  # @abstract
  # @return [Enumerable<String>] the URLs to download, in order
  def fetch_urls
    raise NotImplementedError, "#{self.class} must implement #{__method__}"
  end

  # @abstract
  # @return [Array] the CSV header row
  def get_header
    raise NotImplementedError, "#{self.class} must implement #{__method__}"
  end

  # @abstract
  # @param doc [Object] the value returned by +parse+ for one cached page
  # @param i [Integer] 1-based position of the page in the manifest
  # @return [Enumerable<Array>] the CSV rows extracted from the page
  def get_lines(doc, i)
    raise NotImplementedError, "#{self.class} must implement #{__method__}"
  end

  # Turns an HTML string into a Nokogiri document.
  def parse(html)
    Nokogiri::HTML.parse html
  end

  # Fetches +url+ and returns its body, throttling consecutive requests and
  # retrying on system-call level failures.
  #
  # @param url [String] the resource to read
  # @return [String] the response body
  # @raise [SystemCallError] once MAX_TRY_COUNT attempts have failed
  def fetch_html(url)
    wait_for_interval
    attempts = 0
    begin
      attempts += 1
      # URI.open is required here: Kernel#open stopped handling URLs in Ruby 3.0.
      URI.open(url, &:read).tap { @last_fetch_time = Time.now }
    rescue SystemCallError => e
      raise if attempts >= self.class::MAX_TRY_COUNT

      warn "Error #{e.message}"
      sleep self.class::RETRY_WAIT
      retry
    end
  end

  # Sleeps for whatever remains of INTERVAL since the last successful fetch.
  def wait_for_interval
    return unless @last_fetch_time

    remaining = self.class::INTERVAL - (Time.now - @last_fetch_time)
    sleep remaining if remaining.positive?
  end

  # Reads a cached page and converts it with NKF to UTF-8 (-w) with Unix
  # line endings (-Lu).
  def read_html(file_name)
    NKF.nkf '-wLu', File.read(cache_path(file_name))
  end

  # @return [Array<String>] cached file names in manifest order
  def get_cache_file_names
    CSV.read(cache_path('manifest.csv')).map(&:first)
  end

  # Yields each cached page as a parsed document together with its 1-based
  # position in the manifest (the same number used in its file name).
  def with_parsed_doc
    get_cache_file_names.each.with_index(1) do |file_name, i|
      yield parse(read_html(file_name)), i
    end
  end

  # @return [String] path of +file_name+ inside the cache directory
  def cache_path(file_name)
    "#{cache_dir}/#{file_name}"
  end

  # @return [String] per-class cache directory, relative to the working directory
  def cache_dir
    "tmp/#{self.class.name}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment