akeyhero · February 28, 2020 10:43
diff --git a/crawler_base.rb b/crawler_base.rb
 # frozen_string_literal: true

 require 'csv'
 require 'fileutils'
 require 'nkf'
 require 'open-uri'

 require 'nokogiri'

 # Base class for simple two-phase crawlers.
 #
 # Phase 1 (+download+) fetches every URL returned by +fetch_urls+ and caches
 # the responses under +cache_dir+ together with a +manifest.csv+ that maps
 # cached file names to URLs. Phase 2 (+run+) parses the cached files and
 # prints CSV to standard output.
 #
 # Subclasses must implement the private hooks +fetch_urls+, +get_header+ and
 # +get_lines+.
 class CrawlerBase
  # TODO: they should be optional.
  # Maximum number of attempts per URL before the error is re-raised.
  MAX_TRY_COUNT = 3
  # Minimum number of seconds between a successful fetch and the next request.
  INTERVAL = 1.0
  # Number of seconds to wait before retrying a failed fetch.
  RETRY_WAIT = 10

  # Downloads every URL from +fetch_urls+ into the cache directory.
  #
  # The manifest is written before any page is fetched; row N of the manifest
  # is ["N.html", url] and the page body is stored as "N.html" (N starts at 1).
  # Progress is reported on standard error.
  def download
    urls = fetch_urls
    # mkdir_p also creates the parent "tmp" directory and is a no-op when the
    # directory already exists.
    FileUtils.mkdir_p(cache_dir)
    CSV.open(cache_path('manifest.csv'), 'w') do |csv|
      urls.each.with_index(1) { |url, i| csv << ["#{i}.html", url] }
    end
    urls.each.with_index(1) do |url, i|
      File.write(cache_path("#{i}.html"), fetch_html(url))
      warn "Fetched ##{i}"
    end
  end

  # Writes the header from +get_header+ followed by the rows produced by
  # +get_lines+ for every cached document as CSV to standard output.
  #
  # Output is flushed after each document so partial results become visible
  # while the remaining documents are still being processed.
  def run
    CSV($stdout) do |csv|
      csv << get_header
      with_parsed_doc do |doc, i|
        get_lines(doc, i).each { |line| csv << line }
        $stdout.flush
      end
    end
  end

 private

  # Hook: returns the list of URLs to download.
  def fetch_urls
    raise NotImplementedError, "#{self.class.name}#fetch_urls is not implemented"
  end

  # Hook: returns the CSV header row.
  def get_header
    raise NotImplementedError, "#{self.class.name}#get_header is not implemented"
  end

  # Hook: returns the CSV rows extracted from one parsed document.
  # The second argument is the document's 1-based position in the manifest.
  def get_lines(_doc, _index)
    raise NotImplementedError, "#{self.class.name}#get_lines is not implemented"
  end

  # Parses an HTML string into a Nokogiri document.
  def parse(html)
    Nokogiri::HTML.parse(html)
  end

  # Fetches +url+ and returns the response body.
  #
  # Waits so that at least INTERVAL seconds separate this request from the
  # previous successful fetch. A SystemCallError is retried after RETRY_WAIT
  # seconds; once MAX_TRY_COUNT attempts have failed the last error is
  # re-raised.
  def fetch_html(url)
    wait_for_interval
    attempts_left = MAX_TRY_COUNT
    begin
      # URI.open is used because open-uri no longer patches Kernel#open on
      # Ruby 3.0+. It falls back to Kernel#open for strings that are not URLs.
      html = URI.open(url, &:read)
      @last_fetch_time = monotonic_now
      html
    rescue SystemCallError => e
      attempts_left -= 1
      raise if attempts_left <= 0

      warn "Error #{e.message}"
      sleep RETRY_WAIT
      retry
    end
  end

  # Sleeps for whatever is left of INTERVAL since the last successful fetch.
  def wait_for_interval
    return unless @last_fetch_time

    remaining = INTERVAL - (monotonic_now - @last_fetch_time)
    sleep remaining if remaining.positive?
  end

  # Current monotonic clock reading in seconds; unlike Time.now it is not
  # affected by wall-clock adjustments.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Reads a cached file and returns it converted by NKF with the options
  # -w (UTF-8 output) and -Lu (LF line endings).
  def read_html(file_name)
    NKF.nkf('-wLu', File.read(cache_path(file_name)))
  end

  # Returns the cached file names (first manifest column) in manifest order.
  def get_cache_file_names
    CSV.foreach(cache_path('manifest.csv')).map(&:first)
  end

  # Yields each cached document, parsed, together with its 1-based position
  # in the manifest (the same number used in the cached file name).
  def with_parsed_doc
    get_cache_file_names.each.with_index(1) do |file_name, i|
      yield parse(read_html(file_name)), i
    end
  end

  # Path of +file_name+ inside the cache directory.
  def cache_path(file_name)
    "#{cache_dir}/#{file_name}"
  end

  # Per-class cache directory, relative to the current working directory.
  def cache_dir
    "tmp/#{self.class.name}"
  end
 end
	# frozen_string_literal: true

	require 'open-uri'
	require 'csv'
	require 'nkf'
	require 'nokogiri'

	# Base class for simple two-phase crawlers.
	#
	# Phase 1 (+download+) fetches every URL returned by +fetch_urls+ and caches
	# the responses under +cache_dir+ together with a +manifest.csv+ that maps
	# cached file names to URLs. Phase 2 (+run+) parses the cached files and
	# prints CSV to standard output.
	#
	# Subclasses must implement the private hooks +fetch_urls+, +get_header+ and
	# +get_lines+.
	class CrawlerBase
	  # TODO: they should be optional.
	  # Maximum number of attempts per URL before the error is re-raised.
	  MAX_TRY_COUNT = 3
	  # Minimum number of seconds between a successful fetch and the next request.
	  INTERVAL = 1.0
	  # Number of seconds to wait before retrying a failed fetch.
	  RETRY_WAIT = 10

	  # Downloads every URL from +fetch_urls+ into the cache directory.
	  #
	  # The manifest is written before any page is fetched; row N of the manifest
	  # is ["N.html", url] and the page body is stored as "N.html" (N starts at 1).
	  # Progress is reported on standard error.
	  def download
	    urls = fetch_urls
	    # mkdir_p also creates the parent "tmp" directory and is a no-op when the
	    # directory already exists.
	    FileUtils.mkdir_p(cache_dir)
	    CSV.open(cache_path('manifest.csv'), 'w') do |csv|
	      urls.each.with_index(1) { |url, i| csv << ["#{i}.html", url] }
	    end
	    urls.each.with_index(1) do |url, i|
	      File.write(cache_path("#{i}.html"), fetch_html(url))
	      warn "Fetched ##{i}"
	    end
	  end

	  # Writes the header from +get_header+ followed by the rows produced by
	  # +get_lines+ for every cached document as CSV to standard output.
	  #
	  # Output is flushed after each document so partial results become visible
	  # while the remaining documents are still being processed.
	  def run
	    CSV($stdout) do |csv|
	      csv << get_header
	      with_parsed_doc do |doc, i|
	        get_lines(doc, i).each { |line| csv << line }
	        $stdout.flush
	      end
	    end
	  end

	  private

	  # Hook: returns the list of URLs to download.
	  def fetch_urls
	    raise NotImplementedError, "#{self.class.name}#fetch_urls is not implemented"
	  end

	  # Hook: returns the CSV header row.
	  def get_header
	    raise NotImplementedError, "#{self.class.name}#get_header is not implemented"
	  end

	  # Hook: returns the CSV rows extracted from one parsed document.
	  # The second argument is the document's 1-based position in the manifest.
	  def get_lines(_doc, _index)
	    raise NotImplementedError, "#{self.class.name}#get_lines is not implemented"
	  end

	  # Parses an HTML string into a Nokogiri document.
	  def parse(html)
	    Nokogiri::HTML.parse(html)
	  end

	  # Fetches +url+ and returns the response body.
	  #
	  # Waits so that at least INTERVAL seconds separate this request from the
	  # previous successful fetch. A SystemCallError is retried after RETRY_WAIT
	  # seconds; once MAX_TRY_COUNT attempts have failed the last error is
	  # re-raised.
	  def fetch_html(url)
	    wait_for_interval
	    attempts_left = MAX_TRY_COUNT
	    begin
	      # URI.open is used because open-uri no longer patches Kernel#open on
	      # Ruby 3.0+. It falls back to Kernel#open for strings that are not URLs.
	      html = URI.open(url, &:read)
	      @last_fetch_time = monotonic_now
	      html
	    rescue SystemCallError => e
	      attempts_left -= 1
	      raise if attempts_left <= 0

	      warn "Error #{e.message}"
	      sleep RETRY_WAIT
	      retry
	    end
	  end

	  # Sleeps for whatever is left of INTERVAL since the last successful fetch.
	  def wait_for_interval
	    return unless @last_fetch_time

	    remaining = INTERVAL - (monotonic_now - @last_fetch_time)
	    sleep remaining if remaining.positive?
	  end

	  # Current monotonic clock reading in seconds; unlike Time.now it is not
	  # affected by wall-clock adjustments.
	  def monotonic_now
	    Process.clock_gettime(Process::CLOCK_MONOTONIC)
	  end

	  # Reads a cached file and returns it converted by NKF with the options
	  # -w (UTF-8 output) and -Lu (LF line endings).
	  def read_html(file_name)
	    NKF.nkf('-wLu', File.read(cache_path(file_name)))
	  end

	  # Returns the cached file names (first manifest column) in manifest order.
	  def get_cache_file_names
	    CSV.foreach(cache_path('manifest.csv')).map(&:first)
	  end

	  # Yields each cached document, parsed, together with its 1-based position
	  # in the manifest (the same number used in the cached file name).
	  def with_parsed_doc
	    get_cache_file_names.each.with_index(1) do |file_name, i|
	      yield parse(read_html(file_name)), i
	    end
	  end

	  # Path of +file_name+ inside the cache directory.
	  def cache_path(file_name)
	    "#{cache_dir}/#{file_name}"
	  end

	  # Per-class cache directory, relative to the current working directory.
	  def cache_dir
	    "tmp/#{self.class.name}"
	  end
	end