require 'open-uri'
require 'nokogiri'
require 'json'
require 'concurrent' # the concurrent-ruby gem's require path is 'concurrent'
require 'net/http'
require 'haml'
require 'rest-client'
require 'webdrivers'
require 'watir'
require 'singleton'
require 'pdf-reader'
# jsonbin.io bin ID and secret key; ideally these would come from the
# environment rather than being hard-coded in the script.
json_bin = '5e69f2cf48179e0757a587b9'
secret_key = '$2b$10$he7wInjWqQWI0LFYeVt3KOiKXN03vH9RZL7vlUPKVTUb8MDzsAXru'
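
# A minimal sketch of reading them from ENV instead (the variable names are
# assumptions, not part of the original script):
#
#   json_bin   = ENV.fetch('JSONBIN_BIN_ID')
#   secret_key = ENV.fetch('JSONBIN_SECRET_KEY')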

# Base class for all scrapers: each subclass registers itself under its
# place name, and scrape_each runs every registered scraper concurrently.
class Scraper
  attr_accessor :place

  @@registry = {}

  def initialize(scraper)
    Scraper.register(scraper)
  end

  # Instantiate every Scraper subclass (each registers itself via super),
  # then yield one {place => Future} hash per registered scraper.
  def self.scrape_each
    ObjectSpace.each_object(Class) do |klass|
      klass.new if klass < Scraper
    end
    Enumerator.new do |e|
      @@registry.each do |place, scraper|
        e << { place => Concurrent::Future.execute { scraper.get_date_and_count } }
      end
    end
  end

  def self.register(scraper)
    @@registry[scraper.place] = scraper
  end
end
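
# Adding a new source only requires a subclass that sets @place and
# implements get_date_and_count; registration and concurrency come from the
# base class. A hypothetical sketch (HudsonCountyScraper is not part of the
# original script):
#
#   class HudsonCountyScraper < Scraper
#     def initialize
#       @place = 'Hudson County'
#       super(self)
#     end
#
#     def get_date_and_count
#       # fetch and parse the real source here
#       [DateTime.now, 0]
#     end
#   end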

# Scrapes the Bergen County count from an ArcGIS dashboard. The dashboard is
# rendered client-side, so a headless Chrome session (Watir + webdrivers)
# loads the page before Nokogiri parses the resulting HTML.
class BergenCountyScraper < Scraper
  DASHBOARD_URL = 'https://maps.arcgis.com/apps/opsdashboard/index.html#/ec4bffd48f7e495182226eee7962b422'

  def initialize
    @place = 'Bergen County'
    super(self)
  end

  def get_date_and_count
    if ENV['RUNNING_ON'] == 'HEROKU'
      Selenium::WebDriver::Chrome.path =
        '/app/.apt/usr/bin/google-chrome'
      Selenium::WebDriver::Chrome::Service.driver_path =
        '/app/.chromedriver/bin/chromedriver'
    end
    b = Watir::Browser.new(:chrome, headless: true)
    b.goto(DASHBOARD_URL)
    b.div(class: 'list-item-content').wait_until(&:present?)
    source = Nokogiri::HTML(b.html)
    # The dashboard lists county names and counts as alternating <p>
    # elements, so pair them up and take the pair whose first element
    # names this county.
    count =
      source
        .css('div.external-html p')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |county_str, _count_str| county_str =~ /#{@place}/ }
        .yield_self { |_, count_str|
          Nokogiri::HTML(count_str)
            .search('//p/span/span/strong')
            .to_s
            .gsub(/[^0-9]/, '')
            .to_i
        }
    # Pull the date out of the dashboard subtitle and massage it into a
    # string DateTime.parse accepts.
    date =
      source
        .at_xpath('//div[contains(@class, "subtitle")]/text()')
        .to_s
        .match(/[0-9]+.+/)
        .to_s
        .sub(/\/[0-9]+ /, ' ')
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  ensure
    # Always shut the headless browser down, even if parsing fails.
    b&.close
  end
end
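
# Illustration of the each_cons(2) pairing used above (values made up):
#
#   ["<p>Bergen County</p>", "<p><strong>819</strong></p>", ...]
#     .each_cons(2)  # => sliding windows of two neighboring elements
#
# yields ["<p>Bergen County</p>", "<p><strong>819</strong></p>"] as the
# window matching /Bergen County/; overlapping windows are harmless because
# find stops at the first match.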

# Scrapes the New York City total from the state's county-by-county table.
class NewYorkCityScraper < Scraper
  NYC_URL = 'https://coronavirus.health.ny.gov/county-county-breakdown-positive-cases'

  def initialize
    @place = 'New York City'
    super(self)
  end

  def get_date_and_count
    source = Nokogiri::HTML(URI.open(NYC_URL))
    # Table cells alternate municipality / count, so pair them up and pick
    # the count that follows this place's name.
    count =
      source
        .xpath('//table/tbody/tr/td/text()')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |muni, _count| muni == @place }
        .yield_self { |_, count| count }
        .gsub(',', '')
        .to_i
    date =
      source
        .at_xpath('//div[contains(@class, "wysiwyg--field-webny-wysiwyg-title")]/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  end
end

# Fetch the previously stored counts from jsonbin.io in the background while
# the scrapers run, reviving the stringified date keys into DateTime objects.
old_data = Concurrent::Future.execute do
  uri = URI("https://api.jsonbin.io/b/#{json_bin}/latest")
  result = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    req = Net::HTTP::Get.new(uri)
    req['Content-Type'] = 'application/json'
    req['secret-key'] = secret_key
    http.request(req)
  end
  JSON.parse(result.body)
    .map { |place, dates_to_counts|
      [
        place,
        dates_to_counts
          .map { |date, count| [DateTime.parse(date), count] }
          .to_h
      ]
    }
    .to_h
end
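
# Once resolved, old_data.value has the same nested shape the scrapers
# produce, e.g. (illustrative values only):
#
#   { 'Bergen County' => { #<DateTime 2020-04-02> => 819 },
#     'Queens'        => { #<DateTime 2020-04-02> => 10373 } }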

# Downloads and parses the NYC daily-summary PDF. It's a Singleton so the
# five borough scrapers below share one instance and the PDF is fetched and
# parsed only once.
class BoroughScraperHelper
  include Singleton

  PDF_URL = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf'

  def initialize
    @reader = PDF::Reader.new(URI.open(PDF_URL))
    # The report date lives in the PDF's XMP metadata rather than its text.
    metadata = Nokogiri::XML.parse(@reader.metadata)
    metadata.remove_namespaces!
    @date =
      metadata
        .xpath('//xmpmeta/RDF/Description/ModifyDate/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    @lines =
      @reader
        .pages
        .first
        .text
        .split("\n")
  end

  # Return the first number on the line that mentions the given borough.
  def get_count(borough:)
    @lines
      .find { |s| s =~ /#{borough}/ }
      .match(/[0-9]+/)
      .to_s
      .to_i
  end

  def get_date
    @date
  end
end
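
# Usage sketch (hypothetical, mirroring what the borough scrapers below do):
#
#   helper = BoroughScraperHelper.instance  # first call downloads the PDF
#   helper.get_count(borough: 'Queens')     # => Integer parsed from the PDF
#   helper.get_date                         # => DateTime from the metadata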

class ManhattanScraper < Scraper
  def initialize
    @place = 'Manhattan'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class QueensScraper < Scraper
  def initialize
    @place = 'Queens'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BrooklynScraper < Scraper
  def initialize
    @place = 'Brooklyn'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BronxScraper < Scraper
  def initialize
    @place = 'Bronx'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class StatenIslandScraper < Scraper
  def initialize
    @place = 'Staten Island'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

# Alternative NYC scraper kept for reference: it reads the "Total" line of
# the borough PDF instead of the state's table.
# class NewYorkCityScraper < Scraper
#   def initialize
#     @place = 'New York City'
#     @borough_scraper_helper = BoroughScraperHelper.instance
#     super(self)
#   end
#
#   def get_date_and_count
#     date = @borough_scraper_helper.get_date
#     count = @borough_scraper_helper.get_count(borough: 'Total')
#     [date, count]
#   end
# end

# Run every scraper, wait on each Future, and fold the results into
# { place => { date => count } }.
places_to_dates_to_counts =
  Scraper
    .scrape_each
    .map { |place_to_date_and_count_future|
      place_to_date_and_count_future
        .to_a
        .flatten(1)
        .yield_self { |place, date_and_count_future|
          {
            place =>
              date_and_count_future
                .value
                .yield_self { |date, count| { date => count } }
          }
        }
    }
    .reduce({}) do |places_to_date_and_counts, e|
      places_to_date_and_counts.merge(e)
    end
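
# Note: Concurrent::Future#value blocks until that scrape finishes, and
# returns nil if the Future was rejected, so a single failed scraper would
# surface here as a nil date/count pair rather than an exception.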
pp places_to_dates_to_counts
data = old_data.value
# Add places to the stored data that don't already exist there, and remember
# that we did. (should_update must be initialized before the block: a local
# variable first assigned inside a block is not visible after it.)
should_update = false
places_to_dates_to_counts
  .keys
  .reject { |place| data.keys.include?(place) }
  .each do |place|
    data[place] = places_to_dates_to_counts[place]
    should_update = true
  end
pp data
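
# Push the merged data and re-render the page when any scraped place has a
# date we haven't stored yet, when a new place was added above, or when the
# script is run with the literal argument "force":
#
#   ruby <this script> force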
if data.any? { |place, dates_to_counts|
     places_to_dates_to_counts.key?(place) &&
       !dates_to_counts.key?(places_to_dates_to_counts[place].keys.first)
   } || ARGV.first&.downcase == 'force' || should_update
  data.each do |k, _|
    # Only update places that have scrapers defined in the code; the stored
    # data may contain other places that aren't handled here.
    if places_to_dates_to_counts.has_key?(k)
      data[k].update(places_to_dates_to_counts[k])
    end
  end
  # data = data.sort_by { |date, _| date }.to_h  # disabled: unclear whether
  # this works for all the data; it only orders the top-level hash by place
  # Push the updated data back to jsonbin.io while the page renders.
  update_data_thread = Thread.new do
    uri = URI("https://api.jsonbin.io/b/#{json_bin}")
    Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
      req = Net::HTTP::Put.new(uri)
      req['Content-Type'] = 'application/json'
      req['secret-key'] = secret_key
      req.body = data.to_json
      http.request(req)
    end
  end
  # Render the HAML template with the merged data and upload the result.
  haml = File.read('template.haml')
  rendered_page =
    Haml::Engine.new(haml, format: :html5)
      .render(self, locals: { data: data })
  File.write('index.html', rendered_page)
  RestClient.post('https://coronaviruscount:[email protected]/api/upload',
                  'index.html' => File.new('index.html', 'rb'))
  update_data_thread.join
end
puts 'success!'