require 'open-uri'
require 'nokogiri'
require 'json'
require 'concurrent' # the concurrent-ruby gem's require path is 'concurrent'
require 'net/http'
require 'haml'
require 'rest-client'
require 'webdrivers'
require 'watir'
require 'singleton'
require 'pdf-reader'
# jsonbin.io bin ID and secret key; ideally these would come from the
# environment rather than being hard-coded in the script.
json_bin = '5e69f2cf48179e0757a587b9'
secret_key = '$2b$10$he7wInjWqQWI0LFYeVt3KOiKXN03vH9RZL7vlUPKVTUb8MDzsAXru'
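
# A minimal sketch of reading them from ENV instead (the variable names are
# assumptions, not part of the original script):
#
#   json_bin   = ENV.fetch('JSONBIN_BIN_ID')
#   secret_key = ENV.fetch('JSONBIN_SECRET_KEY')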

# Base class for all scrapers: each subclass registers itself under its
# place name, and scrape_each runs every registered scraper concurrently.
class Scraper
  attr_accessor :place

  @@registry = {}

  def initialize(scraper)
    Scraper.register(scraper)
  end

  # Instantiate every Scraper subclass (each registers itself via super),
  # then yield one {place => Future} hash per registered scraper.
  def self.scrape_each
    ObjectSpace.each_object(Class) do |klass|
      klass.new if klass < Scraper
    end
    Enumerator.new do |e|
      @@registry.each do |place, scraper|
        e << { place => Concurrent::Future.execute { scraper.get_date_and_count } }
      end
    end
  end

  def self.register(scraper)
    @@registry[scraper.place] = scraper
  end
end
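
# Adding a new source only requires a subclass that sets @place and
# implements get_date_and_count; registration and concurrency come from the
# base class. A hypothetical sketch (HudsonCountyScraper is not part of the
# original script):
#
#   class HudsonCountyScraper < Scraper
#     def initialize
#       @place = 'Hudson County'
#       super(self)
#     end
#
#     def get_date_and_count
#       # fetch and parse the real source here
#       [DateTime.now, 0]
#     end
#   end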

# Scrapes the Bergen County count from an ArcGIS dashboard. The dashboard is
# rendered client-side, so a headless Chrome session (Watir + webdrivers)
# loads the page before Nokogiri parses the resulting HTML.
class BergenCountyScraper < Scraper
  DASHBOARD_URL = 'https://maps.arcgis.com/apps/opsdashboard/index.html#/ec4bffd48f7e495182226eee7962b422'

  def initialize
    @place = 'Bergen County'
    super(self)
  end

  def get_date_and_count
    if ENV['RUNNING_ON'] == 'HEROKU'
      Selenium::WebDriver::Chrome.path =
        '/app/.apt/usr/bin/google-chrome'
      Selenium::WebDriver::Chrome::Service.driver_path =
        '/app/.chromedriver/bin/chromedriver'
    end
    b = Watir::Browser.new(:chrome, headless: true)
    b.goto(DASHBOARD_URL)
    b.div(class: 'list-item-content').wait_until(&:present?)
    source = Nokogiri::HTML(b.html)
    # The dashboard lists county names and counts as alternating <p>
    # elements, so pair them up and take the pair whose first element
    # names this county.
    count =
      source
        .css('div.external-html p')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |county_str, _count_str| county_str =~ /#{@place}/ }
        .yield_self { |_, count_str|
          Nokogiri::HTML(count_str)
            .search('//p/span/span/strong')
            .to_s
            .gsub(/[^0-9]/, '')
            .to_i
        }
    # Pull the date out of the dashboard subtitle and massage it into a
    # string DateTime.parse accepts.
    date =
      source
        .at_xpath('//div[contains(@class, "subtitle")]/text()')
        .to_s
        .match(/[0-9]+.+/)
        .to_s
        .sub(/\/[0-9]+ /, ' ')
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  ensure
    # Always shut the headless browser down, even if parsing fails.
    b&.close
  end
end
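
# Illustration of the each_cons(2) pairing used above (values made up):
#
#   ["<p>Bergen County</p>", "<p><strong>819</strong></p>", ...]
#     .each_cons(2)  # => sliding windows of two neighboring elements
#
# yields ["<p>Bergen County</p>", "<p><strong>819</strong></p>"] as the
# window matching /Bergen County/; overlapping windows are harmless because
# find stops at the first match.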

# Scrapes the New York City total from the state's county-by-county table.
class NewYorkCityScraper < Scraper
  NYC_URL = 'https://coronavirus.health.ny.gov/county-county-breakdown-positive-cases'

  def initialize
    @place = 'New York City'
    super(self)
  end

  def get_date_and_count
    source = Nokogiri::HTML(URI.open(NYC_URL))
    # Table cells alternate municipality / count, so pair them up and pick
    # the count that follows this place's name.
    count =
      source
        .xpath('//table/tbody/tr/td/text()')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |muni, _count| muni == @place }
        .yield_self { |_, count| count }
        .gsub(',', '')
        .to_i
    date =
      source
        .at_xpath('//div[contains(@class, "wysiwyg--field-webny-wysiwyg-title")]/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  end
end

# Fetch the previously stored counts from jsonbin.io in the background while
# the scrapers run, reviving the stringified date keys into DateTime objects.
old_data = Concurrent::Future.execute do
  uri = URI("https://api.jsonbin.io/b/#{json_bin}/latest")
  result = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    req = Net::HTTP::Get.new(uri)
    req['Content-Type'] = 'application/json'
    req['secret-key'] = secret_key
    http.request(req)
  end
  JSON.parse(result.body)
    .map { |place, dates_to_counts|
      [
        place,
        dates_to_counts
          .map { |date, count| [DateTime.parse(date), count] }
          .to_h
      ]
    }
    .to_h
end
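
# Once resolved, old_data.value has the same nested shape the scrapers
# produce, e.g. (illustrative values only):
#
#   { 'Bergen County' => { #<DateTime 2020-04-02> => 819 },
#     'Queens'        => { #<DateTime 2020-04-02> => 10373 } }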

# Downloads and parses the NYC daily-summary PDF. It's a Singleton so the
# five borough scrapers below share one instance and the PDF is fetched and
# parsed only once.
class BoroughScraperHelper
  include Singleton

  PDF_URL = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf'

  def initialize
    @reader = PDF::Reader.new(URI.open(PDF_URL))
    # The report date lives in the PDF's XMP metadata rather than its text.
    metadata = Nokogiri::XML.parse(@reader.metadata)
    metadata.remove_namespaces!
    @date =
      metadata
        .xpath('//xmpmeta/RDF/Description/ModifyDate/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    @lines =
      @reader
        .pages
        .first
        .text
        .split("\n")
  end

  # Return the first number on the line that mentions the given borough.
  def get_count(borough:)
    @lines
      .find { |s| s =~ /#{borough}/ }
      .match(/[0-9]+/)
      .to_s
      .to_i
  end

  def get_date
    @date
  end
end
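
# Usage sketch (hypothetical, mirroring what the borough scrapers below do):
#
#   helper = BoroughScraperHelper.instance  # first call downloads the PDF
#   helper.get_count(borough: 'Queens')     # => Integer parsed from the PDF
#   helper.get_date                         # => DateTime from the metadata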

class ManhattanScraper < Scraper
  def initialize
    @place = 'Manhattan'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class QueensScraper < Scraper
  def initialize
    @place = 'Queens'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BrooklynScraper < Scraper
  def initialize
    @place = 'Brooklyn'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BronxScraper < Scraper
  def initialize
    @place = 'Bronx'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class StatenIslandScraper < Scraper
  def initialize
    @place = 'Staten Island'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

# Alternative NYC scraper kept for reference: it reads the "Total" line of
# the borough PDF instead of the state's table.
# class NewYorkCityScraper < Scraper
#   def initialize
#     @place = 'New York City'
#     @borough_scraper_helper = BoroughScraperHelper.instance
#     super(self)
#   end
#
#   def get_date_and_count
#     date = @borough_scraper_helper.get_date
#     count = @borough_scraper_helper.get_count(borough: 'Total')
#     [date, count]
#   end
# end

# Run every scraper, wait on each Future, and fold the results into
# { place => { date => count } }.
places_to_dates_to_counts =
  Scraper
    .scrape_each
    .map { |place_to_date_and_count_future|
      place_to_date_and_count_future
        .to_a
        .flatten(1)
        .yield_self { |place, date_and_count_future|
          {
            place =>
              date_and_count_future
                .value
                .yield_self { |date, count| { date => count } }
          }
        }
    }
    .reduce({}) do |places_to_date_and_counts, e|
      places_to_date_and_counts.merge(e)
    end
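
# Note: Concurrent::Future#value blocks until that scrape finishes, and
# returns nil if the Future was rejected, so a single failed scraper would
# surface here as a nil date/count pair rather than an exception.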
pp places_to_dates_to_counts
data = old_data.value
# Add places to the stored data that don't already exist there, and remember
# that we did. (should_update must be initialized before the block: a local
# variable first assigned inside a block is not visible after it.)
should_update = false
places_to_dates_to_counts
  .keys
  .reject { |place| data.keys.include?(place) }
  .each do |place|
    data[place] = places_to_dates_to_counts[place]
    should_update = true
  end
pp data
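
# Push the merged data and re-render the page when any scraped place has a
# date we haven't stored yet, when a new place was added above, or when the
# script is run with the literal argument "force":
#
#   ruby <this script> force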
if data.any? { |place, dates_to_counts|
     places_to_dates_to_counts.key?(place) &&
       !dates_to_counts.key?(places_to_dates_to_counts[place].keys.first)
   } || ARGV.first&.downcase == 'force' || should_update
  data.each do |k, _|
    # Only update places that have scrapers defined in the code; the stored
    # data may contain other places that aren't handled here.
    if places_to_dates_to_counts.has_key?(k)
      data[k].update(places_to_dates_to_counts[k])
    end
  end
  # data = data.sort_by { |date, _| date }.to_h  # disabled: unclear whether
  # this works for all the data; it only orders the top-level hash by place
  # Push the updated data back to jsonbin.io while the page renders.
  update_data_thread = Thread.new do
    uri = URI("https://api.jsonbin.io/b/#{json_bin}")
    Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
      req = Net::HTTP::Put.new(uri)
      req['Content-Type'] = 'application/json'
      req['secret-key'] = secret_key
      req.body = data.to_json
      http.request(req)
    end
  end
  # Render the HAML template with the merged data and upload the result.
  haml = File.read('template.haml')
  rendered_page =
    Haml::Engine.new(haml, format: :html5)
      .render(self, locals: { data: data })
  File.write('index.html', rendered_page)
  RestClient.post('https://coronaviruscount:[email protected]/api/upload',
                  'index.html' => File.new('index.html', 'rb'))
  update_data_thread.join
end
puts 'success!'