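# Scrapes current COVID-19 case counts for Bergen County, New York City, and
# the five boroughs, merges them with previously recorded counts stored on
# jsonbin.io, renders an HTML page from a Haml template, and uploads it.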
require 'open-uri'
require 'nokogiri'
require 'json'
require 'concurrent-ruby'
require 'net/http'
require 'haml'
require 'rest-client'
require 'webdrivers'
require 'watir'
require 'singleton'
require 'pdf-reader'

# jsonbin.io bin id and secret key used to persist the scraped counts
json_bin = '5e69f2cf48179e0757a587b9'
secret_key = '$2b$10$he7wInjWqQWI0LFYeVt3KOiKXN03vH9RZL7vlUPKVTUb8MDzsAXru'
class Scraper
  attr_accessor :place

  # place name => scraper instance; subclasses register themselves on
  # construction via super(self)
  @@registry = {}

  def initialize(scraper)
    Scraper.register(scraper)
  end

  # Instantiate every Scraper subclass (which registers it in @@registry),
  # then yield one {place => future} hash per scraper so all scrapes run
  # concurrently.
  def self.scrape_each
    ObjectSpace.each_object(Class) do |klass|
      klass.new if klass < Scraper
    end
    Enumerator.new do |e|
      @@registry.each do |place, scraper|
        e << {place => Concurrent::Future.execute { scraper.get_date_and_count }}
      end
    end
  end

  def self.register(scraper)
    @@registry[scraper.place] = scraper
  end
end
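# A minimal sketch of how another scraper would plug into the registry
# (ExampleScraper and its values are hypothetical; any subclass defined in
# this file is picked up automatically by Scraper.scrape_each via ObjectSpace):
#
#   class ExampleScraper < Scraper
#     def initialize
#       @place = 'Example Place'
#       super(self)
#     end
#
#     def get_date_and_count
#       [DateTime.now, 0]  # scrape and return [DateTime, Integer]
#     end
#   end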
class BergenCountyScraper < Scraper
  DASHBOARD_URL = "https://maps.arcgis.com/apps/opsdashboard/index.html#/ec4bffd48f7e495182226eee7962b422"

  def initialize
    @place = 'Bergen County'
    super(self)
  end

  def get_date_and_count
    # Heroku buildpacks install Chrome and chromedriver at fixed paths
    if ENV['RUNNING_ON'] == 'HEROKU'
      Selenium::WebDriver::Chrome.path =
        '/app/.apt/usr/bin/google-chrome'
      Selenium::WebDriver::Chrome::Service.driver_path =
        '/app/.chromedriver/bin/chromedriver'
    end
    # The ArcGIS dashboard renders client-side, so drive a headless browser
    # and wait for the case list to appear before grabbing the HTML.
    b = Watir::Browser.new(:chrome, headless: true)
    b.goto(DASHBOARD_URL)
    b.div(class: 'list-item-content').wait_until(&:present?)
    source = Nokogiri::HTML(b.html)
    b.close
    # County names and counts alternate across consecutive <p> elements; find
    # the pair whose first element names this county, then keep only digits.
    count =
      source
        .xpath('//div[contains(@class, "external-html")]/p')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |county_str, _count_str| county_str =~ /#{@place}/ }
        .yield_self { |_, count_str|
          Nokogiri::HTML(count_str)
            .search('//p/span/span/strong')
            .to_s
            .gsub(/[^0-9]/, '')
            .to_i
        }
    # The subtitle text holds the last-updated timestamp; trim it into a form
    # DateTime.parse accepts.
    date =
      source
        .at_xpath('//div[contains(@class, "subtitle")]/text()')
        .to_s
        .match(/[0-9]+.+/)
        .to_s
        .sub(/\/[0-9]+ /, ' ')
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  end
end
class NewYorkCityScraper < Scraper
  NYC_URL = 'https://coronavirus.health.ny.gov/county-county-breakdown-positive-cases'

  def initialize
    @place = 'New York City'
    super(self)
  end

  def get_date_and_count
    source = Nokogiri::HTML(URI.open(NYC_URL))
    # Table cells alternate municipality name / count; find the pair for this
    # place and strip the thousands separator from the count.
    count =
      source
        .xpath('//table/tbody/tr/td/text()')
        .to_a
        .collect(&:to_s)
        .each_cons(2)
        .find { |muni, _count| muni == @place }
        .yield_self { |_, count| count }
        .gsub(',', '')
        .to_i
    date =
      source
        .at_xpath('//div[contains(@class, "wysiwyg--field-webny-wysiwyg-title")]/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    [date, count]
  end
end
# Fetch the previously stored counts from jsonbin.io in the background while
# the scrapers run.
old_data = Concurrent::Future.execute do
  uri = URI("https://api.jsonbin.io/b/#{json_bin}/latest")
  result = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    req = Net::HTTP::Get.new(uri)
    req['Content-Type'] = 'application/json'
    req['secret-key'] = secret_key
    http.request(req)
  end
  # Re-parse the stored date keys back into DateTime objects.
  JSON.parse(result.body)
    .map { |place, dates_to_counts|
      [
        place,
        dates_to_counts
          .map { |date, count| [DateTime.parse(date), count] }
          .to_h
      ]
    }
    .to_h
end
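# The stored document is assumed to map place names to date-string/count
# hashes, e.g. (illustrative values):
#
#   {"Bergen County" => {"2020-03-30T00:00:00+00:00" => 1838},
#    "New York City" => {"2020-03-30T00:00:00+00:00" => 38087}}
#
# so the map above re-parses each date key into a DateTime.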
# Parses the NYC DOH daily summary PDF once and serves per-borough counts;
# a Singleton so the five borough scrapers share one download and one parse.
class BoroughScraperHelper
  include Singleton
  PDF_URL = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf'

  def initialize
    @reader = PDF::Reader.new(URI.open(PDF_URL))
    # The report date lives in the PDF's XMP metadata (ModifyDate).
    metadata = Nokogiri::XML.parse(@reader.metadata)
    metadata.remove_namespaces!
    @date =
      metadata
        .xpath('//xmpmeta/RDF/Description/ModifyDate/text()')
        .to_s
        .yield_self { |ds| DateTime.parse(ds) }
    @lines =
      @reader
        .pages
        .first
        .text
        .split("\n")
  end

  # The first run of digits on the line naming the borough is its count.
  def get_count(borough:)
    @lines
      .find { |s| s =~ /#{borough}/ }
      .match(/[0-9]+/)
      .to_s
      .to_i
  end

  def get_date
    @date
  end
end
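# get_count assumes each borough appears on its own line of the PDF's first
# page with its count as the first run of digits, e.g. (illustrative line,
# not an actual excerpt from the report):
#
#   BoroughScraperHelper.instance.get_count(borough: 'Queens')
#   # a line like "Queens   7000 (26%)" would yield 7000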
class ManhattanScraper < Scraper
  def initialize
    @place = 'Manhattan'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class QueensScraper < Scraper
  def initialize
    @place = 'Queens'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BrooklynScraper < Scraper
  def initialize
    @place = 'Brooklyn'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class BronxScraper < Scraper
  def initialize
    @place = 'Bronx'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end

class StatenIslandScraper < Scraper
  def initialize
    @place = 'Staten Island'
    @borough_scraper_helper = BoroughScraperHelper.instance
    super(self)
  end

  def get_date_and_count
    date = @borough_scraper_helper.get_date
    count = @borough_scraper_helper.get_count(borough: @place)
    [date, count]
  end
end
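# The five borough scrapers above differ only in @place; a behavior-equivalent
# sketch could generate them in a loop instead (an untested refactoring idea,
# kept commented out like the alternate class below):
#
#   ['Manhattan', 'Queens', 'Brooklyn', 'Bronx', 'Staten Island'].each do |place|
#     klass = Class.new(Scraper) do
#       define_method(:initialize) do
#         @place = place
#         @borough_scraper_helper = BoroughScraperHelper.instance
#         super(self)
#       end
#       define_method(:get_date_and_count) do
#         [@borough_scraper_helper.get_date,
#          @borough_scraper_helper.get_count(borough: @place)]
#       end
#     end
#     Object.const_set("#{place.delete(' ')}Scraper", klass)
#   end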
# class NewYorkCityScraper < Scraper
#   def initialize
#     @place = 'New York City'
#     @borough_scraper_helper = BoroughScraperHelper.instance
#     super(self)
#   end
#   def get_date_and_count
#     date = @borough_scraper_helper.get_date
#     count = @borough_scraper_helper.get_count(borough: 'Total')
#     [date, count]
#   end
# end
# Run all scrapers and fold the results into place => {date => count}.
places_to_dates_to_counts =
  Scraper
    .scrape_each
    .map { |place_to_date_and_count_future|
      place_to_date_and_count_future
        .to_a
        .flatten(1)
        .yield_self { |place, date_and_count_future|
          {
            place =>
              date_and_count_future
                .value # block until this scraper's future resolves
                .yield_self { |date, count| {date => count} }
          }
        }
    }
    .reduce({}) do |places_to_date_and_counts, e|
      places_to_date_and_counts.merge(e)
    end
pp places_to_dates_to_counts
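# places_to_dates_to_counts now maps each place to a single-entry hash of the
# freshly scraped date and count, e.g. (illustrative):
#
#   {"Bergen County" => {#<DateTime 2020-04-03> => 2909},
#    "Manhattan"     => {#<DateTime 2020-04-03> => 10860}, ...}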
data = old_data.value

# Add places that are being scraped for the first time; seeing one forces an
# update even if no already-stored place has a new date.
should_update = false
places_to_dates_to_counts
  .keys
  .reject { |place| data.key?(place) }
  .each do |place|
    data[place] = places_to_dates_to_counts[place]
    should_update = true
  end
pp data

# Update when any scraped place has a date not yet in the stored data, when
# run with the 'force' argument, or when a new place was just added.
new_date_scraped =
  data.any? { |place, dates_to_counts|
    places_to_dates_to_counts.key?(place) &&
      !dates_to_counts.key?(places_to_dates_to_counts[place].keys.first)
  }
if new_date_scraped or ARGV.first&.downcase == 'force' or should_update
  data.each do |k, _|
    # only merge places defined in the code above; the stored data may
    # contain other places these scrapers don't cover
    if places_to_dates_to_counts.key?(k)
      data[k].update(places_to_dates_to_counts[k])
    end
  end
  # data = data.sort_by { |date, _| date }.to_h
  # will this work for all the data?
  # this is just organizing the data by place right now
  # PUT the merged data back to jsonbin.io in the background while the page
  # renders and uploads.
  update_data_thread = Thread.new do
    uri = URI("https://api.jsonbin.io/b/#{json_bin}")
    Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
      req = Net::HTTP::Put.new(uri)
      req['Content-Type'] = 'application/json'
      req['secret-key'] = secret_key
      req.body = data.to_json
      http.request(req)
    end
  end
  # Render the Haml template with the merged data and publish it to neocities.
  haml = File.read('template.haml')
  rendered_page =
    Haml::Engine.new(haml, format: :html5)
      .render(self, locals: {data: data})
  File.write('index.html', rendered_page)
  RestClient.post('https://coronaviruscount:[email protected]/api/upload', 'index.html' => File.new("index.html", 'rb'))
  update_data_thread.join
end
puts 'success!'