|
# Scrape county-level COVID-19 case counts from the PA Department of Health,
# using Wayback Machine snapshots for past days and the live page for today.
import requests
import json
from lxml import html
from datetime import timedelta, date
|
URL_BASE = "https://archive.org/wayback/available?url=https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx&timestamp="
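# The availability endpoint returns JSON shaped roughly like
# {"archived_snapshots": {"closest": {"url": "..."}}}; the fetch loop below
# follows that closest-snapshot URL to the archived page itself.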
|
COUNTIES = [
    "Adams",
    "Allegheny",
    "Armstrong",
    "Beaver",
    "Bedford",
    "Berks",
    "Blair",
    "Bradford",
    "Bucks",
    "Butler",
    "Cambria",
    "Cameron",
    "Carbon",
    "Centre",
    "Chester",
    "Clarion",
    "Clearfield",
    "Clinton",
    "Columbia",
    "Crawford",
    "Cumberland",
    "Dauphin",
    "Delaware",
    "Elk",
    "Erie",
    "Fayette",
    "Forest",
    "Franklin",
    "Fulton",
    "Greene",
    "Huntingdon",
    "Indiana",
    "Jefferson",
    "Juniata",
    "Lackawanna",
    "Lancaster",
    "Lawrence",
    "Lebanon",
    "Lehigh",
    "Luzerne",
    "Lycoming",
    "McKean",
    "Mercer",
    "Mifflin",
    "Monroe",
    "Montgomery",
    "Montour",
    "Northampton",
    "Northumberland",
    "Perry",
    "Philadelphia",
    "Pike",
    "Potter",
    "Schuylkill",
    "Snyder",
    "Somerset",
    "Sullivan",
    "Susquehanna",
    "Tioga",
    "Union",
    "Venango",
    "Warren",
    "Washington",
    "Wayne",
    "Westmoreland",
    "Wyoming",
    "York",
]
|
urls = {}
|
def daterange(start_date, end_date):
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)
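# Yields start_date (inclusive) through end_date (exclusive), e.g.
# daterange(date(2020, 3, 19), date(2020, 3, 21)) yields Mar 19 and Mar 20.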
|
start_date = date(2020, 3, 19)
end_date = date(2020, 3, 25)
|
date_strings = []
|
# Get historical data: one availability lookup per day, pinned to late evening
# (Wayback timestamps are formatted YYYYMMDDhhmmss, so 230000 = 23:00:00)
for single_date in daterange(start_date, end_date):
    date_strings.append(single_date.strftime("%Y-%m-%d"))
    urls[single_date] = URL_BASE + single_date.strftime("%Y%m%d230000")
|
# Get today (hardcoded as 2020-03-26): hit the live page directly
date_strings.append(date(2020, 3, 26).strftime("%Y-%m-%d"))
urls[date(2020, 3, 26)] = "https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx"
|
data = {}
for county in COUNTIES:
    data[county] = []
|
for snapshot_date, url in urls.items():
    # Behave differently for a live page vs Internet Archive
    if snapshot_date.strftime("%Y-%m-%d") == date_strings[-1]:
        r = requests.get(url, allow_redirects=True)
    else:
        r = requests.get(url, allow_redirects=True)
        r = requests.get(json.loads(r.content)["archived_snapshots"]["closest"]["url"], allow_redirects=True)
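        # (first response was availability JSON; the second request fetched the snapshot itself)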
|
    node_count = 0
    found_counties = []
    root = html.fromstring(r.content)
    for c1 in root.iter():
        if c1.tag == "tbody":
            for child in c1.iter():
                if child.tag == "td" and child.text:
                    # Skip leading cells (e.g. headers) until the first county name
                    if node_count == 0 and child.text not in COUNTIES:
                        continue
                    # Each row appears to hold three cells, so county names land
                    # on every third populated cell
                    if node_count % 3 == 0:
                        # Strip the zero-width spaces the page pads cell text with
                        key = child.text.replace("\u200b", "")
                        if key in COUNTIES:
                            # The next sibling cell holds that county's case count
                            value = int(child.getnext().text.replace("\u200b", "")) if child.getnext() is not None else 0
                            data[key].append(value)
                            found_counties.append(key)
                    node_count += 1
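
    # Zero-fill counties missing from this day's table so each county's series
    # stays aligned with date_strings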
    for county in COUNTIES:
        if county not in found_counties:
            data[county].append(0)
|
print(data)
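
# data maps each county to one count per date, in date_strings order, e.g.
# (hypothetical values): {"Adams": [0, 0, 1, 1, 2, 4, 6], "Allegheny": [12, ...], ...}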