|
# Scrape county-level COVID-19 case counts from the PA Department of Health,
# using Wayback Machine snapshots for past days and the live page for today.
import requests
import json
from lxml import html
from datetime import timedelta, date
|
URL_BASE = "https://archive.org/wayback/available?url=https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx&timestamp="
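# The availability endpoint returns JSON shaped roughly like
# {"archived_snapshots": {"closest": {"url": "..."}}}; the fetch loop below
# follows that closest-snapshot URL to the archived page itself.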
|
COUNTIES = [
    "Adams",
    "Allegheny",
    "Armstrong",
    "Beaver",
    "Bedford",
    "Berks",
    "Blair",
    "Bradford",
    "Bucks",
    "Butler",
    "Cambria",
    "Cameron",
    "Carbon",
    "Centre",
    "Chester",
    "Clarion",
    "Clearfield",
    "Clinton",
    "Columbia",
    "Crawford",
    "Cumberland",
    "Dauphin",
    "Delaware",
    "Elk",
    "Erie",
    "Fayette",
    "Forest",
    "Franklin",
    "Fulton",
    "Greene",
    "Huntingdon",
    "Indiana",
    "Jefferson",
    "Juniata",
    "Lackawanna",
    "Lancaster",
    "Lawrence",
    "Lebanon",
    "Lehigh",
    "Luzerne",
    "Lycoming",
    "McKean",
    "Mercer",
    "Mifflin",
    "Monroe",
    "Montgomery",
    "Montour",
    "Northampton",
    "Northumberland",
    "Perry",
    "Philadelphia",
    "Pike",
    "Potter",
    "Schuylkill",
    "Snyder",
    "Somerset",
    "Sullivan",
    "Susquehanna",
    "Tioga",
    "Union",
    "Venango",
    "Warren",
    "Washington",
    "Wayne",
    "Westmoreland",
    "Wyoming",
    "York",
]
|
urls = {}
|
def daterange(start_date, end_date):
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)
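# Yields start_date (inclusive) through end_date (exclusive), e.g.
# daterange(date(2020, 3, 19), date(2020, 3, 21)) yields Mar 19 and Mar 20.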
|
start_date = date(2020, 3, 19)
end_date = date(2020, 3, 25)
|
date_strings = []
|
# Get historical data: one availability lookup per day, pinned to late evening
# (Wayback timestamps are formatted YYYYMMDDhhmmss, so 230000 = 23:00:00)
for single_date in daterange(start_date, end_date):
    date_strings.append(single_date.strftime("%Y-%m-%d"))
    urls[single_date] = URL_BASE + single_date.strftime("%Y%m%d230000")
|
# Get today (hardcoded as 2020-03-26): hit the live page directly
date_strings.append(date(2020, 3, 26).strftime("%Y-%m-%d"))
urls[date(2020, 3, 26)] = "https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx"
|
data = {}
for county in COUNTIES:
    data[county] = []
|
for snapshot_date, url in urls.items():
    # Behave differently for a live page vs Internet Archive
    if snapshot_date.strftime("%Y-%m-%d") == date_strings[-1]:
        r = requests.get(url, allow_redirects=True)
    else:
        r = requests.get(url, allow_redirects=True)
        r = requests.get(json.loads(r.content)["archived_snapshots"]["closest"]["url"], allow_redirects=True)
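        # (first response was availability JSON; the second request fetched the snapshot itself)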
|
    node_count = 0
    found_counties = []
    root = html.fromstring(r.content)
    for c1 in root.iter():
        if c1.tag == "tbody":
            for child in c1.iter():
                if child.tag == "td" and child.text:
                    # Skip leading cells (e.g. headers) until the first county name
                    if node_count == 0 and child.text not in COUNTIES:
                        continue
                    # Each row appears to hold three cells, so county names land
                    # on every third populated cell
                    if node_count % 3 == 0:
                        # Strip the zero-width spaces the page pads cell text with
                        key = child.text.replace("\u200b", "")
                        if key in COUNTIES:
                            # The next sibling cell holds that county's case count
                            value = int(child.getnext().text.replace("\u200b", "")) if child.getnext() is not None else 0
                            data[key].append(value)
                            found_counties.append(key)
                    node_count += 1
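
    # Zero-fill counties missing from this day's table so each county's series
    # stays aligned with date_strings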
    for county in COUNTIES:
        if county not in found_counties:
            data[county].append(0)
|
print(data)
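
# data maps each county to one count per date, in date_strings order, e.g.
# (hypothetical values): {"Adams": [0, 0, 1, 1, 2, 4, 6], "Allegheny": [12, ...], ...}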