Last active
July 22, 2020 18:07
-
-
Save jorendorff/2cc339cdfe56ad00813b310a5d29f1eb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Load usafacts.org dataset on COVID-19 spread per U.S. county over time. | |
To use this, you need a copy of covid_confirmed_usafacts.csv,
which you can download at
<https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv>.
I got there from here:
<https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/>.
""" | |
import csv | |
from datetime import date | |
import re | |
# Month and day may be written with or without a leading zero ("1/5/20" or
# "01/05/20"); the year is always exactly two digits. The day is only loosely
# constrained here -- date() performs the real calendar validation.
_DATE_RE = re.compile(r'(0?[1-9]|1[0-2])/(0?[1-9]|[1-9]\d)/(\d\d)')


def parse_date(s):
    """Parse a date in the form mm/dd/yy.

    Month and day may each be one or two digits (a leading zero is optional);
    the two-digit year is interpreted as 20yy.

    Returns a datetime.date.

    Raises ValueError if s does not have the expected shape, or if it names
    a day that does not exist (for example "2/30/20"). In the latter case the
    underlying error from date() is attached as the exception's cause.
    """
    # fullmatch: the whole string must be the date. (A `$` anchor would also
    # accept a trailing newline.)
    m = _DATE_RE.fullmatch(s)
    if m is None:
        raise ValueError(f"invalid date {s!r} (expected mm/dd/yy)")
    mm, dd, yy = map(int, m.groups())
    try:
        return date(2000 + yy, mm, dd)
    except ValueError as exc:
        raise ValueError(f"invalid date {s!r} ({exc})") from exc
class County:
    """Record describing one county and its case counts.

    Attributes:
        fips: the county's FIPS code, stored as given.
        name: the county's name.
        state: the county's state, stored as given.
        state_fips: the state's FIPS code, stored as given.
        cases_by_date: a dict built from the cases_by_date argument.
    """

    def __init__(self, fips, name, state, state_fips, cases_by_date):
        # Identifying fields are kept exactly as the caller supplied them.
        self.fips, self.name = fips, name
        self.state, self.state_fips = state, state_fips
        # Always build a fresh dict so the county owns its own mapping; the
        # argument may be a mapping or an iterable of (key, value) pairs.
        self.cases_by_date = dict(cases_by_date)
# Header of the county FIPS column. load() decodes the file as "utf-8-sig",
# which drops a leading byte-order mark, so no BOM appears in the key.
COUNTY_FIPS_KEY = 'countyFIPS'


def load(path="covid_confirmed_usafacts.csv"):
    """Load county COVID-19 data.

    path is the CSV file to read; it defaults to covid_confirmed_usafacts.csv
    in the current directory.

    Each row must have the columns countyFIPS, "County Name", "State" and
    "stateFIPS"; every remaining column is treated as a date header (parsed
    with parse_date) whose cell holds an integer case count.

    Returns a dictionary that maps FIPS county codes to County objects.

    Raises ValueError if a county code appears more than once, or if a date
    header or a numeric cell cannot be parsed; KeyError if a required column
    is missing.
    """
    counties = {}
    # utf-8-sig decodes the file the same way on every platform and strips a
    # BOM if one is present; newline="" is what the csv module asks for.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # Pop the identifying columns so that only date columns remain.
            fips = int(row.pop(COUNTY_FIPS_KEY))
            name = row.pop("County Name")
            state = row.pop("State")
            state_fips = int(row.pop("stateFIPS"))
            if fips == 0 and name == "Statewide Unallocated":
                # Ignore the data in these rows
                continue
            if fips in counties:
                raise ValueError(
                    f"County {fips} ({name}, {state}) appears more than once in the CSV")
            data = [(parse_date(date_str), int(ncases_str))
                    for date_str, ncases_str in row.items()]
            counties[fips] = County(fips, name, state, state_fips, data)
    return counties
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Attempt to reproduce the "Hot 50" counties list by @EthicalSkeptic. | |
This reproduces the list of counties in
<https://twitter.com/EthicalSkeptic/status/1285297329470398465>.
NOTE: THIS IS NOT AN ENDORSEMENT. The formula is clearly flawed, and I think
@EthicalSkeptic's work generally is so rife with confirmation bias (assuming
good faith) that it should be disregarded. (I feel bad saying so, but I also
don't want my part in this misrepresented, so I have little choice but to
say clearly what I think.)
I did this to see if this one formula produces consistent results.
Conclusion: The output fluctuates greatly from day to day. Noise in the input
is amplified. For example, the "heat" for Los Angeles County, CA on consecutive
days is:
209, 237, 2725, 231, -162, -461, 578, -1550
It flaps between being one of the hottest counties in the country and one of
the coldest. Needless to say, this does not correspond to anything actually
happening in the real world. L.A. County is home to some ten million people;
it is either #1 or #2 in daily new cases every day over that time period.
""" | |
import dataset | |
from datetime import date, timedelta | |
def new_cases(county, d):
    """Return how many new cases the county reported on date d.

    This is the county's case count for d minus its count for the day
    before. Both dates must be keys of county.cases_by_date; otherwise the
    lookup raises KeyError.
    """
    totals = county.cases_by_date
    count_today = totals[d]
    count_yesterday = totals[d - timedelta(days=1)]
    return count_today - count_yesterday
def heat(county, d):
    """Return the county's "heat" on date d, as defined by @EthicalSkeptic.

    Heat is the number of new cases on d minus the number of new cases on
    the date seven days earlier.

    The weakness of this measure is that reporting is uneven, so the daily
    new-case figure fluctuates quite a bit from one day to the next.
    """
    today_new = new_cases(county, d)
    week_ago_new = new_cases(county, d - timedelta(days=7))
    return today_new - week_ago_new
def hot50(counties, d):
    """Rank every county in the dataset by its "heat" on date d.

    counties is a dictionary whose values are county objects. The result is
    a list of (county, heat) pairs covering all of them -- not just fifty --
    ordered from the highest heat to the lowest.
    """
    ranked = []
    for county in counties.values():
        ranked.append((county, heat(county, d)))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def main():
    """Print the "Hot N" county ranking for 8 consecutive days.

    For each day starting at 2020-07-14, prints the date, then the N
    counties with the highest heat as "rank. heat name, state". If Los
    Angeles County, CA is not among those N, a "..." line and its own
    rank/heat line follow. A blank line ends each day's listing.

    Raises ValueError if the dataset does not contain exactly one Los
    Angeles County, CA.
    """
    counties = dataset.load()
    N = 50
    TARGET = "Los Angeles County", "CA"
    first_day = date(2020, 7, 14)

    # Show the "Hot N" for 8 consecutive days.
    for t in range(8):
        when = first_day + timedelta(days=t)
        hot = hot50(counties, when)
        print(when)
        # Slice before enumerating so only the N printed rows are visited.
        for rank, (c, h) in enumerate(hot[:N], start=1):
            print(f"{rank:2d}. {h:5d} {c.name}, {c.state}")

        # Show L.A. County even if it wasn't in the top N for this day.
        matches = [(i, hh) for i, (hc, hh) in enumerate(hot)
                   if (hc.name, hc.state) == TARGET]
        if len(matches) != 1:
            # Fail with an explicit message instead of an unpacking error.
            raise ValueError(
                f"expected exactly one entry for {TARGET[0]}, {TARGET[1]} "
                f"in the dataset, found {len(matches)}")
        [(i_la, h_la)] = matches
        if i_la >= N:
            print("...")
            print(f"{i_la + 1:2d}. {h_la:5d} {TARGET[0]}, {TARGET[1]}")
        print()


# Run the report only when executed as a script, so that importing this
# module for its functions does not load the dataset or print anything.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment