Skip to content

Instantly share code, notes, and snippets.

@airalcorn2
Last active November 18, 2020 05:38
Show Gist options
  • Select an option

  • Save airalcorn2/f5e8d4fbfb2853add193d465a27adf44 to your computer and use it in GitHub Desktop.

Select an option

Save airalcorn2/f5e8d4fbfb2853add193d465a27adf44 to your computer and use it in GitHub Desktop.
Trying to make sense of COVID-19 data given the testing strategies.
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pgeocode
import re
import seaborn as sns
from datetime import datetime
# from mpl_toolkits.basemap import Basemap
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
nyc_msa_fips = {
36119: "Westchester",
34003: "Bergen",
34017: "Hudson",
34031: "Passaic",
36079: "Putnam",
36087: "Rockland",
36103: "Suffolk",
36059: "Nassau",
34023: "Middlesex",
34025: "Monmouth",
34029: "Ocean",
34035: "Somerset",
34013: "Essex",
34039: "Union",
34027: "Morris",
34037: "Sussex",
34019: "Hunterdon",
42103: "Pike",
}
nyc_csa_fips = {
9001: "Fairfield",
9009: "New Haven",
34021: "Mercer",
9005: "Litchfield",
36111: "Ulster",
42089: "Monroe",
36027: "Dutchess",
36071: "Orange",
}
all_nyc_fips = set(nyc_msa_fips) | set(nyc_csa_fips)
state_pops = {
"AL": 4_903_185,
"AK": 731_545,
"AZ": 7_278_717,
"AR": 3_017_825,
"CA": 39_512_223,
"CO": 5_758_736,
"CT": 3_565_287,
"DE": 973_764,
"FL": 21_477_737,
"GA": 10_617_423,
"HI": 1_415_872,
"ID": 1_787_065,
"IL": 12_671_821,
"IN": 6_732_219,
"IA": 3_155_070,
"KS": 2_913_314,
"KY": 4_467_673,
"LA": 4_648_794,
"ME": 1_344_212,
"MD": 6_045_680,
"MA": 6_949_503,
"MI": 9_986_857,
"MN": 5_639_632,
"MS": 2_976_149,
"MO": 6_137_428,
"MT": 1_068_778,
"NE": 1_934_408,
"NV": 3_080_156,
"NH": 1_359_711,
"NJ": 8_882_190,
"NM": 2_096_829,
"NY": 19_453_561,
"NC": 10_488_084,
"ND": 762_062,
"OH": 11_689_100,
"OK": 3_956_971,
"OR": 4_217_737,
"PA": 12_801_989,
"RI": 1_059_361,
"SC": 5_148_714,
"SD": 884_659,
"TN": 6_833_174,
"TX": 28_995_881,
"UT": 3_205_958,
"VT": 623_989,
"VA": 8_535_519,
"WA": 7_614_893,
"WV": 1_792_147,
"WI": 5_822_434,
"WY": 578_759,
}
markers = ".,ov^<>12348spP*hH+xXDd|_"
colors = "bgrcmyk"
def fixed_infection():
# Infect the population.
pop_size = 1_000_000
pop = np.random.rand(pop_size)
prop_infected = 0.01
pop_dis_status = pop > (1.0 - prop_infected)
print(pop_dis_status[:100])
print(sum(pop_dis_status) / pop_size)
# Determine the number of tests over time.
tests_day_one = 1
new_test_r = 0.3
days = np.arange(14)
cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int")
print(cumu_tests)
# Plot the total number of tests over time.
sns.scatterplot(x=days, y=cumu_tests)
plt.show()
# Count the number of detected cases over time.
cumu_detects = []
total_detected = 0
prev_total_tests = 0
ratio_detecteds = []
for (day, total_tests) in enumerate(cumu_tests):
new_detected = sum(pop_dis_status[prev_total_tests:total_tests])
ratio_detected = new_detected / (total_tests - prev_total_tests)
ratio_detecteds.append(ratio_detected)
total_detected += new_detected
cumu_detects.append(total_detected)
prev_total_tests = total_tests
print(
f"Day: {day} Detected: {total_detected} Proportion: {ratio_detected:.2f}",
flush=True,
)
# Plot the total number of detected cases over time.
sns.scatterplot(x=days, y=cumu_detects)
plt.show()
# Plot the ratio of detected cases per day.
sns.scatterplot(x=days, y=ratio_detecteds)
plt.show()
def spreading_infection():
# Infect the population.
full_pop = 1_000_000
num_infected = 1000 # 0.1% of the population.
spread_rate = 0.02
# Determine the number of tests over time.
tests_day_one = 1
new_test_r = 0.3
days = np.arange(14)
cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int")
cumu_detects = []
total_detected = 0
prev_total_tests = 0
ratio_detecteds = []
for (day, total_tests) in enumerate(cumu_tests):
pop_size = full_pop - total_detected
pop = np.random.rand(pop_size)
prop_infected = (num_infected - total_detected) / len(pop)
pop_dis_status = pop > (1.0 - prop_infected)
new_tests = total_tests - prev_total_tests
new_detected = sum(pop_dis_status[:new_tests])
ratio_detected = new_detected / new_tests
ratio_detecteds.append(ratio_detected)
total_detected += new_detected
cumu_detects.append(total_detected)
prev_total_tests = total_tests
num_infected *= 1 + spread_rate
sns.scatterplot(x=days, y=cumu_detects)
plt.show()
sns.scatterplot(x=days, y=ratio_detecteds)
plt.show()
def diff_countries():
# Different countries with different infection and testing rates.
num_countries = 200
infection_rates = np.random.uniform(0.01, 0.1, num_countries)
infection_rates_per_mil = 1_000_000 * infection_rates
# Range from: https://ourworldindata.org/covid-testing#per-capita-tests-by-country
testing_rates = np.random.uniform(10 / 1e6, 10000 / 1e6, num_countries)
testing_rates_per_mil = 1_000_000 * testing_rates
detected_rates = testing_rates * infection_rates
detected_rates_per_mil = 1_000_000 * detected_rates
sns.scatterplot(x=np.log10(detected_rates_per_mil), y=np.log(testing_rates_per_mil))
plt.show()
def states():
url = "https://covidtracking.com/api/states/daily.csv"
df_states = pd.read_csv(url).sort_values("date")
df_states["population"] = df_states["state"].apply(
lambda state: state_pops.get(state, -1)
)
lag = 1
multi_plot = False
(rows, cols) = (10, 5)
if multi_plot:
(fig, axs) = plt.subplots(rows, cols)
else:
fig = plt.figure(figsize=(10, 6))
pos_or_death = "Death"
tot_or_pos = "Total" if pos_or_death == "Positive" else "Positive"
state_test_rates = []
state_death_rates = []
max_per_capita_death = {}
for (state_idx, state) in enumerate(state_pops):
df_state = df_states[df_states["state"] == state]
keep = df_state[df_state["total"] > 1000]
inc_tests_rate_state = (
keep["total"].iloc[lag:].values / keep["total"].iloc[:-lag].values
)
state_test_rates.append(inc_tests_rate_state.mean())
inc_deaths_rate_state = (
keep["death"].iloc[lag:].values / keep["death"].iloc[:-lag].values
)
state_death_rates.append(inc_deaths_rate_state.mean())
(row, col) = (state_idx // cols, state_idx % cols)
tests_per_mil = 1_000_000 * df_state[tot_or_pos.lower()] / state_pops[state]
res_per_mil = 1_000_000 * df_state[pos_or_death.lower()] / state_pops[state]
ax = axs[row][col] if multi_plot else None
sns.scatterplot(
x=tests_per_mil,
y=res_per_mil,
marker=markers[state_idx % len(markers)],
color=colors[state_idx % len(colors)],
ax=ax,
)
if multi_plot:
ax.set_xlabel("")
ax.set_ylabel("")
max_per_capita_death[state] = (
1_000_000
* df_state["deathIncrease"].rolling(7).mean().max()
/ state_pops[state]
)
fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15))
if not multi_plot:
plt.xlabel(f"{tot_or_pos}s per million")
plt.ylabel(f"{pos_or_death}s per million")
plt.show()
max_per_capita_death = pd.DataFrame(
[
{"state": state, "death": death}
for (state, death) in max_per_capita_death.items()
]
)
max_per_capita_death = max_per_capita_death.sort_values("death", ascending=False)
sns.barplot(x="state", y="death", data=max_per_capita_death)
plt.show()
state_death_rates = np.array(state_death_rates)
is_nans = np.isnan(state_death_rates)
state_death_rates = state_death_rates[~is_nans]
state_test_rates = np.array(state_test_rates)
state_test_rates = state_test_rates[~is_nans]
df_states["per_positive"] = (
df_states["positiveIncrease"] / df_states["totalTestResultsIncrease"]
)
fig = plt.figure(figsize=(10, 6))
how_many = 45
for (state_idx, state) in enumerate(state_pops):
df_state = df_states[df_states["state"] == state]
sns.lineplot(
x=np.arange(how_many),
y=df_state["per_positive"].iloc[-how_many:],
marker=markers[state_idx % len(markers)],
)
fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15))
plt.ylabel("Proportion positives")
plt.show()
states = ["FL", "TX", "AZ"]
lag = 60
fig = plt.figure(figsize=(10, 6))
for state in states:
df_state = df_states[df_states["state"] == state]
dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))
avg_pos = df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean()
avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean()
avg_rate = avg_pos / avg_test
sns.lineplot(x=dates.iloc[-lag:], y=avg_rate)
plt.ylim((0, 0.35))
fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
plt.show()
# states = ["FL", "GA", "AL", "NC", "SC", "TX", "MS", "LA", "TN", "AZ"]
states = ["FL", "GA", "AL", "NC", "SC"]
lag = 60
fig = plt.figure(figsize=(10, 6))
for state in states:
df_state = df_states[df_states["state"] == state]
dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))
avg_pos = (
1_000_000
* df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean()
/ state_pops[state]
)
sns.lineplot(x=dates.iloc[-lag:], y=avg_pos)
# plt.ylim((0, 0.35))
fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
plt.show()
date_increase = df_states.groupby("date")["totalTestResultsIncrease"].sum()
dates = [datetime.strptime(str(date), "%Y%m%d") for date in date_increase.index]
sns.lineplot(x=dates[-lag:], y=date_increase.values[-lag:])
plt.show()
fig = plt.figure(figsize=(10, 6))
for state in states:
df_state = df_states[df_states["state"] == state]
dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))
avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean()
sns.lineplot(x=dates.iloc[-lag:], y=avg_test)
fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
plt.show()
sns.scatterplot(x=df_state["total"], y=df_state["positive"])
plt.show()
sns.scatterplot(x=df_state["positive"], y=df_state["death"])
plt.show()
sns.lineplot(x=dates, y=df_state["positiveIncrease"])
plt.show()
sns.lineplot(x=dates, y=df_state["deathIncrease"])
plt.show()
df_al = df_states[df_states["state"] == "AL"]
df_ny = df_states[df_states["state"] == "NY"]
sns.scatterplot(df_ny.iloc[3:]["positive"].values, df_al["positive"].values)
plt.show()
al_diffs = df_al.iloc[1:]["positive"].values - df_al.iloc[:-1]["positive"].values
ny_diffs = (
df_ny.iloc[3 + 1 :]["positive"].values - df_ny.iloc[3:-1]["positive"].values
)
print(al_diffs / ny_diffs)
df_state = df_states[df_states["state"] == "HI"]
df_state["month_day"] = df_state["date"].astype("str").str[4:]
sns.barplot(df_state["month_day"], df_state["positiveIncrease"])
plt.show()
today = df_states["date"].max()
df_today = df_states[df_states["date"] == today]
print(
df_today[["state", "deathIncrease"]].sort_values(
"deathIncrease", ascending=False
)
)
df_today["death_inc_per_capita"] = (
df_today["deathIncrease"] / df_today["population"]
)
print(
df_today[["state", "death_inc_per_capita"]].sort_values(
"death_inc_per_capita", ascending=False
)
)
(x, y, z) = ([], [], [])
states = []
per_capita = True
use_log = False
log_func = np.log if use_log else lambda x: x
for (state_idx, state) in enumerate(state_pops):
df_state = df_today[df_states["state"] == state]
if per_capita:
# x.append((1_000_000 * df_state["positive"] / state_pops[state]).values[0])
x.append(
(log_func(df_state["positive"]) / log_func(df_state["total"])).values[0]
)
y.append((1_000_000 * df_state["death"] / state_pops[state]).values[0])
else:
x.append(
(log_func(df_state["positive"]) / log_func(df_state["total"])).values[0]
)
y.append(
(log_func(df_state["death"]) / log_func(df_state["total"])).values[0]
)
z.append((1_000_000 * df_state["total"] / state_pops[state]).values[0])
states.append(state)
scatter = sns.regplot(x, y)
for (idx, val) in enumerate(x):
scatter.text(
val,
y[idx],
states[idx],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
plt.show()
scatter = sns.regplot(z, y)
for (idx, val) in enumerate(z):
scatter.text(
val,
y[idx],
states[idx],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
plt.show()
scatter = sns.scatterplot(x=x, y=y, size=z)
for (idx, val) in enumerate(x):
scatter.text(
val,
y[idx],
states[idx],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
plt.xlabel("Proportion positive")
plt.ylabel("Deaths per million")
plt.show()
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z)
ax.set_xlabel("positive")
ax.set_ylabel("death")
ax.set_zlabel("total")
plt.show()
state = "FL"
df_state = df_states[df_states["state"] == state]
pos_len = 11
kernel = (
np.array(list(range(1, pos_len)) + [pos_len] + list(range(1, pos_len))[::-1])
/ pos_len
)
cases = df_state.iloc[-(2 * pos_len - 1) :]["positiveIncrease"].values
tests = df_state.iloc[-(2 * pos_len - 1) :]["totalTestResultsIncrease"].values
print(kernel @ cases / tests.sum())
def death_lags():
url = "https://covidtracking.com/api/states/daily.csv"
df_states = pd.read_csv(url).sort_values("date")
df_states["date"] = df_states["date"].apply(lambda x: str(x))
min_date = "20200525"
max_lag = 30
multi_plot = True
(rows, cols) = (10, 5)
if multi_plot:
(fig, axs) = plt.subplots(rows, cols)
else:
fig = plt.figure(figsize=(10, 6))
best_corrs = []
best_lags = []
legend = []
only_inc = False
use_diff = False
use_rank = False
state2cor_lags = {}
for (state_idx, state) in enumerate(state_pops):
df_state = df_states[
(df_states["state"] == state) & (df_states["date"] >= min_date)
]
if use_diff:
avg_pos = df_state["positiveIncrease"].rolling(7).mean().diff().values[7:]
avg_death = df_state["deathIncrease"].rolling(7).mean().diff().values[7:]
else:
avg_pos = df_state["positiveIncrease"].rolling(7).mean().values[6:]
avg_death = df_state["deathIncrease"].rolling(7).mean().values[6:]
if only_inc and (avg_death[0] > avg_death[-1]):
continue
corrs = []
state2cor_lags[state] = {"corrs": [], "lags": []}
for lag in range(1, max_lag + 1):
if use_rank:
corr = stats.kendalltau(avg_pos[:-lag], avg_death[lag:]).correlation
else:
corr = stats.pearsonr(avg_pos[:-lag], avg_death[lag:])[0]
corrs.append(corr)
state2cor_lags[state]["corrs"].append(corr)
state2cor_lags[state]["lags"].append(lag)
best_lag = np.argmax(np.abs(corrs)) + 1
(row, col) = (state_idx // cols, state_idx % cols)
ax = axs[row][col] if multi_plot else None
sns.scatterplot(
x=avg_pos[:-best_lag],
y=avg_death[best_lag:],
marker=markers[state_idx % len(markers)],
color=colors[state_idx % len(colors)],
ax=ax,
)
best_lags.append(best_lag)
best_corrs.append(corrs[best_lag - 1])
legend.append(f"{state}:{best_lag}:{corrs[best_lag - 1]:.2f}")
fig.legend(list(legend), ncol=2, loc=(0.8, 0.15))
plt.show()
sns.scatterplot(x=best_lags, y=best_corrs)
plt.show()
if multi_plot:
(fig, axs) = plt.subplots(rows, cols)
else:
fig = plt.figure(figsize=(10, 6))
for (state_idx, state) in enumerate(state_pops):
(row, col) = (state_idx // cols, state_idx % cols)
ax = axs[row][col] if multi_plot else None
sns.scatterplot(
x=state2cor_lags[state]["lags"],
y=state2cor_lags[state]["corrs"],
marker=markers[state_idx % len(markers)],
color=colors[state_idx % len(colors)],
ax=ax,
)
fig.legend(list(legend), ncol=2, loc=(0.8, 0.15))
plt.show()
corrs = []
n = 30
for samp in range(10000):
steps = 2 * np.random.randint(0, 1 + 1, (2, n)) - 1
positions = np.cumsum(steps, axis=1)
corrs.append(stats.pearsonr(positions[0], positions[1])[0])
sns.scatterplot(x=positions[0], y=positions[1])
plt.show()
sns.distplot(corrs)
plt.show()
def us():
total_pop_us = 328_239_523
url = "https://covidtracking.com/api/us/daily.csv"
df_us = pd.read_csv(url).sort_values("date")
lag = 1
new_pos = df_us["positive"].iloc[lag:].values - df_us["positive"].iloc[:-lag].values
new_tests = df_us["total"].iloc[lag:].values - df_us["total"].iloc[:-lag].values
new_deaths = df_us["death"].iloc[lag:].values - df_us["death"].iloc[:-lag].values
inc_tests_rate_us = (
df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values
)
print(inc_tests_rate_us[-13:].mean())
inc_deaths_rate_us = (
df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values
)
print(inc_deaths_rate_us[-13:].mean())
sns.scatterplot(x=np.arange(len(new_tests)), y=new_tests)
# Plot the actual total number of cases over time.
sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"])
plt.show()
sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["positive"]))
plt.show()
sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"] / total_pop_us)
plt.show()
# Plot the actual total number of tests over time.
sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"])
plt.show()
sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["total"]))
plt.show()
sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"] / total_pop_us)
plt.show()
# Plot the actual ratio of detected cases per day.
sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests)
plt.show()
# Plot the actual number of tests vs. the number of cases.
sns.regplot(df_us["total"], df_us["positive"])
plt.show()
sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["positive"]))
plt.show()
# Plot the actual number of tests vs. the number of deaths.
sns.regplot(df_us["total"], df_us["death"])
plt.show()
sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["death"]))
plt.show()
# Plot the actual number of new tests vs. the number of new deaths.
sns.scatterplot(x=new_tests, y=new_deaths)
plt.show()
sns.scatterplot(x=np.log10(new_tests), y=np.log10(new_deaths))
plt.show()
# Plot the actual number of cases vs. the number of deaths.
sns.scatterplot(x=df_us["positive"], y=df_us["death"])
plt.show()
sns.scatterplot(x=np.log10(df_us["positive"]), y=np.log10(df_us["death"]))
plt.show()
def it():
total_pop_it = 60_317_546
url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
df_it = pd.read_csv(url)
dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True)
df_it["month_day"] = dates[1] + dates[2]
lag = 1
inc_tests_rate_it = (
df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values
)
print(inc_tests_rate_it[-13:].mean())
inc_deaths_rate_it = (
df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
)
print(inc_deaths_rate_it[-13:].mean())
# Plot the actual total number of cases over time.
sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"])
plt.show()
sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["totale_casi"]))
plt.show()
sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"] / total_pop_it)
plt.show()
# Plot the actual total number of tests over time.
sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"])
plt.show()
sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["tamponi"]))
plt.show()
sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"] / total_pop_it)
plt.show()
# Plot the actual ratio of detected cases per day.
new_pos = (
df_it["totale_casi"].iloc[lag:].values - df_it["totale_casi"].iloc[:-lag].values
)
new_tests = df_it["tamponi"].iloc[lag:].values - df_it["tamponi"].iloc[:-lag].values
sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests)
plt.show()
# Plot the ratio of new deaths to new cases over time.
new_deaths = (
df_it["deceduti"].iloc[lag:].values - df_it["deceduti"].iloc[:-lag].values
)
sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=new_deaths / new_pos)
plt.show()
# Plot the ratio of of positive cases increase rate vs. death increase rate.
case_rate_inc = (
df_it["totale_casi"].iloc[lag:].values / df_it["totale_casi"].iloc[:-lag].values
)
death_rate_inc = (
df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
)
sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=case_rate_inc / death_rate_inc)
plt.show()
def lom_and_ven():
total_pop_lom = 10_088_484
total_pop_ven = 4_865_380
url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv"
df_regions = pd.read_csv(url)
dates = df_regions["data"].str.split("T").str.get(0).str.split("-", expand=True)
df_regions["month_day"] = dates[1] + dates[2]
fig = plt.figure(figsize=(10, 6))
pos_or_death = "Death"
test_or_pos = "Positive" if pos_or_death == "Death" else "Test"
if pos_or_death == "Positive":
cat_it = "totale_casi"
else:
cat_it = "deceduti"
regions = ["Lombardia", "Veneto"]
pops = {"Lombardia": total_pop_lom, "Veneto": total_pop_ven}
for region in regions:
df_region = df_regions[df_regions["denominazione_regione"] == region]
tests_per_mil = 1_000_000 * df_region["tamponi"].values / pops[region]
res_per_mil = 1_000_000 * df_region[cat_it].values / pops[region]
scatter_it = sns.scatterplot(x=tests_per_mil, y=res_per_mil)
for (idx, val) in enumerate(tests_per_mil):
scatter_it.text(
val,
res_per_mil[idx],
df_region.iloc[idx]["month_day"].split("T")[0],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
fig.legend(regions, loc=(0.8, 0.25))
plt.xlabel(f"{test_or_pos}s per million")
plt.ylabel(f"{pos_or_death}s per million")
plt.show()
def us_vs_it():
lag = 1
total_pop_us = 328_239_523
total_pop_it = 60_317_546
url = "https://covidtracking.com/api/us/daily.csv"
df_us = pd.read_csv(url).sort_values("date")
inc_tests_rate_us = (
df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values
)
print(inc_tests_rate_us[-13:].mean())
inc_deaths_rate_us = (
df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values
)
print(inc_deaths_rate_us[-13:].mean())
url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
df_it = pd.read_csv(url)
dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True)
df_it["month_day"] = dates[1] + dates[2]
inc_tests_rate_it = (
df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values
)
inc_deaths_rate_it = (
df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
)
threshold_us = 150_000
keep = df_us[df_us["total"] > threshold_us]
sns.scatterplot(keep["total"], keep["death"])
plt.show()
sns.scatterplot(
1_000_000 * keep["total"] / total_pop_us,
1_000_000 * keep["positive"] / total_pop_us,
)
plt.show()
threshold_it = 50000
keep = df_it[df_it["tamponi"] >= threshold_it]
sns.scatterplot(
1_000_000 * keep["tamponi"] / total_pop_it,
1_000_000 * keep["deceduti"] / total_pop_it,
)
plt.show()
sns.scatterplot(
1_000_000 * keep["tamponi"] / total_pop_it,
1_000_000 * keep["totale_casi"] / total_pop_it,
)
plt.show()
# (A, B) = np.polyfit(np.log(keep["tamponi"]), keep["totale_casi"], 1)
# keep["pred_totale_casi"] = A * np.log(keep["tamponi"]) + B
# sns.scatterplot(keep["tamponi"], keep["totale_casi"])
# sns.scatterplot(keep["tamponi"], keep["pred_totale_casi"])
# plt.show()
pos_or_death = "Positive"
if pos_or_death == "Positive":
(cat_it, cat_us) = ("totale_casi", "positive")
else:
(cat_it, cat_us) = ("deceduti", "death")
fig = plt.figure(figsize=(10, 6))
keep = df_it[df_it["tamponi"] >= threshold_it]
tests_per_mil = 1_000_000 * keep["tamponi"].values / total_pop_it
res_per_mil = 1_000_000 * keep[cat_it].values / total_pop_it
scatter_it = sns.scatterplot(tests_per_mil, res_per_mil)
for (idx, val) in enumerate(tests_per_mil):
scatter_it.text(
val,
res_per_mil[idx],
keep.iloc[idx]["month_day"].split("T")[0],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
keep = df_us[df_us["total"] > threshold_us]
tests_per_mil = 1_000_000 * keep["total"].values / total_pop_us
res_per_mil = 1_000_000 * keep[cat_us].values / total_pop_us
scatter_us = sns.scatterplot(tests_per_mil, res_per_mil)
for (idx, val) in enumerate(tests_per_mil):
scatter_us.text(
val,
res_per_mil[idx],
str(keep.iloc[idx]["date"])[4:],
horizontalalignment="left",
verticalalignment="top",
size="small",
color="black",
)
fig.legend(["Italy", "U.S."], loc=(0.8, 0.15))
plt.xlabel("Tests per million")
plt.ylabel(f"{pos_or_death}s per million")
plt.show()
start_cases = 250
steps = 13
rate_1 = inc_tests_rate_it[-steps:].mean()
rate_2 = inc_deaths_rate_us[-steps:].mean()
x = threshold_it * rate_1 ** np.arange(steps)
y = start_cases * rate_2 ** np.arange(steps)
sns.regplot(x, y)
plt.show()
rate_1 = inc_tests_rate_us[-steps:].mean()
rate_2 = inc_deaths_rate_it[-steps:].mean()
x = threshold_us * rate_1 ** np.arange(steps)
y = start_cases * rate_2 ** np.arange(steps)
sns.regplot(x, y)
plt.show()
def total_mortality():
# CSV from: https://gis.cdc.gov/grasp/fluview/mortality.html.
df_totals = pd.read_csv("National_2015-20_Data.csv")
df_totals["TOTAL DEATHS"] = (
df_totals["TOTAL DEATHS"].str.replace(",", "").astype("int")
)
seasons = list(set(df_totals["SEASON"]))
seasons.sort()
fig = plt.figure(figsize=(10, 6))
for season in seasons:
df_season = df_totals[df_totals["SEASON"] == season]
sns.scatterplot(df_season["WEEK"], df_season["TOTAL DEATHS"])
fig.legend(list(seasons), loc=(0.85, 0.15))
plt.xticks(1 + np.arange(52))
plt.show()
def counties():
url = (
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
df_counties = pd.read_csv(url)
today = df_counties["date"].max()
df_counties_today = df_counties[df_counties["date"] == today].sort_values(
"deaths", ascending=False
)
total_cases = df_counties_today["cases"].sum()
total_deaths = df_counties_today["deaths"].sum()
df_counties_today["per_deaths"] = df_counties_today["deaths"] / total_deaths
df_counties_today["per_cases"] = df_counties_today["cases"] / total_cases
df_counties_today["over_rep"] = (
df_counties_today["per_deaths"] - df_counties_today["per_cases"]
)
df_counties_today = df_counties_today.sort_values("over_rep", ascending=False)
df_counties_today["over_rep_rat"] = (
df_counties_today["per_deaths"] / df_counties_today["per_cases"]
)
df_counties_today = df_counties_today.sort_values("over_rep_rat", ascending=False)
df_counties_today = df_counties_today.sort_values("deaths", ascending=False)
sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"])
plt.show()
ny_nj_ct = df_counties_today[
(df_counties_today["state"] == "New York")
| (df_counties_today["state"] == "New Jersey")
| (df_counties_today["state"] == "Connecitcut")
]
total_cases_ny_nj_ct = ny_nj_ct["cases"].sum()
total_deaths_ny_nj_ct = ny_nj_ct["deaths"].sum()
print(total_cases_ny_nj_ct / df_counties_today["cases"].sum())
print(total_deaths_ny_nj_ct / df_counties_today["deaths"].sum())
nyc_csa_deaths = df_counties_today[df_counties_today["county"] == "New York City"][
"deaths"
].iloc[0]
nyc_csa_cases = df_counties_today[df_counties_today["county"] == "New York City"][
"cases"
].iloc[0]
for fips in all_nyc_fips:
df_fips = df_counties_today[df_counties_today["fips"] == fips]
print(df_fips)
nyc_csa_deaths += df_fips["deaths"].iloc[0]
nyc_csa_cases += df_fips["cases"].iloc[0]
nyc_csa_per_deaths = nyc_csa_deaths / df_counties_today["deaths"].sum()
nyc_csa_per_cases = nyc_csa_cases / df_counties_today["cases"].sum()
df_counties_today = df_counties_today.append(
{
"county": "NYC MSA",
"per_deaths": nyc_csa_per_deaths,
"per_cases": nyc_csa_per_cases,
},
ignore_index=True,
)
sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"])
plt.show()
print(nyc_csa_per_deaths)
print(nyc_csa_per_cases)
def surge_counties():
url = (
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
df_counties = pd.read_csv(url)
df_counties.sort_values(["fips", "date"], inplace=True)
df_counties["new_cases"] = df_counties["cases"].diff()
mask = df_counties["fips"] != df_counties["fips"].shift(1)
df_counties["new_cases"][mask] = np.nan
df_counties["new_deaths"] = df_counties["deaths"].diff()
df_counties["new_deaths"][mask] = np.nan
today = df_counties["date"].max()
df_counties_today = df_counties[df_counties["date"] == today]
df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1)
df_counties_today["fips"] = df_counties_today["fips"].apply(
lambda fips: str(int(fips)).zfill(5)
)
url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
df_population = pd.read_csv(url_population, encoding="ISO-8859-1")
df_population["STATE"] = df_population["STATE"].apply(
lambda state_fips: str(state_fips).zfill(2)
)
df_population["COUNTY"] = df_population["COUNTY"].apply(
lambda county_fips: str(county_fips).zfill(3)
)
df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"]
combined = df_counties_today.merge(
df_population[["FIPS", "POPESTIMATE2019"]],
left_on="fips",
right_on="FIPS",
how="left",
)
combined["new_cases_per_mil"] = (
1000000 * combined["new_cases"] / combined["POPESTIMATE2019"]
)
combined["new_deaths_per_mil"] = (
1000000 * combined["new_deaths"] / combined["POPESTIMATE2019"]
)
combined[
[
"county",
"state",
"new_cases_per_mil",
"new_deaths_per_mil",
"POPESTIMATE2019",
]
].dropna().to_csv("surging.csv", index=False)
def ca_counties():
url = "https://covidtracking.com/api/states/daily.csv"
df_states = pd.read_csv(url).sort_values("date")
county2data = {}
region_pops = {}
df_ca = pd.read_csv("ca-counties.csv")
for (row_idx, row) in df_ca.iterrows():
region = row["region"]
population = row["population"]
county2data[row["county"]] = {"region": region, "population": population}
region_pops[region] = region_pops.get(region, 0) + int(
population.replace(",", "")
)
url = (
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
df_counties = pd.read_csv(url)
df_ca_counties = df_counties[df_counties["state"] == "California"]
df_ca_counties["region"] = df_ca_counties["county"].apply(
lambda county: county2data.get(county, {"region": "None"})["region"]
)
df_ca_counties = df_ca_counties[df_ca_counties["region"] != "None"]
df_ca_counties = df_ca_counties[df_ca_counties["date"] >= "2020-05-01"]
df_ca_counties["date"] = df_ca_counties["date"]
df_ca_counties.sort_values(["county", "date"], inplace=True)
df_ca_counties["date"] = df_ca_counties["date"].apply(
lambda x: datetime.strptime(str(x), "%Y-%m-%d")
)
df_ca_counties["new_cases"] = df_ca_counties["cases"].diff()
mask = df_ca_counties["county"] != df_ca_counties["county"].shift(1)
df_ca_counties["new_cases"][mask] = np.nan
df_ca_counties["new_deaths"] = df_ca_counties["deaths"].diff()
df_ca_counties["new_deaths"][mask] = np.nan
region_new_cases = (
df_ca_counties.groupby(["date", "region"])["new_cases"].sum().reset_index()
)
region_new_deaths = (
df_ca_counties.groupby(["date", "region"])["new_deaths"].sum().reset_index()
)
region_new_deaths["population"] = region_new_deaths["region"].apply(
lambda region: region_pops[region]
)
region_new_deaths["per_mil"] = (
1_000_000 * region_new_deaths["new_deaths"] / region_new_deaths["population"]
)
fig = plt.figure(figsize=(10, 6))
ca_regions = list(region_pops)
for region in ca_regions:
df_region = region_new_deaths[region_new_deaths["region"] == region]
sns.lineplot(x=df_region["date"], y=df_region["per_mil"].rolling(7).mean())
state_pops = {"AZ": 7_278_717, "FL": 21_477_737}
keep_states = list(state_pops)
for state in keep_states:
df_state = df_states[df_states["state"] == state]
df_state = df_state[df_state["date"] >= 20_200_501]
df_state["date"] = df_state["date"].apply(
lambda x: datetime.strptime(str(x), "%Y%m%d")
)
df_state["per_mil"] = 1_000_000 * df_state["deathIncrease"] / state_pops[state]
sns.lineplot(x=df_state["date"], y=df_state["per_mil"].rolling(7).mean())
fig.legend(ca_regions + keep_states, loc=(0.85, 0.15))
plt.show()
def nursing_homes():
# ZIP to FIPS from: https://data.world/niccolley/us-zipcode-to-county-state#.
f_zip_fips = "ZIP-COUNTY-FIPS_2017-06.csv"
df_zip_fips = pd.read_csv(f_zip_fips)
df_zip_fips["ZIP"] = df_zip_fips["ZIP"].apply(
lambda zip_code: str(zip_code).zfill(5)
)
df_zip_fips["STCOUNTYFP"] = df_zip_fips["STCOUNTYFP"].apply(
lambda zip_code: str(zip_code).zfill(5)
)
zip2fips = dict(zip(df_zip_fips["ZIP"], df_zip_fips["STCOUNTYFP"]))
# Nursing home data.
url_nursing_homes = "https://opendata.arcgis.com/datasets/78c58035fb3942ba82af991bb4476f13_0.csv?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"
df_nursing_homes = pd.read_csv(url_nursing_homes)
df_nursing_homes["ZIP"] = df_nursing_homes["ZIP"].apply(
lambda zip_code: str(zip_code).zfill(5)
)
df_nursing_homes["FIPS"] = df_nursing_homes["ZIP"].apply(
lambda zip_code: zip2fips.get(zip_code, "")
)
print(len(df_nursing_homes))
# Remove rows with missing ZIP codes.
df_nursing_homes = df_nursing_homes[df_nursing_homes["FIPS"] != ""]
print(len(df_nursing_homes))
# Some rows are missing beds data (i.e., coded as -999). Impute the missing data
# using the median number of beds per nursing home for rows without missing data.
median_beds = df_nursing_homes[df_nursing_homes["BEDS"] > 0]["BEDS"].median()
df_nursing_homes.loc[df_nursing_homes["BEDS"] < 0, "BEDS"] = median_beds
# Get nursing home data per county.
beds_by_fips = df_nursing_homes.groupby("FIPS", as_index=False)["BEDS"].sum()
homes_by_fips = pd.DataFrame(df_nursing_homes["FIPS"].value_counts())
homes_by_fips.columns = ["HOMES"]
homes_by_fips["FIPS"] = homes_by_fips.index
# County COVID-19 data.
url_counties = (
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
df_counties = pd.read_csv(url_counties)
today = df_counties["date"].max()
df_counties_today = df_counties[df_counties["date"] == today].sort_values(
"deaths", ascending=False
)
df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1)
df_counties_today["fips"] = df_counties_today["fips"].apply(
lambda fips: str(int(fips)).zfill(5)
)
# Combine with population data.
url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
df_population = pd.read_csv(url_population, encoding="ISO-8859-1")
df_population["STATE"] = df_population["STATE"].apply(
lambda state_fips: str(state_fips).zfill(2)
)
df_population["COUNTY"] = df_population["COUNTY"].apply(
lambda county_fips: str(county_fips).zfill(3)
)
df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"]
combined = df_counties_today.merge(
df_population[["FIPS", "POPESTIMATE2019"]],
left_on="fips",
right_on="FIPS",
how="left",
)
# Combine with beds data.
combined = combined.merge(beds_by_fips, left_on="fips", right_on="FIPS", how="left")
# Fill in counties without nursing home data.
combined["BEDS"] = combined["BEDS"].replace(np.nan, 0)
# Combine with homes data.
combined = combined.merge(
homes_by_fips, left_on="fips", right_on="FIPS", how="left"
)
# Fill in counties without nursing home data.
combined["HOMES"] = combined["HOMES"].replace(np.nan, 0)
# New York City FIPS: https://data.ny.gov/Government-Finance/NY-Municipalities-and-County-FIPS-codes/79vr-2kdi.
nyc_fips = {"36005", "36047", "36061", "36081", "36085"}
nyc_beds = nyc_homes = nyc_pop = 0
for fips in nyc_fips:
nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0]
nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0]
nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[
0
]
combined.loc[combined["county"] == "New York City", "BEDS"] = nyc_beds
combined.loc[combined["county"] == "New York City", "HOMES"] = nyc_homes
combined.loc[combined["county"] == "New York City", "POPESTIMATE2019"] = nyc_pop
# Add ZIP codes.
fips2zip = {f: z for (z, f) in zip2fips.items()}
combined["zip"] = combined["fips"].apply(lambda fips: fips2zip.get(fips, fips))
# Add latitudes and longitudes.
nomi = pgeocode.Nominatim("us")
(combined["lat"], combined["long"]) = zip(
*combined["zip"].map(
lambda zip_code: tuple(
nomi.query_postal_code(zip_code)[["latitude", "longitude"]]
)
)
)
combined.loc[combined["county"] == "New York City", "lat"] = 40.7128
combined.loc[combined["county"] == "New York City", "long"] = -74.0060
# Exclude locations without latitude/longitude coordinates.
print(len(combined))
final = combined[~combined["lat"].isnull()]
print(len(final))
# No obvious relationship.
sns.scatterplot(final["BEDS"], final["deaths"])
plt.show()
sns.scatterplot(final["HOMES"], final["deaths"])
plt.show()
sns.scatterplot(final["cases"], final["deaths"])
plt.show()
# Add New York MSA.
nyc_beds = final[final["county"] == "New York City"]["BEDS"].iloc[0]
nyc_homes = final[final["county"] == "New York City"]["HOMES"].iloc[0]
nyc_pop = final[final["county"] == "New York City"]["POPESTIMATE2019"].iloc[0]
nyc_cases = final[final["county"] == "New York City"]["cases"].iloc[0]
nyc_deaths = final[final["county"] == "New York City"]["deaths"].iloc[0]
for fips in nyc_msa_fips:
fips = str(fips).zfill(5)
nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0]
nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0]
nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[
0
]
nyc_cases += df_counties_today[df_counties_today["fips"] == fips]["cases"].iloc[
0
]
nyc_deaths += df_counties_today[df_counties_today["fips"] == fips][
"deaths"
].iloc[0]
final = final.append(
{
"county": "NYC MSA",
"cases": nyc_cases,
"deaths": nyc_deaths,
"BEDS": nyc_beds,
"HOMES": nyc_homes,
"POPESTIMATE2019": nyc_pop,
"lat": 40.7128,
"long": -74.0060,
},
ignore_index=True,
)
scale = 500
final["deaths_per_mil"] = 1_000_000 * final["deaths"] / final["POPESTIMATE2019"]
which_zs = ["POPESTIMATE2019", "BEDS", "HOMES", "cases", "deaths", "deaths_per_mil"]
for which_z in which_zs:
m = Basemap(
llcrnrlon=-119,
llcrnrlat=22,
urcrnrlon=-64,
urcrnrlat=49,
projection="lcc",
lat_1=33,
lat_2=45,
lon_0=-95,
)
m.drawmapboundary()
m.drawcoastlines()
(x, y) = m(list(final["long"]), list(final["lat"]))
z = final[which_z]
log_z = np.log(z + 1)
norm = matplotlib.colors.Normalize(
vmin=log_z.min(), vmax=log_z.max(), clip=True
)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.coolwarm)
colors = mapper.to_rgba(log_z)
z = z / z.max()
m.scatter(x, y, c=colors, s=scale * z, alpha=0.5)
plt.show()
def maricopa_mobility():
df = pd.read_csv("Global_Mobility_Report.csv")
df_az = df[df["sub_region_1"] == "Arizona"]
df_mari = df_az[df_az["sub_region_2"] == "Maricopa County"]
df_mari["date"] = df_mari["date"].apply(
lambda x: datetime.strptime(str(x), "%Y-%m-%d")
)
mobility_cols = [
"retail_and_recreation_percent_change_from_baseline",
"grocery_and_pharmacy_percent_change_from_baseline",
"parks_percent_change_from_baseline",
"transit_stations_percent_change_from_baseline",
"workplaces_percent_change_from_baseline",
"residential_percent_change_from_baseline",
]
(rows, cols) = (2, 3)
(fig, axs) = plt.subplots(rows, cols)
for (col_idx, mobility_col) in enumerate(mobility_cols):
(row, col) = (col_idx // cols, col_idx % cols)
sns.lineplot(df_mari["date"], df_mari[mobility_col], ax=axs[row][col])
plt.show()
def excess():
url = "https://data.cdc.gov/api/views/xkkf-xrst/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target="
df = pd.read_csv(url)
state_pop_mils = {
"Florida": 21.48,
"Texas": 29,
"New Jersey": 8.882,
"New York": 11.051,
"New York City": 8.339,
}
state_excesses = {}
for state in state_pop_mils:
df_state = df[df["State"] == state]
lower_excess = df_state["Total Excess Lower Estimate in 2020"].max()
higher_excess = df_state["Total Excess Higher Estimate in 2020"].max()
excess = (lower_excess + higher_excess) / 2
state_excesses[state] = excess
for (state, excess) in state_excesses.items():
print(state)
print(excess / state_pop_mils[state])
all_ny_excess = state_excesses["New York"] + state_excesses["New York City"]
all_ny_pop_mil = state_pop_mils["New York"] + state_pop_mils["New York City"]
print("New York with New York City")
print(all_ny_excess / all_ny_pop_mil)
nyc_deaths = df[
(df["State"] == "New York City")
& (df["Week Ending Date"] >= "2020")
& (df["Type"] == "Predicted (weighted)")
& (df["Outcome"] == "All causes")
][["Average Expected Count", "Observed Number"]].values
print((nyc_deaths[:, 1] - nyc_deaths[:, 0]).sum())
lows = df.groupby("State")["Total Excess Lower Estimate in 2020"].max()
highs = df.groupby("State")["Total Excess Higher Estimate in 2020"].max()
avgs = (lows + highs) / 2
print(avgs.sum() - avgs[avgs.index == "United States"])
def canada():
url = "https://raw.githubusercontent.com/ishaberry/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv"
df = pd.read_csv(url)
df_on = df[df["province"] == "Ontario"]
dates = df_on["date_testing"].apply(lambda x: datetime.strptime(str(x), "%d-%m-%Y"))
sns.lineplot(dates, df_on["testing"].rolling(7).mean())
plt.show()
def age_deaths():
url = "https://data.cdc.gov/api/views/vsak-wrfu/rows.csv?accessType=DOWNLOAD"
df = pd.read_csv(url)
age_groups = [
"Under 1 year",
"1-4 years",
"5-14 years",
"15-24 years",
"25-34 years",
"35-44 years",
"45-54 years",
"55-64 years",
"65-74 years",
"75-84 years",
"85 years and over",
]
df_age_sex = df.groupby(["MMWR Week", "Age Group"])[
["Total Deaths", "COVID-19 Deaths"]
].sum()
df_age_sex = df_age_sex.reset_index()
fig = plt.figure(figsize=(10, 6))
for age_group in age_groups:
df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"])
fig.legend(age_groups, loc=(0.85, 0.15))
plt.show()
fig = plt.figure(figsize=(10, 6))
for age_group in age_groups:
df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
sns.lineplot(
df_age["MMWR Week"], df_age["COVID-19 Deaths"] / df_age["Total Deaths"]
)
fig.legend(age_groups, loc=(0.85, 0.15))
plt.show()
fig = plt.figure(figsize=(10, 6))
for age_group in age_groups[3:]:
df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
max_deaths = df_age["COVID-19 Deaths"].max()
sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"] / max_deaths)
fig.legend(age_groups[3:], loc=(0.85, 0.15))
plt.show()
df_age_totals = df_age_sex.groupby("Age Group")["COVID-19 Deaths"].sum().to_dict()
fig = plt.figure(figsize=(10, 6))
for age_group in age_groups:
df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
cum_totals = df_age["COVID-19 Deaths"].cumsum()
sns.lineplot(df_age["MMWR Week"], cum_totals / df_age_totals[age_group])
fig.legend(age_groups, loc=(0.85, 0.15))
plt.show()
def death_risk():
url = "https://data.cdc.gov/api/views/y5bj-9g5w/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target="
df = pd.read_csv(url)
comp_states = [
"New York City",
"New York",
"New Jersey",
"Louisiana",
"Massachusetts",
"Mississippi",
"Florida",
"Alabama",
]
comp_years = [2019, 2020]
comp_age_groups = [
"25-44 years",
"65-74 years",
"75-84 years",
"85 years and older",
]
comp_age_groups = df["Age Group"].unique()
data = []
for comp_state in comp_states:
print(f"\nJurisdiction: {comp_state}")
row = {"state": comp_state}
for comp_age_group in comp_age_groups:
print(f"\nAge Group: {comp_age_group}")
death_data = []
for comp_year in comp_years:
deaths = df[
(df["Jurisdiction"] == comp_state)
& (df["Year"] == comp_year)
& (df["Week Ending Date"] < "10")
& (df["Age Group"] == comp_age_group)
& (df["Type"] == "Unweighted")
]["Number of Deaths"].sum()
if comp_state == "New York":
deaths += df[
(df["Jurisdiction"] == "New York City")
& (df["Year"] == comp_year)
& (df["Week Ending Date"] < "10")
& (df["Age Group"] == comp_age_group)
& (df["Type"] == "Unweighted")
]["Number of Deaths"].sum()
death_data.append(deaths)
print(f"{comp_year}: {int(deaths)}")
per_inc = 100 * (death_data[1] / death_data[0] - 1)
col = f"{comp_age_group}"
row[col] = per_inc
print(f"Percent Increase: {per_inc:.2f}")
data.append(row)
df_deaths = pd.DataFrame(data)
df_melted = df_deaths.melt(
"state", var_name="age_group", value_name="deaths % inc."
)
sns.barplot(x="state", y="deaths % inc.", hue="age_group", data=df_melted)
plt.show()
def obesity():
us_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_United%20States%20of%20America.csv"
de_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Germany.csv"
vn_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Viet%20Nam.csv"
urls = [us_url, de_url, vn_url]
dfs = []
for url in urls:
dfs.append(pd.read_csv(url, encoding="ISO-8859-1"))
df = pd.concat(dfs)
keep_columns = [
column
for column in df.columns
if (("Prevalence" in column) and ("interval" not in column))
][2:]
country_col = "Country/Region/World"
keep_columns.append(country_col)
df_2016 = df[(df["Sex"] == "Men") & (df["Year"] == 2016)][keep_columns]
cols = {"bmi": [], "percent": [], "country": []}
for (row_idx, row) in df_2016.iterrows():
country = row[country_col]
for col in keep_columns:
if col == country_col:
continue
bmi_str = re.search("BMI.*m²", col).group()
bmi_str = bmi_str.replace("BMI ", "")
bmi_str = bmi_str.replace("BMI", "")
bmi_str = bmi_str.replace(" kg/m²", "")
cols["bmi"].append(bmi_str)
cols["percent"].append(row[col])
cols["country"].append(country)
df = pd.DataFrame(cols)
sns.barplot(x="bmi", y="percent", hue="country", data=df)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment