Last active
November 18, 2020 05:38
-
-
Save airalcorn2/f5e8d4fbfb2853add193d465a27adf44 to your computer and use it in GitHub Desktop.
Trying to make sense of COVID-19 data given the testing strategies.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import matplotlib | |
| import matplotlib.cm as cm | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import pgeocode | |
| import re | |
| import seaborn as sns | |
| from datetime import datetime | |
| # from mpl_toolkits.basemap import Basemap | |
| from mpl_toolkits.mplot3d import Axes3D | |
| from scipy import stats | |
| nyc_msa_fips = { | |
| 36119: "Westchester", | |
| 34003: "Bergen", | |
| 34017: "Hudson", | |
| 34031: "Passaic", | |
| 36079: "Putnam", | |
| 36087: "Rockland", | |
| 36103: "Suffolk", | |
| 36059: "Nassau", | |
| 34023: "Middlesex", | |
| 34025: "Monmouth", | |
| 34029: "Ocean", | |
| 34035: "Somerset", | |
| 34013: "Essex", | |
| 34039: "Union", | |
| 34027: "Morris", | |
| 34037: "Sussex", | |
| 34019: "Hunterdon", | |
| 42103: "Pike", | |
| } | |
| nyc_csa_fips = { | |
| 9001: "Fairfield", | |
| 9009: "New Haven", | |
| 34021: "Mercer", | |
| 9005: "Litchfield", | |
| 36111: "Ulster", | |
| 42089: "Monroe", | |
| 36027: "Dutchess", | |
| 36071: "Orange", | |
| } | |
| all_nyc_fips = set(nyc_msa_fips) | set(nyc_csa_fips) | |
| state_pops = { | |
| "AL": 4_903_185, | |
| "AK": 731_545, | |
| "AZ": 7_278_717, | |
| "AR": 3_017_825, | |
| "CA": 39_512_223, | |
| "CO": 5_758_736, | |
| "CT": 3_565_287, | |
| "DE": 973_764, | |
| "FL": 21_477_737, | |
| "GA": 10_617_423, | |
| "HI": 1_415_872, | |
| "ID": 1_787_065, | |
| "IL": 12_671_821, | |
| "IN": 6_732_219, | |
| "IA": 3_155_070, | |
| "KS": 2_913_314, | |
| "KY": 4_467_673, | |
| "LA": 4_648_794, | |
| "ME": 1_344_212, | |
| "MD": 6_045_680, | |
| "MA": 6_949_503, | |
| "MI": 9_986_857, | |
| "MN": 5_639_632, | |
| "MS": 2_976_149, | |
| "MO": 6_137_428, | |
| "MT": 1_068_778, | |
| "NE": 1_934_408, | |
| "NV": 3_080_156, | |
| "NH": 1_359_711, | |
| "NJ": 8_882_190, | |
| "NM": 2_096_829, | |
| "NY": 19_453_561, | |
| "NC": 10_488_084, | |
| "ND": 762_062, | |
| "OH": 11_689_100, | |
| "OK": 3_956_971, | |
| "OR": 4_217_737, | |
| "PA": 12_801_989, | |
| "RI": 1_059_361, | |
| "SC": 5_148_714, | |
| "SD": 884_659, | |
| "TN": 6_833_174, | |
| "TX": 28_995_881, | |
| "UT": 3_205_958, | |
| "VT": 623_989, | |
| "VA": 8_535_519, | |
| "WA": 7_614_893, | |
| "WV": 1_792_147, | |
| "WI": 5_822_434, | |
| "WY": 578_759, | |
| } | |
| markers = ".,ov^<>12348spP*hH+xXDd|_" | |
| colors = "bgrcmyk" | |
| def fixed_infection(): | |
| # Infect the population. | |
| pop_size = 1_000_000 | |
| pop = np.random.rand(pop_size) | |
| prop_infected = 0.01 | |
| pop_dis_status = pop > (1.0 - prop_infected) | |
| print(pop_dis_status[:100]) | |
| print(sum(pop_dis_status) / pop_size) | |
| # Determine the number of tests over time. | |
| tests_day_one = 1 | |
| new_test_r = 0.3 | |
| days = np.arange(14) | |
| cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int") | |
| print(cumu_tests) | |
| # Plot the total number of tests over time. | |
| sns.scatterplot(x=days, y=cumu_tests) | |
| plt.show() | |
| # Count the number of detected cases over time. | |
| cumu_detects = [] | |
| total_detected = 0 | |
| prev_total_tests = 0 | |
| ratio_detecteds = [] | |
| for (day, total_tests) in enumerate(cumu_tests): | |
| new_detected = sum(pop_dis_status[prev_total_tests:total_tests]) | |
| ratio_detected = new_detected / (total_tests - prev_total_tests) | |
| ratio_detecteds.append(ratio_detected) | |
| total_detected += new_detected | |
| cumu_detects.append(total_detected) | |
| prev_total_tests = total_tests | |
| print( | |
| f"Day: {day} Detected: {total_detected} Proportion: {ratio_detected:.2f}", | |
| flush=True, | |
| ) | |
| # Plot the total number of detected cases over time. | |
| sns.scatterplot(x=days, y=cumu_detects) | |
| plt.show() | |
| # Plot the ratio of detected cases per day. | |
| sns.scatterplot(x=days, y=ratio_detecteds) | |
| plt.show() | |
| def spreading_infection(): | |
| # Infect the population. | |
| full_pop = 1_000_000 | |
| num_infected = 1000 # 0.1% of the population. | |
| spread_rate = 0.02 | |
| # Determine the number of tests over time. | |
| tests_day_one = 1 | |
| new_test_r = 0.3 | |
| days = np.arange(14) | |
| cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int") | |
| cumu_detects = [] | |
| total_detected = 0 | |
| prev_total_tests = 0 | |
| ratio_detecteds = [] | |
| for (day, total_tests) in enumerate(cumu_tests): | |
| pop_size = full_pop - total_detected | |
| pop = np.random.rand(pop_size) | |
| prop_infected = (num_infected - total_detected) / len(pop) | |
| pop_dis_status = pop > (1.0 - prop_infected) | |
| new_tests = total_tests - prev_total_tests | |
| new_detected = sum(pop_dis_status[:new_tests]) | |
| ratio_detected = new_detected / new_tests | |
| ratio_detecteds.append(ratio_detected) | |
| total_detected += new_detected | |
| cumu_detects.append(total_detected) | |
| prev_total_tests = total_tests | |
| num_infected *= 1 + spread_rate | |
| sns.scatterplot(x=days, y=cumu_detects) | |
| plt.show() | |
| sns.scatterplot(x=days, y=ratio_detecteds) | |
| plt.show() | |
| def diff_countries(): | |
| # Different countries with different infection and testing rates. | |
| num_countries = 200 | |
| infection_rates = np.random.uniform(0.01, 0.1, num_countries) | |
| infection_rates_per_mil = 1_000_000 * infection_rates | |
| # Range from: https://ourworldindata.org/covid-testing#per-capita-tests-by-country | |
| testing_rates = np.random.uniform(10 / 1e6, 10000 / 1e6, num_countries) | |
| testing_rates_per_mil = 1_000_000 * testing_rates | |
| detected_rates = testing_rates * infection_rates | |
| detected_rates_per_mil = 1_000_000 * detected_rates | |
| sns.scatterplot(x=np.log10(detected_rates_per_mil), y=np.log(testing_rates_per_mil)) | |
| plt.show() | |
| def states(): | |
| url = "https://covidtracking.com/api/states/daily.csv" | |
| df_states = pd.read_csv(url).sort_values("date") | |
| df_states["population"] = df_states["state"].apply( | |
| lambda state: state_pops.get(state, -1) | |
| ) | |
| lag = 1 | |
| multi_plot = False | |
| (rows, cols) = (10, 5) | |
| if multi_plot: | |
| (fig, axs) = plt.subplots(rows, cols) | |
| else: | |
| fig = plt.figure(figsize=(10, 6)) | |
| pos_or_death = "Death" | |
| tot_or_pos = "Total" if pos_or_death == "Positive" else "Positive" | |
| state_test_rates = [] | |
| state_death_rates = [] | |
| max_per_capita_death = {} | |
| for (state_idx, state) in enumerate(state_pops): | |
| df_state = df_states[df_states["state"] == state] | |
| keep = df_state[df_state["total"] > 1000] | |
| inc_tests_rate_state = ( | |
| keep["total"].iloc[lag:].values / keep["total"].iloc[:-lag].values | |
| ) | |
| state_test_rates.append(inc_tests_rate_state.mean()) | |
| inc_deaths_rate_state = ( | |
| keep["death"].iloc[lag:].values / keep["death"].iloc[:-lag].values | |
| ) | |
| state_death_rates.append(inc_deaths_rate_state.mean()) | |
| (row, col) = (state_idx // cols, state_idx % cols) | |
| tests_per_mil = 1_000_000 * df_state[tot_or_pos.lower()] / state_pops[state] | |
| res_per_mil = 1_000_000 * df_state[pos_or_death.lower()] / state_pops[state] | |
| ax = axs[row][col] if multi_plot else None | |
| sns.scatterplot( | |
| x=tests_per_mil, | |
| y=res_per_mil, | |
| marker=markers[state_idx % len(markers)], | |
| color=colors[state_idx % len(colors)], | |
| ax=ax, | |
| ) | |
| if multi_plot: | |
| ax.set_xlabel("") | |
| ax.set_ylabel("") | |
| max_per_capita_death[state] = ( | |
| 1_000_000 | |
| * df_state["deathIncrease"].rolling(7).mean().max() | |
| / state_pops[state] | |
| ) | |
| fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15)) | |
| if not multi_plot: | |
| plt.xlabel(f"{tot_or_pos}s per million") | |
| plt.ylabel(f"{pos_or_death}s per million") | |
| plt.show() | |
| max_per_capita_death = pd.DataFrame( | |
| [ | |
| {"state": state, "death": death} | |
| for (state, death) in max_per_capita_death.items() | |
| ] | |
| ) | |
| max_per_capita_death = max_per_capita_death.sort_values("death", ascending=False) | |
| sns.barplot(x="state", y="death", data=max_per_capita_death) | |
| plt.show() | |
| state_death_rates = np.array(state_death_rates) | |
| is_nans = np.isnan(state_death_rates) | |
| state_death_rates = state_death_rates[~is_nans] | |
| state_test_rates = np.array(state_test_rates) | |
| state_test_rates = state_test_rates[~is_nans] | |
| df_states["per_positive"] = ( | |
| df_states["positiveIncrease"] / df_states["totalTestResultsIncrease"] | |
| ) | |
| fig = plt.figure(figsize=(10, 6)) | |
| how_many = 45 | |
| for (state_idx, state) in enumerate(state_pops): | |
| df_state = df_states[df_states["state"] == state] | |
| sns.lineplot( | |
| x=np.arange(how_many), | |
| y=df_state["per_positive"].iloc[-how_many:], | |
| marker=markers[state_idx % len(markers)], | |
| ) | |
| fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15)) | |
| plt.ylabel("Proportion positives") | |
| plt.show() | |
| states = ["FL", "TX", "AZ"] | |
| lag = 60 | |
| fig = plt.figure(figsize=(10, 6)) | |
| for state in states: | |
| df_state = df_states[df_states["state"] == state] | |
| dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d")) | |
| avg_pos = df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean() | |
| avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean() | |
| avg_rate = avg_pos / avg_test | |
| sns.lineplot(x=dates.iloc[-lag:], y=avg_rate) | |
| plt.ylim((0, 0.35)) | |
| fig.legend(list(states), ncol=2, loc=(0.85, 0.15)) | |
| plt.show() | |
| # states = ["FL", "GA", "AL", "NC", "SC", "TX", "MS", "LA", "TN", "AZ"] | |
| states = ["FL", "GA", "AL", "NC", "SC"] | |
| lag = 60 | |
| fig = plt.figure(figsize=(10, 6)) | |
| for state in states: | |
| df_state = df_states[df_states["state"] == state] | |
| dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d")) | |
| avg_pos = ( | |
| 1_000_000 | |
| * df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean() | |
| / state_pops[state] | |
| ) | |
| sns.lineplot(x=dates.iloc[-lag:], y=avg_pos) | |
| # plt.ylim((0, 0.35)) | |
| fig.legend(list(states), ncol=2, loc=(0.85, 0.15)) | |
| plt.show() | |
| date_increase = df_states.groupby("date")["totalTestResultsIncrease"].sum() | |
| dates = [datetime.strptime(str(date), "%Y%m%d") for date in date_increase.index] | |
| sns.lineplot(x=dates[-lag:], y=date_increase.values[-lag:]) | |
| plt.show() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for state in states: | |
| df_state = df_states[df_states["state"] == state] | |
| dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d")) | |
| avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean() | |
| sns.lineplot(x=dates.iloc[-lag:], y=avg_test) | |
| fig.legend(list(states), ncol=2, loc=(0.85, 0.15)) | |
| plt.show() | |
| sns.scatterplot(x=df_state["total"], y=df_state["positive"]) | |
| plt.show() | |
| sns.scatterplot(x=df_state["positive"], y=df_state["death"]) | |
| plt.show() | |
| sns.lineplot(x=dates, y=df_state["positiveIncrease"]) | |
| plt.show() | |
| sns.lineplot(x=dates, y=df_state["deathIncrease"]) | |
| plt.show() | |
| df_al = df_states[df_states["state"] == "AL"] | |
| df_ny = df_states[df_states["state"] == "NY"] | |
| sns.scatterplot(df_ny.iloc[3:]["positive"].values, df_al["positive"].values) | |
| plt.show() | |
| al_diffs = df_al.iloc[1:]["positive"].values - df_al.iloc[:-1]["positive"].values | |
| ny_diffs = ( | |
| df_ny.iloc[3 + 1 :]["positive"].values - df_ny.iloc[3:-1]["positive"].values | |
| ) | |
| print(al_diffs / ny_diffs) | |
| df_state = df_states[df_states["state"] == "HI"] | |
| df_state["month_day"] = df_state["date"].astype("str").str[4:] | |
| sns.barplot(df_state["month_day"], df_state["positiveIncrease"]) | |
| plt.show() | |
| today = df_states["date"].max() | |
| df_today = df_states[df_states["date"] == today] | |
| print( | |
| df_today[["state", "deathIncrease"]].sort_values( | |
| "deathIncrease", ascending=False | |
| ) | |
| ) | |
| df_today["death_inc_per_capita"] = ( | |
| df_today["deathIncrease"] / df_today["population"] | |
| ) | |
| print( | |
| df_today[["state", "death_inc_per_capita"]].sort_values( | |
| "death_inc_per_capita", ascending=False | |
| ) | |
| ) | |
| (x, y, z) = ([], [], []) | |
| states = [] | |
| per_capita = True | |
| use_log = False | |
| log_func = np.log if use_log else lambda x: x | |
| for (state_idx, state) in enumerate(state_pops): | |
| df_state = df_today[df_states["state"] == state] | |
| if per_capita: | |
| # x.append((1_000_000 * df_state["positive"] / state_pops[state]).values[0]) | |
| x.append( | |
| (log_func(df_state["positive"]) / log_func(df_state["total"])).values[0] | |
| ) | |
| y.append((1_000_000 * df_state["death"] / state_pops[state]).values[0]) | |
| else: | |
| x.append( | |
| (log_func(df_state["positive"]) / log_func(df_state["total"])).values[0] | |
| ) | |
| y.append( | |
| (log_func(df_state["death"]) / log_func(df_state["total"])).values[0] | |
| ) | |
| z.append((1_000_000 * df_state["total"] / state_pops[state]).values[0]) | |
| states.append(state) | |
| scatter = sns.regplot(x, y) | |
| for (idx, val) in enumerate(x): | |
| scatter.text( | |
| val, | |
| y[idx], | |
| states[idx], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| plt.show() | |
| scatter = sns.regplot(z, y) | |
| for (idx, val) in enumerate(z): | |
| scatter.text( | |
| val, | |
| y[idx], | |
| states[idx], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| plt.show() | |
| scatter = sns.scatterplot(x=x, y=y, size=z) | |
| for (idx, val) in enumerate(x): | |
| scatter.text( | |
| val, | |
| y[idx], | |
| states[idx], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| plt.xlabel("Proportion positive") | |
| plt.ylabel("Deaths per million") | |
| plt.show() | |
| fig = plt.figure() | |
| ax = fig.add_subplot(111, projection="3d") | |
| ax.scatter(x, y, z) | |
| ax.set_xlabel("positive") | |
| ax.set_ylabel("death") | |
| ax.set_zlabel("total") | |
| plt.show() | |
| state = "FL" | |
| df_state = df_states[df_states["state"] == state] | |
| pos_len = 11 | |
| kernel = ( | |
| np.array(list(range(1, pos_len)) + [pos_len] + list(range(1, pos_len))[::-1]) | |
| / pos_len | |
| ) | |
| cases = df_state.iloc[-(2 * pos_len - 1) :]["positiveIncrease"].values | |
| tests = df_state.iloc[-(2 * pos_len - 1) :]["totalTestResultsIncrease"].values | |
| print(kernel @ cases / tests.sum()) | |
| def death_lags(): | |
| url = "https://covidtracking.com/api/states/daily.csv" | |
| df_states = pd.read_csv(url).sort_values("date") | |
| df_states["date"] = df_states["date"].apply(lambda x: str(x)) | |
| min_date = "20200525" | |
| max_lag = 30 | |
| multi_plot = True | |
| (rows, cols) = (10, 5) | |
| if multi_plot: | |
| (fig, axs) = plt.subplots(rows, cols) | |
| else: | |
| fig = plt.figure(figsize=(10, 6)) | |
| best_corrs = [] | |
| best_lags = [] | |
| legend = [] | |
| only_inc = False | |
| use_diff = False | |
| use_rank = False | |
| state2cor_lags = {} | |
| for (state_idx, state) in enumerate(state_pops): | |
| df_state = df_states[ | |
| (df_states["state"] == state) & (df_states["date"] >= min_date) | |
| ] | |
| if use_diff: | |
| avg_pos = df_state["positiveIncrease"].rolling(7).mean().diff().values[7:] | |
| avg_death = df_state["deathIncrease"].rolling(7).mean().diff().values[7:] | |
| else: | |
| avg_pos = df_state["positiveIncrease"].rolling(7).mean().values[6:] | |
| avg_death = df_state["deathIncrease"].rolling(7).mean().values[6:] | |
| if only_inc and (avg_death[0] > avg_death[-1]): | |
| continue | |
| corrs = [] | |
| state2cor_lags[state] = {"corrs": [], "lags": []} | |
| for lag in range(1, max_lag + 1): | |
| if use_rank: | |
| corr = stats.kendalltau(avg_pos[:-lag], avg_death[lag:]).correlation | |
| else: | |
| corr = stats.pearsonr(avg_pos[:-lag], avg_death[lag:])[0] | |
| corrs.append(corr) | |
| state2cor_lags[state]["corrs"].append(corr) | |
| state2cor_lags[state]["lags"].append(lag) | |
| best_lag = np.argmax(np.abs(corrs)) + 1 | |
| (row, col) = (state_idx // cols, state_idx % cols) | |
| ax = axs[row][col] if multi_plot else None | |
| sns.scatterplot( | |
| x=avg_pos[:-best_lag], | |
| y=avg_death[best_lag:], | |
| marker=markers[state_idx % len(markers)], | |
| color=colors[state_idx % len(colors)], | |
| ax=ax, | |
| ) | |
| best_lags.append(best_lag) | |
| best_corrs.append(corrs[best_lag - 1]) | |
| legend.append(f"{state}:{best_lag}:{corrs[best_lag - 1]:.2f}") | |
| fig.legend(list(legend), ncol=2, loc=(0.8, 0.15)) | |
| plt.show() | |
| sns.scatterplot(x=best_lags, y=best_corrs) | |
| plt.show() | |
| if multi_plot: | |
| (fig, axs) = plt.subplots(rows, cols) | |
| else: | |
| fig = plt.figure(figsize=(10, 6)) | |
| for (state_idx, state) in enumerate(state_pops): | |
| (row, col) = (state_idx // cols, state_idx % cols) | |
| ax = axs[row][col] if multi_plot else None | |
| sns.scatterplot( | |
| x=state2cor_lags[state]["lags"], | |
| y=state2cor_lags[state]["corrs"], | |
| marker=markers[state_idx % len(markers)], | |
| color=colors[state_idx % len(colors)], | |
| ax=ax, | |
| ) | |
| fig.legend(list(legend), ncol=2, loc=(0.8, 0.15)) | |
| plt.show() | |
| corrs = [] | |
| n = 30 | |
| for samp in range(10000): | |
| steps = 2 * np.random.randint(0, 1 + 1, (2, n)) - 1 | |
| positions = np.cumsum(steps, axis=1) | |
| corrs.append(stats.pearsonr(positions[0], positions[1])[0]) | |
| sns.scatterplot(x=positions[0], y=positions[1]) | |
| plt.show() | |
| sns.distplot(corrs) | |
| plt.show() | |
| def us(): | |
| total_pop_us = 328_239_523 | |
| url = "https://covidtracking.com/api/us/daily.csv" | |
| df_us = pd.read_csv(url).sort_values("date") | |
| lag = 1 | |
| new_pos = df_us["positive"].iloc[lag:].values - df_us["positive"].iloc[:-lag].values | |
| new_tests = df_us["total"].iloc[lag:].values - df_us["total"].iloc[:-lag].values | |
| new_deaths = df_us["death"].iloc[lag:].values - df_us["death"].iloc[:-lag].values | |
| inc_tests_rate_us = ( | |
| df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values | |
| ) | |
| print(inc_tests_rate_us[-13:].mean()) | |
| inc_deaths_rate_us = ( | |
| df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values | |
| ) | |
| print(inc_deaths_rate_us[-13:].mean()) | |
| sns.scatterplot(x=np.arange(len(new_tests)), y=new_tests) | |
| # Plot the actual total number of cases over time. | |
| sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"]) | |
| plt.show() | |
| sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["positive"])) | |
| plt.show() | |
| sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"] / total_pop_us) | |
| plt.show() | |
| # Plot the actual total number of tests over time. | |
| sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"]) | |
| plt.show() | |
| sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["total"])) | |
| plt.show() | |
| sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"] / total_pop_us) | |
| plt.show() | |
| # Plot the actual ratio of detected cases per day. | |
| sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests) | |
| plt.show() | |
| # Plot the actual number of tests vs. the number of cases. | |
| sns.regplot(df_us["total"], df_us["positive"]) | |
| plt.show() | |
| sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["positive"])) | |
| plt.show() | |
| # Plot the actual number of tests vs. the number of deaths. | |
| sns.regplot(df_us["total"], df_us["death"]) | |
| plt.show() | |
| sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["death"])) | |
| plt.show() | |
| # Plot the actual number of new tests vs. the number of new deaths. | |
| sns.scatterplot(x=new_tests, y=new_deaths) | |
| plt.show() | |
| sns.scatterplot(x=np.log10(new_tests), y=np.log10(new_deaths)) | |
| plt.show() | |
| # Plot the actual number of cases vs. the number of deaths. | |
| sns.scatterplot(x=df_us["positive"], y=df_us["death"]) | |
| plt.show() | |
| sns.scatterplot(x=np.log10(df_us["positive"]), y=np.log10(df_us["death"])) | |
| plt.show() | |
| def it(): | |
| total_pop_it = 60_317_546 | |
| url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv" | |
| df_it = pd.read_csv(url) | |
| dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True) | |
| df_it["month_day"] = dates[1] + dates[2] | |
| lag = 1 | |
| inc_tests_rate_it = ( | |
| df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values | |
| ) | |
| print(inc_tests_rate_it[-13:].mean()) | |
| inc_deaths_rate_it = ( | |
| df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values | |
| ) | |
| print(inc_deaths_rate_it[-13:].mean()) | |
| # Plot the actual total number of cases over time. | |
| sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"]) | |
| plt.show() | |
| sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["totale_casi"])) | |
| plt.show() | |
| sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"] / total_pop_it) | |
| plt.show() | |
| # Plot the actual total number of tests over time. | |
| sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"]) | |
| plt.show() | |
| sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["tamponi"])) | |
| plt.show() | |
| sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"] / total_pop_it) | |
| plt.show() | |
| # Plot the actual ratio of detected cases per day. | |
| new_pos = ( | |
| df_it["totale_casi"].iloc[lag:].values - df_it["totale_casi"].iloc[:-lag].values | |
| ) | |
| new_tests = df_it["tamponi"].iloc[lag:].values - df_it["tamponi"].iloc[:-lag].values | |
| sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests) | |
| plt.show() | |
| # Plot the ratio of new deaths to new cases over time. | |
| new_deaths = ( | |
| df_it["deceduti"].iloc[lag:].values - df_it["deceduti"].iloc[:-lag].values | |
| ) | |
| sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=new_deaths / new_pos) | |
| plt.show() | |
| # Plot the ratio of of positive cases increase rate vs. death increase rate. | |
| case_rate_inc = ( | |
| df_it["totale_casi"].iloc[lag:].values / df_it["totale_casi"].iloc[:-lag].values | |
| ) | |
| death_rate_inc = ( | |
| df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values | |
| ) | |
| sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=case_rate_inc / death_rate_inc) | |
| plt.show() | |
| def lom_and_ven(): | |
| total_pop_lom = 10_088_484 | |
| total_pop_ven = 4_865_380 | |
| url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv" | |
| df_regions = pd.read_csv(url) | |
| dates = df_regions["data"].str.split("T").str.get(0).str.split("-", expand=True) | |
| df_regions["month_day"] = dates[1] + dates[2] | |
| fig = plt.figure(figsize=(10, 6)) | |
| pos_or_death = "Death" | |
| test_or_pos = "Positive" if pos_or_death == "Death" else "Test" | |
| if pos_or_death == "Positive": | |
| cat_it = "totale_casi" | |
| else: | |
| cat_it = "deceduti" | |
| regions = ["Lombardia", "Veneto"] | |
| pops = {"Lombardia": total_pop_lom, "Veneto": total_pop_ven} | |
| for region in regions: | |
| df_region = df_regions[df_regions["denominazione_regione"] == region] | |
| tests_per_mil = 1_000_000 * df_region["tamponi"].values / pops[region] | |
| res_per_mil = 1_000_000 * df_region[cat_it].values / pops[region] | |
| scatter_it = sns.scatterplot(x=tests_per_mil, y=res_per_mil) | |
| for (idx, val) in enumerate(tests_per_mil): | |
| scatter_it.text( | |
| val, | |
| res_per_mil[idx], | |
| df_region.iloc[idx]["month_day"].split("T")[0], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| fig.legend(regions, loc=(0.8, 0.25)) | |
| plt.xlabel(f"{test_or_pos}s per million") | |
| plt.ylabel(f"{pos_or_death}s per million") | |
| plt.show() | |
| def us_vs_it(): | |
| lag = 1 | |
| total_pop_us = 328_239_523 | |
| total_pop_it = 60_317_546 | |
| url = "https://covidtracking.com/api/us/daily.csv" | |
| df_us = pd.read_csv(url).sort_values("date") | |
| inc_tests_rate_us = ( | |
| df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values | |
| ) | |
| print(inc_tests_rate_us[-13:].mean()) | |
| inc_deaths_rate_us = ( | |
| df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values | |
| ) | |
| print(inc_deaths_rate_us[-13:].mean()) | |
| url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv" | |
| df_it = pd.read_csv(url) | |
| dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True) | |
| df_it["month_day"] = dates[1] + dates[2] | |
| inc_tests_rate_it = ( | |
| df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values | |
| ) | |
| inc_deaths_rate_it = ( | |
| df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values | |
| ) | |
| threshold_us = 150_000 | |
| keep = df_us[df_us["total"] > threshold_us] | |
| sns.scatterplot(keep["total"], keep["death"]) | |
| plt.show() | |
| sns.scatterplot( | |
| 1_000_000 * keep["total"] / total_pop_us, | |
| 1_000_000 * keep["positive"] / total_pop_us, | |
| ) | |
| plt.show() | |
| threshold_it = 50000 | |
| keep = df_it[df_it["tamponi"] >= threshold_it] | |
| sns.scatterplot( | |
| 1_000_000 * keep["tamponi"] / total_pop_it, | |
| 1_000_000 * keep["deceduti"] / total_pop_it, | |
| ) | |
| plt.show() | |
| sns.scatterplot( | |
| 1_000_000 * keep["tamponi"] / total_pop_it, | |
| 1_000_000 * keep["totale_casi"] / total_pop_it, | |
| ) | |
| plt.show() | |
| # (A, B) = np.polyfit(np.log(keep["tamponi"]), keep["totale_casi"], 1) | |
| # keep["pred_totale_casi"] = A * np.log(keep["tamponi"]) + B | |
| # sns.scatterplot(keep["tamponi"], keep["totale_casi"]) | |
| # sns.scatterplot(keep["tamponi"], keep["pred_totale_casi"]) | |
| # plt.show() | |
| pos_or_death = "Positive" | |
| if pos_or_death == "Positive": | |
| (cat_it, cat_us) = ("totale_casi", "positive") | |
| else: | |
| (cat_it, cat_us) = ("deceduti", "death") | |
| fig = plt.figure(figsize=(10, 6)) | |
| keep = df_it[df_it["tamponi"] >= threshold_it] | |
| tests_per_mil = 1_000_000 * keep["tamponi"].values / total_pop_it | |
| res_per_mil = 1_000_000 * keep[cat_it].values / total_pop_it | |
| scatter_it = sns.scatterplot(tests_per_mil, res_per_mil) | |
| for (idx, val) in enumerate(tests_per_mil): | |
| scatter_it.text( | |
| val, | |
| res_per_mil[idx], | |
| keep.iloc[idx]["month_day"].split("T")[0], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| keep = df_us[df_us["total"] > threshold_us] | |
| tests_per_mil = 1_000_000 * keep["total"].values / total_pop_us | |
| res_per_mil = 1_000_000 * keep[cat_us].values / total_pop_us | |
| scatter_us = sns.scatterplot(tests_per_mil, res_per_mil) | |
| for (idx, val) in enumerate(tests_per_mil): | |
| scatter_us.text( | |
| val, | |
| res_per_mil[idx], | |
| str(keep.iloc[idx]["date"])[4:], | |
| horizontalalignment="left", | |
| verticalalignment="top", | |
| size="small", | |
| color="black", | |
| ) | |
| fig.legend(["Italy", "U.S."], loc=(0.8, 0.15)) | |
| plt.xlabel("Tests per million") | |
| plt.ylabel(f"{pos_or_death}s per million") | |
| plt.show() | |
| start_cases = 250 | |
| steps = 13 | |
| rate_1 = inc_tests_rate_it[-steps:].mean() | |
| rate_2 = inc_deaths_rate_us[-steps:].mean() | |
| x = threshold_it * rate_1 ** np.arange(steps) | |
| y = start_cases * rate_2 ** np.arange(steps) | |
| sns.regplot(x, y) | |
| plt.show() | |
| rate_1 = inc_tests_rate_us[-steps:].mean() | |
| rate_2 = inc_deaths_rate_it[-steps:].mean() | |
| x = threshold_us * rate_1 ** np.arange(steps) | |
| y = start_cases * rate_2 ** np.arange(steps) | |
| sns.regplot(x, y) | |
| plt.show() | |
| def total_mortality(): | |
| # CSV from: https://gis.cdc.gov/grasp/fluview/mortality.html. | |
| df_totals = pd.read_csv("National_2015-20_Data.csv") | |
| df_totals["TOTAL DEATHS"] = ( | |
| df_totals["TOTAL DEATHS"].str.replace(",", "").astype("int") | |
| ) | |
| seasons = list(set(df_totals["SEASON"])) | |
| seasons.sort() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for season in seasons: | |
| df_season = df_totals[df_totals["SEASON"] == season] | |
| sns.scatterplot(df_season["WEEK"], df_season["TOTAL DEATHS"]) | |
| fig.legend(list(seasons), loc=(0.85, 0.15)) | |
| plt.xticks(1 + np.arange(52)) | |
| plt.show() | |
| def counties(): | |
| url = ( | |
| "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv" | |
| ) | |
| df_counties = pd.read_csv(url) | |
| today = df_counties["date"].max() | |
| df_counties_today = df_counties[df_counties["date"] == today].sort_values( | |
| "deaths", ascending=False | |
| ) | |
| total_cases = df_counties_today["cases"].sum() | |
| total_deaths = df_counties_today["deaths"].sum() | |
| df_counties_today["per_deaths"] = df_counties_today["deaths"] / total_deaths | |
| df_counties_today["per_cases"] = df_counties_today["cases"] / total_cases | |
| df_counties_today["over_rep"] = ( | |
| df_counties_today["per_deaths"] - df_counties_today["per_cases"] | |
| ) | |
| df_counties_today = df_counties_today.sort_values("over_rep", ascending=False) | |
| df_counties_today["over_rep_rat"] = ( | |
| df_counties_today["per_deaths"] / df_counties_today["per_cases"] | |
| ) | |
| df_counties_today = df_counties_today.sort_values("over_rep_rat", ascending=False) | |
| df_counties_today = df_counties_today.sort_values("deaths", ascending=False) | |
| sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"]) | |
| plt.show() | |
| ny_nj_ct = df_counties_today[ | |
| (df_counties_today["state"] == "New York") | |
| | (df_counties_today["state"] == "New Jersey") | |
| | (df_counties_today["state"] == "Connecitcut") | |
| ] | |
| total_cases_ny_nj_ct = ny_nj_ct["cases"].sum() | |
| total_deaths_ny_nj_ct = ny_nj_ct["deaths"].sum() | |
| print(total_cases_ny_nj_ct / df_counties_today["cases"].sum()) | |
| print(total_deaths_ny_nj_ct / df_counties_today["deaths"].sum()) | |
| nyc_csa_deaths = df_counties_today[df_counties_today["county"] == "New York City"][ | |
| "deaths" | |
| ].iloc[0] | |
| nyc_csa_cases = df_counties_today[df_counties_today["county"] == "New York City"][ | |
| "cases" | |
| ].iloc[0] | |
| for fips in all_nyc_fips: | |
| df_fips = df_counties_today[df_counties_today["fips"] == fips] | |
| print(df_fips) | |
| nyc_csa_deaths += df_fips["deaths"].iloc[0] | |
| nyc_csa_cases += df_fips["cases"].iloc[0] | |
| nyc_csa_per_deaths = nyc_csa_deaths / df_counties_today["deaths"].sum() | |
| nyc_csa_per_cases = nyc_csa_cases / df_counties_today["cases"].sum() | |
| df_counties_today = df_counties_today.append( | |
| { | |
| "county": "NYC MSA", | |
| "per_deaths": nyc_csa_per_deaths, | |
| "per_cases": nyc_csa_per_cases, | |
| }, | |
| ignore_index=True, | |
| ) | |
| sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"]) | |
| plt.show() | |
| print(nyc_csa_per_deaths) | |
| print(nyc_csa_per_cases) | |
| def surge_counties(): | |
| url = ( | |
| "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv" | |
| ) | |
| df_counties = pd.read_csv(url) | |
| df_counties.sort_values(["fips", "date"], inplace=True) | |
| df_counties["new_cases"] = df_counties["cases"].diff() | |
| mask = df_counties["fips"] != df_counties["fips"].shift(1) | |
| df_counties["new_cases"][mask] = np.nan | |
| df_counties["new_deaths"] = df_counties["deaths"].diff() | |
| df_counties["new_deaths"][mask] = np.nan | |
| today = df_counties["date"].max() | |
| df_counties_today = df_counties[df_counties["date"] == today] | |
| df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1) | |
| df_counties_today["fips"] = df_counties_today["fips"].apply( | |
| lambda fips: str(int(fips)).zfill(5) | |
| ) | |
| url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv" | |
| df_population = pd.read_csv(url_population, encoding="ISO-8859-1") | |
| df_population["STATE"] = df_population["STATE"].apply( | |
| lambda state_fips: str(state_fips).zfill(2) | |
| ) | |
| df_population["COUNTY"] = df_population["COUNTY"].apply( | |
| lambda county_fips: str(county_fips).zfill(3) | |
| ) | |
| df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"] | |
| combined = df_counties_today.merge( | |
| df_population[["FIPS", "POPESTIMATE2019"]], | |
| left_on="fips", | |
| right_on="FIPS", | |
| how="left", | |
| ) | |
| combined["new_cases_per_mil"] = ( | |
| 1000000 * combined["new_cases"] / combined["POPESTIMATE2019"] | |
| ) | |
| combined["new_deaths_per_mil"] = ( | |
| 1000000 * combined["new_deaths"] / combined["POPESTIMATE2019"] | |
| ) | |
| combined[ | |
| [ | |
| "county", | |
| "state", | |
| "new_cases_per_mil", | |
| "new_deaths_per_mil", | |
| "POPESTIMATE2019", | |
| ] | |
| ].dropna().to_csv("surging.csv", index=False) | |
| def ca_counties(): | |
| url = "https://covidtracking.com/api/states/daily.csv" | |
| df_states = pd.read_csv(url).sort_values("date") | |
| county2data = {} | |
| region_pops = {} | |
| df_ca = pd.read_csv("ca-counties.csv") | |
| for (row_idx, row) in df_ca.iterrows(): | |
| region = row["region"] | |
| population = row["population"] | |
| county2data[row["county"]] = {"region": region, "population": population} | |
| region_pops[region] = region_pops.get(region, 0) + int( | |
| population.replace(",", "") | |
| ) | |
| url = ( | |
| "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv" | |
| ) | |
| df_counties = pd.read_csv(url) | |
| df_ca_counties = df_counties[df_counties["state"] == "California"] | |
| df_ca_counties["region"] = df_ca_counties["county"].apply( | |
| lambda county: county2data.get(county, {"region": "None"})["region"] | |
| ) | |
| df_ca_counties = df_ca_counties[df_ca_counties["region"] != "None"] | |
| df_ca_counties = df_ca_counties[df_ca_counties["date"] >= "2020-05-01"] | |
| df_ca_counties["date"] = df_ca_counties["date"] | |
| df_ca_counties.sort_values(["county", "date"], inplace=True) | |
| df_ca_counties["date"] = df_ca_counties["date"].apply( | |
| lambda x: datetime.strptime(str(x), "%Y-%m-%d") | |
| ) | |
| df_ca_counties["new_cases"] = df_ca_counties["cases"].diff() | |
| mask = df_ca_counties["county"] != df_ca_counties["county"].shift(1) | |
| df_ca_counties["new_cases"][mask] = np.nan | |
| df_ca_counties["new_deaths"] = df_ca_counties["deaths"].diff() | |
| df_ca_counties["new_deaths"][mask] = np.nan | |
| region_new_cases = ( | |
| df_ca_counties.groupby(["date", "region"])["new_cases"].sum().reset_index() | |
| ) | |
| region_new_deaths = ( | |
| df_ca_counties.groupby(["date", "region"])["new_deaths"].sum().reset_index() | |
| ) | |
| region_new_deaths["population"] = region_new_deaths["region"].apply( | |
| lambda region: region_pops[region] | |
| ) | |
| region_new_deaths["per_mil"] = ( | |
| 1_000_000 * region_new_deaths["new_deaths"] / region_new_deaths["population"] | |
| ) | |
| fig = plt.figure(figsize=(10, 6)) | |
| ca_regions = list(region_pops) | |
| for region in ca_regions: | |
| df_region = region_new_deaths[region_new_deaths["region"] == region] | |
| sns.lineplot(x=df_region["date"], y=df_region["per_mil"].rolling(7).mean()) | |
| state_pops = {"AZ": 7_278_717, "FL": 21_477_737} | |
| keep_states = list(state_pops) | |
| for state in keep_states: | |
| df_state = df_states[df_states["state"] == state] | |
| df_state = df_state[df_state["date"] >= 20_200_501] | |
| df_state["date"] = df_state["date"].apply( | |
| lambda x: datetime.strptime(str(x), "%Y%m%d") | |
| ) | |
| df_state["per_mil"] = 1_000_000 * df_state["deathIncrease"] / state_pops[state] | |
| sns.lineplot(x=df_state["date"], y=df_state["per_mil"].rolling(7).mean()) | |
| fig.legend(ca_regions + keep_states, loc=(0.85, 0.15)) | |
| plt.show() | |
| def nursing_homes(): | |
| # ZIP to FIPS from: https://data.world/niccolley/us-zipcode-to-county-state#. | |
| f_zip_fips = "ZIP-COUNTY-FIPS_2017-06.csv" | |
| df_zip_fips = pd.read_csv(f_zip_fips) | |
| df_zip_fips["ZIP"] = df_zip_fips["ZIP"].apply( | |
| lambda zip_code: str(zip_code).zfill(5) | |
| ) | |
| df_zip_fips["STCOUNTYFP"] = df_zip_fips["STCOUNTYFP"].apply( | |
| lambda zip_code: str(zip_code).zfill(5) | |
| ) | |
| zip2fips = dict(zip(df_zip_fips["ZIP"], df_zip_fips["STCOUNTYFP"])) | |
| # Nursing home data. | |
| url_nursing_homes = "https://opendata.arcgis.com/datasets/78c58035fb3942ba82af991bb4476f13_0.csv?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D" | |
| df_nursing_homes = pd.read_csv(url_nursing_homes) | |
| df_nursing_homes["ZIP"] = df_nursing_homes["ZIP"].apply( | |
| lambda zip_code: str(zip_code).zfill(5) | |
| ) | |
| df_nursing_homes["FIPS"] = df_nursing_homes["ZIP"].apply( | |
| lambda zip_code: zip2fips.get(zip_code, "") | |
| ) | |
| print(len(df_nursing_homes)) | |
| # Remove rows with missing ZIP codes. | |
| df_nursing_homes = df_nursing_homes[df_nursing_homes["FIPS"] != ""] | |
| print(len(df_nursing_homes)) | |
| # Some rows are missing beds data (i.e., coded as -999). Impute the missing data | |
| # using the median number of beds per nursing home for rows without missing data. | |
| median_beds = df_nursing_homes[df_nursing_homes["BEDS"] > 0]["BEDS"].median() | |
| df_nursing_homes.loc[df_nursing_homes["BEDS"] < 0, "BEDS"] = median_beds | |
| # Get nursing home data per county. | |
| beds_by_fips = df_nursing_homes.groupby("FIPS", as_index=False)["BEDS"].sum() | |
| homes_by_fips = pd.DataFrame(df_nursing_homes["FIPS"].value_counts()) | |
| homes_by_fips.columns = ["HOMES"] | |
| homes_by_fips["FIPS"] = homes_by_fips.index | |
| # County COVID-19 data. | |
| url_counties = ( | |
| "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv" | |
| ) | |
| df_counties = pd.read_csv(url_counties) | |
| today = df_counties["date"].max() | |
| df_counties_today = df_counties[df_counties["date"] == today].sort_values( | |
| "deaths", ascending=False | |
| ) | |
| df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1) | |
| df_counties_today["fips"] = df_counties_today["fips"].apply( | |
| lambda fips: str(int(fips)).zfill(5) | |
| ) | |
| # Combine with population data. | |
| url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv" | |
| df_population = pd.read_csv(url_population, encoding="ISO-8859-1") | |
| df_population["STATE"] = df_population["STATE"].apply( | |
| lambda state_fips: str(state_fips).zfill(2) | |
| ) | |
| df_population["COUNTY"] = df_population["COUNTY"].apply( | |
| lambda county_fips: str(county_fips).zfill(3) | |
| ) | |
| df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"] | |
| combined = df_counties_today.merge( | |
| df_population[["FIPS", "POPESTIMATE2019"]], | |
| left_on="fips", | |
| right_on="FIPS", | |
| how="left", | |
| ) | |
| # Combine with beds data. | |
| combined = combined.merge(beds_by_fips, left_on="fips", right_on="FIPS", how="left") | |
| # Fill in counties without nursing home data. | |
| combined["BEDS"] = combined["BEDS"].replace(np.nan, 0) | |
| # Combine with homes data. | |
| combined = combined.merge( | |
| homes_by_fips, left_on="fips", right_on="FIPS", how="left" | |
| ) | |
| # Fill in counties without nursing home data. | |
| combined["HOMES"] = combined["HOMES"].replace(np.nan, 0) | |
| # New York City FIPS: https://data.ny.gov/Government-Finance/NY-Municipalities-and-County-FIPS-codes/79vr-2kdi. | |
| nyc_fips = {"36005", "36047", "36061", "36081", "36085"} | |
| nyc_beds = nyc_homes = nyc_pop = 0 | |
| for fips in nyc_fips: | |
| nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0] | |
| nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0] | |
| nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[ | |
| 0 | |
| ] | |
| combined.loc[combined["county"] == "New York City", "BEDS"] = nyc_beds | |
| combined.loc[combined["county"] == "New York City", "HOMES"] = nyc_homes | |
| combined.loc[combined["county"] == "New York City", "POPESTIMATE2019"] = nyc_pop | |
| # Add ZIP codes. | |
| fips2zip = {f: z for (z, f) in zip2fips.items()} | |
| combined["zip"] = combined["fips"].apply(lambda fips: fips2zip.get(fips, fips)) | |
| # Add latitudes and longitudes. | |
| nomi = pgeocode.Nominatim("us") | |
| (combined["lat"], combined["long"]) = zip( | |
| *combined["zip"].map( | |
| lambda zip_code: tuple( | |
| nomi.query_postal_code(zip_code)[["latitude", "longitude"]] | |
| ) | |
| ) | |
| ) | |
| combined.loc[combined["county"] == "New York City", "lat"] = 40.7128 | |
| combined.loc[combined["county"] == "New York City", "long"] = -74.0060 | |
| # Exclude locations without latitude/longitude coordinates. | |
| print(len(combined)) | |
| final = combined[~combined["lat"].isnull()] | |
| print(len(final)) | |
| # No obvious relationship. | |
| sns.scatterplot(final["BEDS"], final["deaths"]) | |
| plt.show() | |
| sns.scatterplot(final["HOMES"], final["deaths"]) | |
| plt.show() | |
| sns.scatterplot(final["cases"], final["deaths"]) | |
| plt.show() | |
| # Add New York MSA. | |
| nyc_beds = final[final["county"] == "New York City"]["BEDS"].iloc[0] | |
| nyc_homes = final[final["county"] == "New York City"]["HOMES"].iloc[0] | |
| nyc_pop = final[final["county"] == "New York City"]["POPESTIMATE2019"].iloc[0] | |
| nyc_cases = final[final["county"] == "New York City"]["cases"].iloc[0] | |
| nyc_deaths = final[final["county"] == "New York City"]["deaths"].iloc[0] | |
| for fips in nyc_msa_fips: | |
| fips = str(fips).zfill(5) | |
| nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0] | |
| nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0] | |
| nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[ | |
| 0 | |
| ] | |
| nyc_cases += df_counties_today[df_counties_today["fips"] == fips]["cases"].iloc[ | |
| 0 | |
| ] | |
| nyc_deaths += df_counties_today[df_counties_today["fips"] == fips][ | |
| "deaths" | |
| ].iloc[0] | |
| final = final.append( | |
| { | |
| "county": "NYC MSA", | |
| "cases": nyc_cases, | |
| "deaths": nyc_deaths, | |
| "BEDS": nyc_beds, | |
| "HOMES": nyc_homes, | |
| "POPESTIMATE2019": nyc_pop, | |
| "lat": 40.7128, | |
| "long": -74.0060, | |
| }, | |
| ignore_index=True, | |
| ) | |
| scale = 500 | |
| final["deaths_per_mil"] = 1_000_000 * final["deaths"] / final["POPESTIMATE2019"] | |
| which_zs = ["POPESTIMATE2019", "BEDS", "HOMES", "cases", "deaths", "deaths_per_mil"] | |
| for which_z in which_zs: | |
| m = Basemap( | |
| llcrnrlon=-119, | |
| llcrnrlat=22, | |
| urcrnrlon=-64, | |
| urcrnrlat=49, | |
| projection="lcc", | |
| lat_1=33, | |
| lat_2=45, | |
| lon_0=-95, | |
| ) | |
| m.drawmapboundary() | |
| m.drawcoastlines() | |
| (x, y) = m(list(final["long"]), list(final["lat"])) | |
| z = final[which_z] | |
| log_z = np.log(z + 1) | |
| norm = matplotlib.colors.Normalize( | |
| vmin=log_z.min(), vmax=log_z.max(), clip=True | |
| ) | |
| mapper = cm.ScalarMappable(norm=norm, cmap=cm.coolwarm) | |
| colors = mapper.to_rgba(log_z) | |
| z = z / z.max() | |
| m.scatter(x, y, c=colors, s=scale * z, alpha=0.5) | |
| plt.show() | |
| def maricopa_mobility(): | |
| df = pd.read_csv("Global_Mobility_Report.csv") | |
| df_az = df[df["sub_region_1"] == "Arizona"] | |
| df_mari = df_az[df_az["sub_region_2"] == "Maricopa County"] | |
| df_mari["date"] = df_mari["date"].apply( | |
| lambda x: datetime.strptime(str(x), "%Y-%m-%d") | |
| ) | |
| mobility_cols = [ | |
| "retail_and_recreation_percent_change_from_baseline", | |
| "grocery_and_pharmacy_percent_change_from_baseline", | |
| "parks_percent_change_from_baseline", | |
| "transit_stations_percent_change_from_baseline", | |
| "workplaces_percent_change_from_baseline", | |
| "residential_percent_change_from_baseline", | |
| ] | |
| (rows, cols) = (2, 3) | |
| (fig, axs) = plt.subplots(rows, cols) | |
| for (col_idx, mobility_col) in enumerate(mobility_cols): | |
| (row, col) = (col_idx // cols, col_idx % cols) | |
| sns.lineplot(df_mari["date"], df_mari[mobility_col], ax=axs[row][col]) | |
| plt.show() | |
| def excess(): | |
| url = "https://data.cdc.gov/api/views/xkkf-xrst/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target=" | |
| df = pd.read_csv(url) | |
| state_pop_mils = { | |
| "Florida": 21.48, | |
| "Texas": 29, | |
| "New Jersey": 8.882, | |
| "New York": 11.051, | |
| "New York City": 8.339, | |
| } | |
| state_excesses = {} | |
| for state in state_pop_mils: | |
| df_state = df[df["State"] == state] | |
| lower_excess = df_state["Total Excess Lower Estimate in 2020"].max() | |
| higher_excess = df_state["Total Excess Higher Estimate in 2020"].max() | |
| excess = (lower_excess + higher_excess) / 2 | |
| state_excesses[state] = excess | |
| for (state, excess) in state_excesses.items(): | |
| print(state) | |
| print(excess / state_pop_mils[state]) | |
| all_ny_excess = state_excesses["New York"] + state_excesses["New York City"] | |
| all_ny_pop_mil = state_pop_mils["New York"] + state_pop_mils["New York City"] | |
| print("New York with New York City") | |
| print(all_ny_excess / all_ny_pop_mil) | |
| nyc_deaths = df[ | |
| (df["State"] == "New York City") | |
| & (df["Week Ending Date"] >= "2020") | |
| & (df["Type"] == "Predicted (weighted)") | |
| & (df["Outcome"] == "All causes") | |
| ][["Average Expected Count", "Observed Number"]].values | |
| print((nyc_deaths[:, 1] - nyc_deaths[:, 0]).sum()) | |
| lows = df.groupby("State")["Total Excess Lower Estimate in 2020"].max() | |
| highs = df.groupby("State")["Total Excess Higher Estimate in 2020"].max() | |
| avgs = (lows + highs) / 2 | |
| print(avgs.sum() - avgs[avgs.index == "United States"]) | |
| def canada(): | |
| url = "https://raw.githubusercontent.com/ishaberry/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv" | |
| df = pd.read_csv(url) | |
| df_on = df[df["province"] == "Ontario"] | |
| dates = df_on["date_testing"].apply(lambda x: datetime.strptime(str(x), "%d-%m-%Y")) | |
| sns.lineplot(dates, df_on["testing"].rolling(7).mean()) | |
| plt.show() | |
| def age_deaths(): | |
| url = "https://data.cdc.gov/api/views/vsak-wrfu/rows.csv?accessType=DOWNLOAD" | |
| df = pd.read_csv(url) | |
| age_groups = [ | |
| "Under 1 year", | |
| "1-4 years", | |
| "5-14 years", | |
| "15-24 years", | |
| "25-34 years", | |
| "35-44 years", | |
| "45-54 years", | |
| "55-64 years", | |
| "65-74 years", | |
| "75-84 years", | |
| "85 years and over", | |
| ] | |
| df_age_sex = df.groupby(["MMWR Week", "Age Group"])[ | |
| ["Total Deaths", "COVID-19 Deaths"] | |
| ].sum() | |
| df_age_sex = df_age_sex.reset_index() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for age_group in age_groups: | |
| df_age = df_age_sex[df_age_sex["Age Group"] == age_group] | |
| sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"]) | |
| fig.legend(age_groups, loc=(0.85, 0.15)) | |
| plt.show() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for age_group in age_groups: | |
| df_age = df_age_sex[df_age_sex["Age Group"] == age_group] | |
| sns.lineplot( | |
| df_age["MMWR Week"], df_age["COVID-19 Deaths"] / df_age["Total Deaths"] | |
| ) | |
| fig.legend(age_groups, loc=(0.85, 0.15)) | |
| plt.show() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for age_group in age_groups[3:]: | |
| df_age = df_age_sex[df_age_sex["Age Group"] == age_group] | |
| max_deaths = df_age["COVID-19 Deaths"].max() | |
| sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"] / max_deaths) | |
| fig.legend(age_groups[3:], loc=(0.85, 0.15)) | |
| plt.show() | |
| df_age_totals = df_age_sex.groupby("Age Group")["COVID-19 Deaths"].sum().to_dict() | |
| fig = plt.figure(figsize=(10, 6)) | |
| for age_group in age_groups: | |
| df_age = df_age_sex[df_age_sex["Age Group"] == age_group] | |
| cum_totals = df_age["COVID-19 Deaths"].cumsum() | |
| sns.lineplot(df_age["MMWR Week"], cum_totals / df_age_totals[age_group]) | |
| fig.legend(age_groups, loc=(0.85, 0.15)) | |
| plt.show() | |
| def death_risk(): | |
| url = "https://data.cdc.gov/api/views/y5bj-9g5w/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target=" | |
| df = pd.read_csv(url) | |
| comp_states = [ | |
| "New York City", | |
| "New York", | |
| "New Jersey", | |
| "Louisiana", | |
| "Massachusetts", | |
| "Mississippi", | |
| "Florida", | |
| "Alabama", | |
| ] | |
| comp_years = [2019, 2020] | |
| comp_age_groups = [ | |
| "25-44 years", | |
| "65-74 years", | |
| "75-84 years", | |
| "85 years and older", | |
| ] | |
| comp_age_groups = df["Age Group"].unique() | |
| data = [] | |
| for comp_state in comp_states: | |
| print(f"\nJurisdiction: {comp_state}") | |
| row = {"state": comp_state} | |
| for comp_age_group in comp_age_groups: | |
| print(f"\nAge Group: {comp_age_group}") | |
| death_data = [] | |
| for comp_year in comp_years: | |
| deaths = df[ | |
| (df["Jurisdiction"] == comp_state) | |
| & (df["Year"] == comp_year) | |
| & (df["Week Ending Date"] < "10") | |
| & (df["Age Group"] == comp_age_group) | |
| & (df["Type"] == "Unweighted") | |
| ]["Number of Deaths"].sum() | |
| if comp_state == "New York": | |
| deaths += df[ | |
| (df["Jurisdiction"] == "New York City") | |
| & (df["Year"] == comp_year) | |
| & (df["Week Ending Date"] < "10") | |
| & (df["Age Group"] == comp_age_group) | |
| & (df["Type"] == "Unweighted") | |
| ]["Number of Deaths"].sum() | |
| death_data.append(deaths) | |
| print(f"{comp_year}: {int(deaths)}") | |
| per_inc = 100 * (death_data[1] / death_data[0] - 1) | |
| col = f"{comp_age_group}" | |
| row[col] = per_inc | |
| print(f"Percent Increase: {per_inc:.2f}") | |
| data.append(row) | |
| df_deaths = pd.DataFrame(data) | |
| df_melted = df_deaths.melt( | |
| "state", var_name="age_group", value_name="deaths % inc." | |
| ) | |
| sns.barplot(x="state", y="deaths % inc.", hue="age_group", data=df_melted) | |
| plt.show() | |
| def obesity(): | |
| us_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_United%20States%20of%20America.csv" | |
| de_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Germany.csv" | |
| vn_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Viet%20Nam.csv" | |
| urls = [us_url, de_url, vn_url] | |
| dfs = [] | |
| for url in urls: | |
| dfs.append(pd.read_csv(url, encoding="ISO-8859-1")) | |
| df = pd.concat(dfs) | |
| keep_columns = [ | |
| column | |
| for column in df.columns | |
| if (("Prevalence" in column) and ("interval" not in column)) | |
| ][2:] | |
| country_col = "Country/Region/World" | |
| keep_columns.append(country_col) | |
| df_2016 = df[(df["Sex"] == "Men") & (df["Year"] == 2016)][keep_columns] | |
| cols = {"bmi": [], "percent": [], "country": []} | |
| for (row_idx, row) in df_2016.iterrows(): | |
| country = row[country_col] | |
| for col in keep_columns: | |
| if col == country_col: | |
| continue | |
| bmi_str = re.search("BMI.*m²", col).group() | |
| bmi_str = bmi_str.replace("BMI ", "") | |
| bmi_str = bmi_str.replace("BMI", "") | |
| bmi_str = bmi_str.replace(" kg/m²", "") | |
| cols["bmi"].append(bmi_str) | |
| cols["percent"].append(row[col]) | |
| cols["country"].append(country) | |
| df = pd.DataFrame(cols) | |
| sns.barplot(x="bmi", y="percent", hue="country", data=df) | |
| plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment