airalcorn2 · November 18, 2020 05:38
diff --git a/covid19.py b/covid19.py
 import matplotlib
 import matplotlib.cm as cm
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pgeocode
 import re
 import seaborn as sns

 from datetime import datetime

 # from mpl_toolkits.basemap import Basemap
 from mpl_toolkits.mplot3d import Axes3D
 from scipy import stats

 nyc_msa_fips = {
    36119: "Westchester",
    34003: "Bergen",
    34017: "Hudson",
    34031: "Passaic",
    36079: "Putnam",
    36087: "Rockland",
    36103: "Suffolk",
    36059: "Nassau",
    34023: "Middlesex",
    34025: "Monmouth",
    34029: "Ocean",
    34035: "Somerset",
    34013: "Essex",
    34039: "Union",
    34027: "Morris",
    34037: "Sussex",
    34019: "Hunterdon",
    42103: "Pike",
 }
 nyc_csa_fips = {
    9001: "Fairfield",
    9009: "New Haven",
    34021: "Mercer",
    9005: "Litchfield",
    36111: "Ulster",
    42089: "Monroe",
    36027: "Dutchess",
    36071: "Orange",
 }
 all_nyc_fips = set(nyc_msa_fips) | set(nyc_csa_fips)
 state_pops = {
    "AL": 4_903_185,
    "AK": 731_545,
    "AZ": 7_278_717,
    "AR": 3_017_825,
    "CA": 39_512_223,
    "CO": 5_758_736,
    "CT": 3_565_287,
    "DE": 973_764,
    "FL": 21_477_737,
    "GA": 10_617_423,
    "HI": 1_415_872,
    "ID": 1_787_065,
    "IL": 12_671_821,
    "IN": 6_732_219,
    "IA": 3_155_070,
    "KS": 2_913_314,
    "KY": 4_467_673,
    "LA": 4_648_794,
    "ME": 1_344_212,
    "MD": 6_045_680,
    "MA": 6_949_503,
    "MI": 9_986_857,
    "MN": 5_639_632,
    "MS": 2_976_149,
    "MO": 6_137_428,
    "MT": 1_068_778,
    "NE": 1_934_408,
    "NV": 3_080_156,
    "NH": 1_359_711,
    "NJ": 8_882_190,
    "NM": 2_096_829,
    "NY": 19_453_561,
    "NC": 10_488_084,
    "ND": 762_062,
    "OH": 11_689_100,
    "OK": 3_956_971,
    "OR": 4_217_737,
    "PA": 12_801_989,
    "RI": 1_059_361,
    "SC": 5_148_714,
    "SD": 884_659,
    "TN": 6_833_174,
    "TX": 28_995_881,
    "UT": 3_205_958,
    "VT": 623_989,
    "VA": 8_535_519,
    "WA": 7_614_893,
    "WV": 1_792_147,
    "WI": 5_822_434,
    "WY": 578_759,
 }
 markers = ".,ov^<>12348spP*hH+xXDd|_"
 colors = "bgrcmyk"


 def fixed_infection():
    # Infect the population.
    pop_size = 1_000_000
    pop = np.random.rand(pop_size)
    prop_infected = 0.01
    pop_dis_status = pop > (1.0 - prop_infected)
    print(pop_dis_status[:100])
    print(sum(pop_dis_status) / pop_size)

    # Determine the number of tests over time.
    tests_day_one = 1
    new_test_r = 0.3
    days = np.arange(14)
    cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int")
    print(cumu_tests)

    # Plot the total number of tests over time.
    sns.scatterplot(x=days, y=cumu_tests)
    plt.show()

    # Count the number of detected cases over time.
    cumu_detects = []
    total_detected = 0
    prev_total_tests = 0
    ratio_detecteds = []
    for (day, total_tests) in enumerate(cumu_tests):
        new_detected = sum(pop_dis_status[prev_total_tests:total_tests])
        ratio_detected = new_detected / (total_tests - prev_total_tests)
        ratio_detecteds.append(ratio_detected)
        total_detected += new_detected
        cumu_detects.append(total_detected)
        prev_total_tests = total_tests
        print(
            f"Day: {day} Detected: {total_detected} Proportion: {ratio_detected:.2f}",
            flush=True,
        )

    # Plot the total number of detected cases over time.
    sns.scatterplot(x=days, y=cumu_detects)
    plt.show()

    # Plot the ratio of detected cases per day.
    sns.scatterplot(x=days, y=ratio_detecteds)
    plt.show()


 def spreading_infection():
    # Infect the population.
    full_pop = 1_000_000
    num_infected = 1000  # 0.1% of the population.
    spread_rate = 0.02

    # Determine the number of tests over time.
    tests_day_one = 1
    new_test_r = 0.3
    days = np.arange(14)
    cumu_tests = np.array(tests_day_one * (1.0 + new_test_r) ** days, dtype="int")

    cumu_detects = []
    total_detected = 0
    prev_total_tests = 0
    ratio_detecteds = []
    for (day, total_tests) in enumerate(cumu_tests):

        pop_size = full_pop - total_detected
        pop = np.random.rand(pop_size)
        prop_infected = (num_infected - total_detected) / len(pop)
        pop_dis_status = pop > (1.0 - prop_infected)

        new_tests = total_tests - prev_total_tests
        new_detected = sum(pop_dis_status[:new_tests])
        ratio_detected = new_detected / new_tests
        ratio_detecteds.append(ratio_detected)
        total_detected += new_detected
        cumu_detects.append(total_detected)
        prev_total_tests = total_tests

        num_infected *= 1 + spread_rate

    sns.scatterplot(x=days, y=cumu_detects)
    plt.show()

    sns.scatterplot(x=days, y=ratio_detecteds)
    plt.show()


 def diff_countries():
    # Different countries with different infection and testing rates.

    num_countries = 200
    infection_rates = np.random.uniform(0.01, 0.1, num_countries)
    infection_rates_per_mil = 1_000_000 * infection_rates
    # Range from: https://ourworldindata.org/covid-testing#per-capita-tests-by-country
    testing_rates = np.random.uniform(10 / 1e6, 10000 / 1e6, num_countries)
    testing_rates_per_mil = 1_000_000 * testing_rates
    detected_rates = testing_rates * infection_rates
    detected_rates_per_mil = 1_000_000 * detected_rates

    sns.scatterplot(x=np.log10(detected_rates_per_mil), y=np.log(testing_rates_per_mil))
    plt.show()


 def states():
    url = "https://covidtracking.com/api/states/daily.csv"
    df_states = pd.read_csv(url).sort_values("date")
    df_states["population"] = df_states["state"].apply(
        lambda state: state_pops.get(state, -1)
    )

    lag = 1

    multi_plot = False
    (rows, cols) = (10, 5)
    if multi_plot:
        (fig, axs) = plt.subplots(rows, cols)
    else:
        fig = plt.figure(figsize=(10, 6))

    pos_or_death = "Death"
    tot_or_pos = "Total" if pos_or_death == "Positive" else "Positive"
    state_test_rates = []
    state_death_rates = []
    max_per_capita_death = {}
    for (state_idx, state) in enumerate(state_pops):
        df_state = df_states[df_states["state"] == state]
        keep = df_state[df_state["total"] > 1000]
        inc_tests_rate_state = (
            keep["total"].iloc[lag:].values / keep["total"].iloc[:-lag].values
        )
        state_test_rates.append(inc_tests_rate_state.mean())
        inc_deaths_rate_state = (
            keep["death"].iloc[lag:].values / keep["death"].iloc[:-lag].values
        )
        state_death_rates.append(inc_deaths_rate_state.mean())

        (row, col) = (state_idx // cols, state_idx % cols)
        tests_per_mil = 1_000_000 * df_state[tot_or_pos.lower()] / state_pops[state]
        res_per_mil = 1_000_000 * df_state[pos_or_death.lower()] / state_pops[state]
        ax = axs[row][col] if multi_plot else None
        sns.scatterplot(
            x=tests_per_mil,
            y=res_per_mil,
            marker=markers[state_idx % len(markers)],
            color=colors[state_idx % len(colors)],
            ax=ax,
        )
        if multi_plot:
            ax.set_xlabel("")
            ax.set_ylabel("")

        max_per_capita_death[state] = (
            1_000_000
            * df_state["deathIncrease"].rolling(7).mean().max()
            / state_pops[state]
        )

    fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15))
    if not multi_plot:
        plt.xlabel(f"{tot_or_pos}s per million")
        plt.ylabel(f"{pos_or_death}s per million")

    plt.show()

    max_per_capita_death = pd.DataFrame(
        [
            {"state": state, "death": death}
            for (state, death) in max_per_capita_death.items()
        ]
    )
    max_per_capita_death = max_per_capita_death.sort_values("death", ascending=False)
    sns.barplot(x="state", y="death", data=max_per_capita_death)
    plt.show()

    state_death_rates = np.array(state_death_rates)
    is_nans = np.isnan(state_death_rates)
    state_death_rates = state_death_rates[~is_nans]
    state_test_rates = np.array(state_test_rates)
    state_test_rates = state_test_rates[~is_nans]

    df_states["per_positive"] = (
        df_states["positiveIncrease"] / df_states["totalTestResultsIncrease"]
    )
    fig = plt.figure(figsize=(10, 6))
    how_many = 45
    for (state_idx, state) in enumerate(state_pops):
        df_state = df_states[df_states["state"] == state]
        sns.lineplot(
            x=np.arange(how_many),
            y=df_state["per_positive"].iloc[-how_many:],
            marker=markers[state_idx % len(markers)],
        )

    fig.legend(list(state_pops), ncol=2, loc=(0.85, 0.15))
    plt.ylabel("Proportion positives")
    plt.show()

    states = ["FL", "TX", "AZ"]
    lag = 60
    fig = plt.figure(figsize=(10, 6))
    for state in states:
        df_state = df_states[df_states["state"] == state]
        dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))

        avg_pos = df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean()
        avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean()
        avg_rate = avg_pos / avg_test
        sns.lineplot(x=dates.iloc[-lag:], y=avg_rate)

    plt.ylim((0, 0.35))
    fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
    plt.show()

    # states = ["FL", "GA", "AL", "NC", "SC", "TX", "MS", "LA", "TN", "AZ"]
    states = ["FL", "GA", "AL", "NC", "SC"]
    lag = 60
    fig = plt.figure(figsize=(10, 6))
    for state in states:
        df_state = df_states[df_states["state"] == state]
        dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))

        avg_pos = (
            1_000_000
            * df_state["positiveIncrease"].iloc[-lag:].rolling(7).mean()
            / state_pops[state]
        )
        sns.lineplot(x=dates.iloc[-lag:], y=avg_pos)

    # plt.ylim((0, 0.35))
    fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
    plt.show()

    date_increase = df_states.groupby("date")["totalTestResultsIncrease"].sum()
    dates = [datetime.strptime(str(date), "%Y%m%d") for date in date_increase.index]
    sns.lineplot(x=dates[-lag:], y=date_increase.values[-lag:])
    plt.show()

    fig = plt.figure(figsize=(10, 6))
    for state in states:
        df_state = df_states[df_states["state"] == state]
        dates = df_state["date"].apply(lambda x: datetime.strptime(str(x), "%Y%m%d"))
        avg_test = df_state["totalTestResultsIncrease"].iloc[-lag:].rolling(7).mean()
        sns.lineplot(x=dates.iloc[-lag:], y=avg_test)

    fig.legend(list(states), ncol=2, loc=(0.85, 0.15))
    plt.show()

    sns.scatterplot(x=df_state["total"], y=df_state["positive"])
    plt.show()
    sns.scatterplot(x=df_state["positive"], y=df_state["death"])
    plt.show()

    sns.lineplot(x=dates, y=df_state["positiveIncrease"])
    plt.show()
    sns.lineplot(x=dates, y=df_state["deathIncrease"])
    plt.show()

    df_al = df_states[df_states["state"] == "AL"]
    df_ny = df_states[df_states["state"] == "NY"]
    sns.scatterplot(df_ny.iloc[3:]["positive"].values, df_al["positive"].values)
    plt.show()
    al_diffs = df_al.iloc[1:]["positive"].values - df_al.iloc[:-1]["positive"].values
    ny_diffs = (
        df_ny.iloc[3 + 1 :]["positive"].values - df_ny.iloc[3:-1]["positive"].values
    )
    print(al_diffs / ny_diffs)

    df_state = df_states[df_states["state"] == "HI"]
    df_state["month_day"] = df_state["date"].astype("str").str[4:]
    sns.barplot(df_state["month_day"], df_state["positiveIncrease"])
    plt.show()

    today = df_states["date"].max()
    df_today = df_states[df_states["date"] == today]
    print(
        df_today[["state", "deathIncrease"]].sort_values(
            "deathIncrease", ascending=False
        )
    )
    df_today["death_inc_per_capita"] = (
        df_today["deathIncrease"] / df_today["population"]
    )
    print(
        df_today[["state", "death_inc_per_capita"]].sort_values(
            "death_inc_per_capita", ascending=False
        )
    )
    (x, y, z) = ([], [], [])
    states = []
    per_capita = True
    use_log = False
    log_func = np.log if use_log else lambda x: x
    for (state_idx, state) in enumerate(state_pops):
        df_state = df_today[df_states["state"] == state]
        if per_capita:
            # x.append((1_000_000 * df_state["positive"] / state_pops[state]).values[0])
            x.append(
                (log_func(df_state["positive"]) / log_func(df_state["total"])).values[0]
            )
            y.append((1_000_000 * df_state["death"] / state_pops[state]).values[0])
        else:
            x.append(
                (log_func(df_state["positive"]) / log_func(df_state["total"])).values[0]
            )
            y.append(
                (log_func(df_state["death"]) / log_func(df_state["total"])).values[0]
            )

        z.append((1_000_000 * df_state["total"] / state_pops[state]).values[0])
        states.append(state)

    scatter = sns.regplot(x, y)
    for (idx, val) in enumerate(x):
        scatter.text(
            val,
            y[idx],
            states[idx],
            horizontalalignment="left",
            verticalalignment="top",
            size="small",
            color="black",
        )

    plt.show()

    scatter = sns.regplot(z, y)
    for (idx, val) in enumerate(z):
        scatter.text(
            val,
            y[idx],
            states[idx],
            horizontalalignment="left",
            verticalalignment="top",
            size="small",
            color="black",
        )

    plt.show()

    scatter = sns.scatterplot(x=x, y=y, size=z)
    for (idx, val) in enumerate(x):
        scatter.text(
            val,
            y[idx],
            states[idx],
            horizontalalignment="left",
            verticalalignment="top",
            size="small",
            color="black",
        )

    plt.xlabel("Proportion positive")
    plt.ylabel("Deaths per million")
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(x, y, z)
    ax.set_xlabel("positive")
    ax.set_ylabel("death")
    ax.set_zlabel("total")

    plt.show()

    state = "FL"
    df_state = df_states[df_states["state"] == state]
    pos_len = 11
    kernel = (
        np.array(list(range(1, pos_len)) + [pos_len] + list(range(1, pos_len))[::-1])
        / pos_len
    )
    cases = df_state.iloc[-(2 * pos_len - 1) :]["positiveIncrease"].values
    tests = df_state.iloc[-(2 * pos_len - 1) :]["totalTestResultsIncrease"].values
    print(kernel @ cases / tests.sum())


 def death_lags():
    url = "https://covidtracking.com/api/states/daily.csv"
    df_states = pd.read_csv(url).sort_values("date")
    df_states["date"] = df_states["date"].apply(lambda x: str(x))
    min_date = "20200525"
    max_lag = 30

    multi_plot = True
    (rows, cols) = (10, 5)
    if multi_plot:
        (fig, axs) = plt.subplots(rows, cols)
    else:
        fig = plt.figure(figsize=(10, 6))

    best_corrs = []
    best_lags = []
    legend = []
    only_inc = False
    use_diff = False
    use_rank = False
    state2cor_lags = {}
    for (state_idx, state) in enumerate(state_pops):
        df_state = df_states[
            (df_states["state"] == state) & (df_states["date"] >= min_date)
        ]
        if use_diff:
            avg_pos = df_state["positiveIncrease"].rolling(7).mean().diff().values[7:]
            avg_death = df_state["deathIncrease"].rolling(7).mean().diff().values[7:]
        else:
            avg_pos = df_state["positiveIncrease"].rolling(7).mean().values[6:]
            avg_death = df_state["deathIncrease"].rolling(7).mean().values[6:]

        if only_inc and (avg_death[0] > avg_death[-1]):
            continue

        corrs = []
        state2cor_lags[state] = {"corrs": [], "lags": []}
        for lag in range(1, max_lag + 1):
            if use_rank:
                corr = stats.kendalltau(avg_pos[:-lag], avg_death[lag:]).correlation
            else:
                corr = stats.pearsonr(avg_pos[:-lag], avg_death[lag:])[0]

            corrs.append(corr)
            state2cor_lags[state]["corrs"].append(corr)
            state2cor_lags[state]["lags"].append(lag)

        best_lag = np.argmax(np.abs(corrs)) + 1
        (row, col) = (state_idx // cols, state_idx % cols)
        ax = axs[row][col] if multi_plot else None
        sns.scatterplot(
            x=avg_pos[:-best_lag],
            y=avg_death[best_lag:],
            marker=markers[state_idx % len(markers)],
            color=colors[state_idx % len(colors)],
            ax=ax,
        )
        best_lags.append(best_lag)
        best_corrs.append(corrs[best_lag - 1])
        legend.append(f"{state}:{best_lag}:{corrs[best_lag - 1]:.2f}")

    fig.legend(list(legend), ncol=2, loc=(0.8, 0.15))
    plt.show()

    sns.scatterplot(x=best_lags, y=best_corrs)
    plt.show()

    if multi_plot:
        (fig, axs) = plt.subplots(rows, cols)
    else:
        fig = plt.figure(figsize=(10, 6))

    for (state_idx, state) in enumerate(state_pops):
        (row, col) = (state_idx // cols, state_idx % cols)
        ax = axs[row][col] if multi_plot else None
        sns.scatterplot(
            x=state2cor_lags[state]["lags"],
            y=state2cor_lags[state]["corrs"],
            marker=markers[state_idx % len(markers)],
            color=colors[state_idx % len(colors)],
            ax=ax,
        )

    fig.legend(list(legend), ncol=2, loc=(0.8, 0.15))
    plt.show()

    corrs = []
    n = 30
    for samp in range(10000):
        steps = 2 * np.random.randint(0, 1 + 1, (2, n)) - 1
        positions = np.cumsum(steps, axis=1)
        corrs.append(stats.pearsonr(positions[0], positions[1])[0])

    sns.scatterplot(x=positions[0], y=positions[1])
    plt.show()

    sns.distplot(corrs)
    plt.show()


 def us():
    total_pop_us = 328_239_523
    url = "https://covidtracking.com/api/us/daily.csv"
    df_us = pd.read_csv(url).sort_values("date")

    lag = 1
    new_pos = df_us["positive"].iloc[lag:].values - df_us["positive"].iloc[:-lag].values
    new_tests = df_us["total"].iloc[lag:].values - df_us["total"].iloc[:-lag].values
    new_deaths = df_us["death"].iloc[lag:].values - df_us["death"].iloc[:-lag].values
    inc_tests_rate_us = (
        df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values
    )
    print(inc_tests_rate_us[-13:].mean())
    inc_deaths_rate_us = (
        df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values
    )
    print(inc_deaths_rate_us[-13:].mean())

    sns.scatterplot(x=np.arange(len(new_tests)), y=new_tests)

    # Plot the actual total number of cases over time.
    sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"])
    plt.show()
    sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["positive"]))
    plt.show()
    sns.scatterplot(x=np.arange(len(df_us)), y=df_us["positive"] / total_pop_us)
    plt.show()

    # Plot the actual total number of tests over time.
    sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"])
    plt.show()
    sns.scatterplot(x=np.arange(len(df_us)), y=np.log10(df_us["total"]))
    plt.show()
    sns.scatterplot(x=np.arange(len(df_us)), y=df_us["total"] / total_pop_us)
    plt.show()

    # Plot the actual ratio of detected cases per day.
    sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests)
    plt.show()

    # Plot the actual number of tests vs. the number of cases.
    sns.regplot(df_us["total"], df_us["positive"])
    plt.show()
    sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["positive"]))
    plt.show()

    # Plot the actual number of tests vs. the number of deaths.
    sns.regplot(df_us["total"], df_us["death"])
    plt.show()
    sns.scatterplot(x=np.log10(df_us["total"]), y=np.log10(df_us["death"]))
    plt.show()

    # Plot the actual number of new tests vs. the number of new deaths.
    sns.scatterplot(x=new_tests, y=new_deaths)
    plt.show()
    sns.scatterplot(x=np.log10(new_tests), y=np.log10(new_deaths))
    plt.show()

    # Plot the actual number of cases vs. the number of deaths.
    sns.scatterplot(x=df_us["positive"], y=df_us["death"])
    plt.show()
    sns.scatterplot(x=np.log10(df_us["positive"]), y=np.log10(df_us["death"]))
    plt.show()


 def it():
    total_pop_it = 60_317_546
    url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
    df_it = pd.read_csv(url)

    dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True)
    df_it["month_day"] = dates[1] + dates[2]
    lag = 1
    inc_tests_rate_it = (
        df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values
    )
    print(inc_tests_rate_it[-13:].mean())
    inc_deaths_rate_it = (
        df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
    )
    print(inc_deaths_rate_it[-13:].mean())

    # Plot the actual total number of cases over time.
    sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"])
    plt.show()
    sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["totale_casi"]))
    plt.show()
    sns.scatterplot(x=df_it["month_day"], y=df_it["totale_casi"] / total_pop_it)
    plt.show()

    # Plot the actual total number of tests over time.
    sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"])
    plt.show()
    sns.scatterplot(x=df_it["month_day"], y=np.log10(df_it["tamponi"]))
    plt.show()
    sns.scatterplot(x=df_it["month_day"], y=df_it["tamponi"] / total_pop_it)
    plt.show()

    # Plot the actual ratio of detected cases per day.
    new_pos = (
        df_it["totale_casi"].iloc[lag:].values - df_it["totale_casi"].iloc[:-lag].values
    )
    new_tests = df_it["tamponi"].iloc[lag:].values - df_it["tamponi"].iloc[:-lag].values
    sns.scatterplot(x=np.arange(len(new_pos)), y=new_pos / new_tests)
    plt.show()

    # Plot the ratio of new deaths to new cases over time.
    new_deaths = (
        df_it["deceduti"].iloc[lag:].values - df_it["deceduti"].iloc[:-lag].values
    )
    sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=new_deaths / new_pos)
    plt.show()

    # Plot the ratio of of positive cases increase rate vs. death increase rate.
    case_rate_inc = (
        df_it["totale_casi"].iloc[lag:].values / df_it["totale_casi"].iloc[:-lag].values
    )
    death_rate_inc = (
        df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
    )
    sns.scatterplot(x=df_it["month_day"].iloc[lag:], y=case_rate_inc / death_rate_inc)
    plt.show()


 def lom_and_ven():
    total_pop_lom = 10_088_484
    total_pop_ven = 4_865_380
    url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv"
    df_regions = pd.read_csv(url)

    dates = df_regions["data"].str.split("T").str.get(0).str.split("-", expand=True)
    df_regions["month_day"] = dates[1] + dates[2]

    fig = plt.figure(figsize=(10, 6))

    pos_or_death = "Death"
    test_or_pos = "Positive" if pos_or_death == "Death" else "Test"
    if pos_or_death == "Positive":
        cat_it = "totale_casi"
    else:
        cat_it = "deceduti"

    regions = ["Lombardia", "Veneto"]
    pops = {"Lombardia": total_pop_lom, "Veneto": total_pop_ven}
    for region in regions:
        df_region = df_regions[df_regions["denominazione_regione"] == region]
        tests_per_mil = 1_000_000 * df_region["tamponi"].values / pops[region]
        res_per_mil = 1_000_000 * df_region[cat_it].values / pops[region]
        scatter_it = sns.scatterplot(x=tests_per_mil, y=res_per_mil)

        for (idx, val) in enumerate(tests_per_mil):
            scatter_it.text(
                val,
                res_per_mil[idx],
                df_region.iloc[idx]["month_day"].split("T")[0],
                horizontalalignment="left",
                verticalalignment="top",
                size="small",
                color="black",
            )

    fig.legend(regions, loc=(0.8, 0.25))
    plt.xlabel(f"{test_or_pos}s per million")
    plt.ylabel(f"{pos_or_death}s per million")
    plt.show()


 def us_vs_it():
    lag = 1
    total_pop_us = 328_239_523
    total_pop_it = 60_317_546

    url = "https://covidtracking.com/api/us/daily.csv"
    df_us = pd.read_csv(url).sort_values("date")

    inc_tests_rate_us = (
        df_us["total"].iloc[lag:].values / df_us["total"].iloc[:-lag].values
    )
    print(inc_tests_rate_us[-13:].mean())
    inc_deaths_rate_us = (
        df_us["death"].iloc[lag:].values / df_us["death"].iloc[:-lag].values
    )
    print(inc_deaths_rate_us[-13:].mean())

    url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
    df_it = pd.read_csv(url)
    dates = df_it["data"].str.split("T").str.get(0).str.split("-", expand=True)
    df_it["month_day"] = dates[1] + dates[2]

    inc_tests_rate_it = (
        df_it["tamponi"].iloc[lag:].values / df_it["tamponi"].iloc[:-lag].values
    )
    inc_deaths_rate_it = (
        df_it["deceduti"].iloc[lag:].values / df_it["deceduti"].iloc[:-lag].values
    )

    threshold_us = 150_000

    keep = df_us[df_us["total"] > threshold_us]
    sns.scatterplot(keep["total"], keep["death"])
    plt.show()
    sns.scatterplot(
        1_000_000 * keep["total"] / total_pop_us,
        1_000_000 * keep["positive"] / total_pop_us,
    )
    plt.show()

    threshold_it = 50000
    keep = df_it[df_it["tamponi"] >= threshold_it]
    sns.scatterplot(
        1_000_000 * keep["tamponi"] / total_pop_it,
        1_000_000 * keep["deceduti"] / total_pop_it,
    )
    plt.show()
    sns.scatterplot(
        1_000_000 * keep["tamponi"] / total_pop_it,
        1_000_000 * keep["totale_casi"] / total_pop_it,
    )
    plt.show()

    # (A, B) = np.polyfit(np.log(keep["tamponi"]), keep["totale_casi"], 1)
    # keep["pred_totale_casi"] = A * np.log(keep["tamponi"]) + B
    # sns.scatterplot(keep["tamponi"], keep["totale_casi"])
    # sns.scatterplot(keep["tamponi"], keep["pred_totale_casi"])
    # plt.show()

    pos_or_death = "Positive"
    if pos_or_death == "Positive":
        (cat_it, cat_us) = ("totale_casi", "positive")
    else:
        (cat_it, cat_us) = ("deceduti", "death")

    fig = plt.figure(figsize=(10, 6))
    keep = df_it[df_it["tamponi"] >= threshold_it]
    tests_per_mil = 1_000_000 * keep["tamponi"].values / total_pop_it
    res_per_mil = 1_000_000 * keep[cat_it].values / total_pop_it
    scatter_it = sns.scatterplot(tests_per_mil, res_per_mil)

    for (idx, val) in enumerate(tests_per_mil):
        scatter_it.text(
            val,
            res_per_mil[idx],
            keep.iloc[idx]["month_day"].split("T")[0],
            horizontalalignment="left",
            verticalalignment="top",
            size="small",
            color="black",
        )

    keep = df_us[df_us["total"] > threshold_us]
    tests_per_mil = 1_000_000 * keep["total"].values / total_pop_us
    res_per_mil = 1_000_000 * keep[cat_us].values / total_pop_us
    scatter_us = sns.scatterplot(tests_per_mil, res_per_mil)

    for (idx, val) in enumerate(tests_per_mil):
        scatter_us.text(
            val,
            res_per_mil[idx],
            str(keep.iloc[idx]["date"])[4:],
            horizontalalignment="left",
            verticalalignment="top",
            size="small",
            color="black",
        )

    fig.legend(["Italy", "U.S."], loc=(0.8, 0.15))
    plt.xlabel("Tests per million")
    plt.ylabel(f"{pos_or_death}s per million")
    plt.show()

    start_cases = 250
    steps = 13
    rate_1 = inc_tests_rate_it[-steps:].mean()
    rate_2 = inc_deaths_rate_us[-steps:].mean()
    x = threshold_it * rate_1 ** np.arange(steps)
    y = start_cases * rate_2 ** np.arange(steps)
    sns.regplot(x, y)
    plt.show()

    rate_1 = inc_tests_rate_us[-steps:].mean()
    rate_2 = inc_deaths_rate_it[-steps:].mean()
    x = threshold_us * rate_1 ** np.arange(steps)
    y = start_cases * rate_2 ** np.arange(steps)
    sns.regplot(x, y)
    plt.show()


 def total_mortality():
    # CSV from: https://gis.cdc.gov/grasp/fluview/mortality.html.
    df_totals = pd.read_csv("National_2015-20_Data.csv")
    df_totals["TOTAL DEATHS"] = (
        df_totals["TOTAL DEATHS"].str.replace(",", "").astype("int")
    )
    seasons = list(set(df_totals["SEASON"]))
    seasons.sort()
    fig = plt.figure(figsize=(10, 6))
    for season in seasons:
        df_season = df_totals[df_totals["SEASON"] == season]
        sns.scatterplot(df_season["WEEK"], df_season["TOTAL DEATHS"])

    fig.legend(list(seasons), loc=(0.85, 0.15))
    plt.xticks(1 + np.arange(52))
    plt.show()


 def counties():
    url = (
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
    )
    df_counties = pd.read_csv(url)
    today = df_counties["date"].max()
    df_counties_today = df_counties[df_counties["date"] == today].sort_values(
        "deaths", ascending=False
    )
    total_cases = df_counties_today["cases"].sum()
    total_deaths = df_counties_today["deaths"].sum()
    df_counties_today["per_deaths"] = df_counties_today["deaths"] / total_deaths
    df_counties_today["per_cases"] = df_counties_today["cases"] / total_cases
    df_counties_today["over_rep"] = (
        df_counties_today["per_deaths"] - df_counties_today["per_cases"]
    )
    df_counties_today = df_counties_today.sort_values("over_rep", ascending=False)
    df_counties_today["over_rep_rat"] = (
        df_counties_today["per_deaths"] / df_counties_today["per_cases"]
    )
    df_counties_today = df_counties_today.sort_values("over_rep_rat", ascending=False)
    df_counties_today = df_counties_today.sort_values("deaths", ascending=False)
    sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"])
    plt.show()

    ny_nj_ct = df_counties_today[
        (df_counties_today["state"] == "New York")
        | (df_counties_today["state"] == "New Jersey")
        | (df_counties_today["state"] == "Connecitcut")
    ]
    total_cases_ny_nj_ct = ny_nj_ct["cases"].sum()
    total_deaths_ny_nj_ct = ny_nj_ct["deaths"].sum()

    print(total_cases_ny_nj_ct / df_counties_today["cases"].sum())
    print(total_deaths_ny_nj_ct / df_counties_today["deaths"].sum())

    nyc_csa_deaths = df_counties_today[df_counties_today["county"] == "New York City"][
        "deaths"
    ].iloc[0]
    nyc_csa_cases = df_counties_today[df_counties_today["county"] == "New York City"][
        "cases"
    ].iloc[0]
    for fips in all_nyc_fips:
        df_fips = df_counties_today[df_counties_today["fips"] == fips]
        print(df_fips)
        nyc_csa_deaths += df_fips["deaths"].iloc[0]
        nyc_csa_cases += df_fips["cases"].iloc[0]

    nyc_csa_per_deaths = nyc_csa_deaths / df_counties_today["deaths"].sum()
    nyc_csa_per_cases = nyc_csa_cases / df_counties_today["cases"].sum()

    df_counties_today = df_counties_today.append(
        {
            "county": "NYC MSA",
            "per_deaths": nyc_csa_per_deaths,
            "per_cases": nyc_csa_per_cases,
        },
        ignore_index=True,
    )
    sns.scatterplot(df_counties_today["per_cases"], df_counties_today["per_deaths"])
    plt.show()

    print(nyc_csa_per_deaths)
    print(nyc_csa_per_cases)


 def surge_counties():
    url = (
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
    )
    df_counties = pd.read_csv(url)
    df_counties.sort_values(["fips", "date"], inplace=True)

    df_counties["new_cases"] = df_counties["cases"].diff()
    mask = df_counties["fips"] != df_counties["fips"].shift(1)
    df_counties["new_cases"][mask] = np.nan

    df_counties["new_deaths"] = df_counties["deaths"].diff()
    df_counties["new_deaths"][mask] = np.nan

    today = df_counties["date"].max()
    df_counties_today = df_counties[df_counties["date"] == today]

    df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1)
    df_counties_today["fips"] = df_counties_today["fips"].apply(
        lambda fips: str(int(fips)).zfill(5)
    )

    url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
    df_population = pd.read_csv(url_population, encoding="ISO-8859-1")
    df_population["STATE"] = df_population["STATE"].apply(
        lambda state_fips: str(state_fips).zfill(2)
    )
    df_population["COUNTY"] = df_population["COUNTY"].apply(
        lambda county_fips: str(county_fips).zfill(3)
    )
    df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"]

    combined = df_counties_today.merge(
        df_population[["FIPS", "POPESTIMATE2019"]],
        left_on="fips",
        right_on="FIPS",
        how="left",
    )

    combined["new_cases_per_mil"] = (
        1000000 * combined["new_cases"] / combined["POPESTIMATE2019"]
    )
    combined["new_deaths_per_mil"] = (
        1000000 * combined["new_deaths"] / combined["POPESTIMATE2019"]
    )

    combined[
        [
            "county",
            "state",
            "new_cases_per_mil",
            "new_deaths_per_mil",
            "POPESTIMATE2019",
        ]
    ].dropna().to_csv("surging.csv", index=False)


 def ca_counties():
    url = "https://covidtracking.com/api/states/daily.csv"
    df_states = pd.read_csv(url).sort_values("date")

    county2data = {}
    region_pops = {}
    df_ca = pd.read_csv("ca-counties.csv")
    for (row_idx, row) in df_ca.iterrows():
        region = row["region"]
        population = row["population"]
        county2data[row["county"]] = {"region": region, "population": population}
        region_pops[region] = region_pops.get(region, 0) + int(
            population.replace(",", "")
        )

    url = (
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
    )
    df_counties = pd.read_csv(url)
    df_ca_counties = df_counties[df_counties["state"] == "California"]
    df_ca_counties["region"] = df_ca_counties["county"].apply(
        lambda county: county2data.get(county, {"region": "None"})["region"]
    )
    df_ca_counties = df_ca_counties[df_ca_counties["region"] != "None"]
    df_ca_counties = df_ca_counties[df_ca_counties["date"] >= "2020-05-01"]
    df_ca_counties["date"] = df_ca_counties["date"]
    df_ca_counties.sort_values(["county", "date"], inplace=True)
    df_ca_counties["date"] = df_ca_counties["date"].apply(
        lambda x: datetime.strptime(str(x), "%Y-%m-%d")
    )

    df_ca_counties["new_cases"] = df_ca_counties["cases"].diff()
    mask = df_ca_counties["county"] != df_ca_counties["county"].shift(1)
    df_ca_counties["new_cases"][mask] = np.nan

    df_ca_counties["new_deaths"] = df_ca_counties["deaths"].diff()
    df_ca_counties["new_deaths"][mask] = np.nan

    region_new_cases = (
        df_ca_counties.groupby(["date", "region"])["new_cases"].sum().reset_index()
    )
    region_new_deaths = (
        df_ca_counties.groupby(["date", "region"])["new_deaths"].sum().reset_index()
    )
    region_new_deaths["population"] = region_new_deaths["region"].apply(
        lambda region: region_pops[region]
    )
    region_new_deaths["per_mil"] = (
        1_000_000 * region_new_deaths["new_deaths"] / region_new_deaths["population"]
    )

    fig = plt.figure(figsize=(10, 6))
    ca_regions = list(region_pops)
    for region in ca_regions:
        df_region = region_new_deaths[region_new_deaths["region"] == region]
        sns.lineplot(x=df_region["date"], y=df_region["per_mil"].rolling(7).mean())

    state_pops = {"AZ": 7_278_717, "FL": 21_477_737}
    keep_states = list(state_pops)
    for state in keep_states:
        df_state = df_states[df_states["state"] == state]
        df_state = df_state[df_state["date"] >= 20_200_501]
        df_state["date"] = df_state["date"].apply(
            lambda x: datetime.strptime(str(x), "%Y%m%d")
        )
        df_state["per_mil"] = 1_000_000 * df_state["deathIncrease"] / state_pops[state]
        sns.lineplot(x=df_state["date"], y=df_state["per_mil"].rolling(7).mean())

    fig.legend(ca_regions + keep_states, loc=(0.85, 0.15))
    plt.show()


 def nursing_homes():
    # ZIP to FIPS from: https://data.world/niccolley/us-zipcode-to-county-state#.
    f_zip_fips = "ZIP-COUNTY-FIPS_2017-06.csv"
    df_zip_fips = pd.read_csv(f_zip_fips)
    df_zip_fips["ZIP"] = df_zip_fips["ZIP"].apply(
        lambda zip_code: str(zip_code).zfill(5)
    )
    df_zip_fips["STCOUNTYFP"] = df_zip_fips["STCOUNTYFP"].apply(
        lambda zip_code: str(zip_code).zfill(5)
    )
    zip2fips = dict(zip(df_zip_fips["ZIP"], df_zip_fips["STCOUNTYFP"]))

    # Nursing home data.
    url_nursing_homes = "https://opendata.arcgis.com/datasets/78c58035fb3942ba82af991bb4476f13_0.csv?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"
    df_nursing_homes = pd.read_csv(url_nursing_homes)
    df_nursing_homes["ZIP"] = df_nursing_homes["ZIP"].apply(
        lambda zip_code: str(zip_code).zfill(5)
    )
    df_nursing_homes["FIPS"] = df_nursing_homes["ZIP"].apply(
        lambda zip_code: zip2fips.get(zip_code, "")
    )
    print(len(df_nursing_homes))

    # Remove rows with missing ZIP codes.
    df_nursing_homes = df_nursing_homes[df_nursing_homes["FIPS"] != ""]
    print(len(df_nursing_homes))

    # Some rows are missing beds data (i.e., coded as -999). Impute the missing data
    # using the median number of beds per nursing home for rows without missing data.
    median_beds = df_nursing_homes[df_nursing_homes["BEDS"] > 0]["BEDS"].median()
    df_nursing_homes.loc[df_nursing_homes["BEDS"] < 0, "BEDS"] = median_beds

    # Get nursing home data per county.
    beds_by_fips = df_nursing_homes.groupby("FIPS", as_index=False)["BEDS"].sum()
    homes_by_fips = pd.DataFrame(df_nursing_homes["FIPS"].value_counts())
    homes_by_fips.columns = ["HOMES"]
    homes_by_fips["FIPS"] = homes_by_fips.index

    # County COVID-19 data.
    url_counties = (
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
    )
    df_counties = pd.read_csv(url_counties)
    today = df_counties["date"].max()
    df_counties_today = df_counties[df_counties["date"] == today].sort_values(
        "deaths", ascending=False
    )
    df_counties_today["fips"] = df_counties_today["fips"].replace(np.nan, -1)
    df_counties_today["fips"] = df_counties_today["fips"].apply(
        lambda fips: str(int(fips)).zfill(5)
    )

    # Combine with population data.
    url_population = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
    df_population = pd.read_csv(url_population, encoding="ISO-8859-1")
    df_population["STATE"] = df_population["STATE"].apply(
        lambda state_fips: str(state_fips).zfill(2)
    )
    df_population["COUNTY"] = df_population["COUNTY"].apply(
        lambda county_fips: str(county_fips).zfill(3)
    )
    df_population["FIPS"] = df_population["STATE"] + df_population["COUNTY"]
    combined = df_counties_today.merge(
        df_population[["FIPS", "POPESTIMATE2019"]],
        left_on="fips",
        right_on="FIPS",
        how="left",
    )

    # Combine with beds data.
    combined = combined.merge(beds_by_fips, left_on="fips", right_on="FIPS", how="left")
    # Fill in counties without nursing home data.
    combined["BEDS"] = combined["BEDS"].replace(np.nan, 0)

    # Combine with homes data.
    combined = combined.merge(
        homes_by_fips, left_on="fips", right_on="FIPS", how="left"
    )
    # Fill in counties without nursing home data.
    combined["HOMES"] = combined["HOMES"].replace(np.nan, 0)

    # New York City FIPS: https://data.ny.gov/Government-Finance/NY-Municipalities-and-County-FIPS-codes/79vr-2kdi.
    nyc_fips = {"36005", "36047", "36061", "36081", "36085"}
    nyc_beds = nyc_homes = nyc_pop = 0
    for fips in nyc_fips:
        nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0]
        nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0]
        nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[
            0
        ]

    combined.loc[combined["county"] == "New York City", "BEDS"] = nyc_beds
    combined.loc[combined["county"] == "New York City", "HOMES"] = nyc_homes
    combined.loc[combined["county"] == "New York City", "POPESTIMATE2019"] = nyc_pop

    # Add ZIP codes.
    fips2zip = {f: z for (z, f) in zip2fips.items()}
    combined["zip"] = combined["fips"].apply(lambda fips: fips2zip.get(fips, fips))

    # Add latitudes and longitudes.
    nomi = pgeocode.Nominatim("us")
    (combined["lat"], combined["long"]) = zip(
        *combined["zip"].map(
            lambda zip_code: tuple(
                nomi.query_postal_code(zip_code)[["latitude", "longitude"]]
            )
        )
    )
    combined.loc[combined["county"] == "New York City", "lat"] = 40.7128
    combined.loc[combined["county"] == "New York City", "long"] = -74.0060

    # Exclude locations without latitude/longitude coordinates.
    print(len(combined))
    final = combined[~combined["lat"].isnull()]
    print(len(final))

    # No obvious relationship.
    sns.scatterplot(final["BEDS"], final["deaths"])
    plt.show()
    sns.scatterplot(final["HOMES"], final["deaths"])
    plt.show()

    sns.scatterplot(final["cases"], final["deaths"])
    plt.show()

    # Add New York MSA.
    nyc_beds = final[final["county"] == "New York City"]["BEDS"].iloc[0]
    nyc_homes = final[final["county"] == "New York City"]["HOMES"].iloc[0]
    nyc_pop = final[final["county"] == "New York City"]["POPESTIMATE2019"].iloc[0]
    nyc_cases = final[final["county"] == "New York City"]["cases"].iloc[0]
    nyc_deaths = final[final["county"] == "New York City"]["deaths"].iloc[0]
    for fips in nyc_msa_fips:
        fips = str(fips).zfill(5)
        nyc_beds += beds_by_fips[beds_by_fips["FIPS"] == fips]["BEDS"].iloc[0]
        nyc_homes += homes_by_fips[homes_by_fips["FIPS"] == fips]["HOMES"].iloc[0]
        nyc_pop += df_population[df_population["FIPS"] == fips]["POPESTIMATE2019"].iloc[
            0
        ]
        nyc_cases += df_counties_today[df_counties_today["fips"] == fips]["cases"].iloc[
            0
        ]
        nyc_deaths += df_counties_today[df_counties_today["fips"] == fips][
            "deaths"
        ].iloc[0]

    final = final.append(
        {
            "county": "NYC MSA",
            "cases": nyc_cases,
            "deaths": nyc_deaths,
            "BEDS": nyc_beds,
            "HOMES": nyc_homes,
            "POPESTIMATE2019": nyc_pop,
            "lat": 40.7128,
            "long": -74.0060,
        },
        ignore_index=True,
    )

    scale = 500
    final["deaths_per_mil"] = 1_000_000 * final["deaths"] / final["POPESTIMATE2019"]
    which_zs = ["POPESTIMATE2019", "BEDS", "HOMES", "cases", "deaths", "deaths_per_mil"]
    for which_z in which_zs:
        m = Basemap(
            llcrnrlon=-119,
            llcrnrlat=22,
            urcrnrlon=-64,
            urcrnrlat=49,
            projection="lcc",
            lat_1=33,
            lat_2=45,
            lon_0=-95,
        )
        m.drawmapboundary()
        m.drawcoastlines()
        (x, y) = m(list(final["long"]), list(final["lat"]))
        z = final[which_z]
        log_z = np.log(z + 1)
        norm = matplotlib.colors.Normalize(
            vmin=log_z.min(), vmax=log_z.max(), clip=True
        )
        mapper = cm.ScalarMappable(norm=norm, cmap=cm.coolwarm)
        colors = mapper.to_rgba(log_z)
        z = z / z.max()
        m.scatter(x, y, c=colors, s=scale * z, alpha=0.5)
        plt.show()


 def maricopa_mobility():
    df = pd.read_csv("Global_Mobility_Report.csv")
    df_az = df[df["sub_region_1"] == "Arizona"]
    df_mari = df_az[df_az["sub_region_2"] == "Maricopa County"]
    df_mari["date"] = df_mari["date"].apply(
        lambda x: datetime.strptime(str(x), "%Y-%m-%d")
    )
    mobility_cols = [
        "retail_and_recreation_percent_change_from_baseline",
        "grocery_and_pharmacy_percent_change_from_baseline",
        "parks_percent_change_from_baseline",
        "transit_stations_percent_change_from_baseline",
        "workplaces_percent_change_from_baseline",
        "residential_percent_change_from_baseline",
    ]
    (rows, cols) = (2, 3)
    (fig, axs) = plt.subplots(rows, cols)
    for (col_idx, mobility_col) in enumerate(mobility_cols):
        (row, col) = (col_idx // cols, col_idx % cols)
        sns.lineplot(df_mari["date"], df_mari[mobility_col], ax=axs[row][col])

    plt.show()


 def excess():
    url = "https://data.cdc.gov/api/views/xkkf-xrst/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target="
    df = pd.read_csv(url)
    state_pop_mils = {
        "Florida": 21.48,
        "Texas": 29,
        "New Jersey": 8.882,
        "New York": 11.051,
        "New York City": 8.339,
    }
    state_excesses = {}
    for state in state_pop_mils:
        df_state = df[df["State"] == state]
        lower_excess = df_state["Total Excess Lower Estimate in 2020"].max()
        higher_excess = df_state["Total Excess Higher Estimate in 2020"].max()
        excess = (lower_excess + higher_excess) / 2
        state_excesses[state] = excess

    for (state, excess) in state_excesses.items():
        print(state)
        print(excess / state_pop_mils[state])

    all_ny_excess = state_excesses["New York"] + state_excesses["New York City"]
    all_ny_pop_mil = state_pop_mils["New York"] + state_pop_mils["New York City"]
    print("New York with New York City")
    print(all_ny_excess / all_ny_pop_mil)

    nyc_deaths = df[
        (df["State"] == "New York City")
        & (df["Week Ending Date"] >= "2020")
        & (df["Type"] == "Predicted (weighted)")
        & (df["Outcome"] == "All causes")
    ][["Average Expected Count", "Observed Number"]].values
    print((nyc_deaths[:, 1] - nyc_deaths[:, 0]).sum())

    lows = df.groupby("State")["Total Excess Lower Estimate in 2020"].max()
    highs = df.groupby("State")["Total Excess Higher Estimate in 2020"].max()
    avgs = (lows + highs) / 2
    print(avgs.sum() - avgs[avgs.index == "United States"])


 def canada():
    url = "https://raw.githubusercontent.com/ishaberry/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv"
    df = pd.read_csv(url)
    df_on = df[df["province"] == "Ontario"]
    dates = df_on["date_testing"].apply(lambda x: datetime.strptime(str(x), "%d-%m-%Y"))
    sns.lineplot(dates, df_on["testing"].rolling(7).mean())
    plt.show()


 def age_deaths():
    url = "https://data.cdc.gov/api/views/vsak-wrfu/rows.csv?accessType=DOWNLOAD"
    df = pd.read_csv(url)
    age_groups = [
        "Under 1 year",
        "1-4 years",
        "5-14 years",
        "15-24 years",
        "25-34 years",
        "35-44 years",
        "45-54 years",
        "55-64 years",
        "65-74 years",
        "75-84 years",
        "85 years and over",
    ]
    df_age_sex = df.groupby(["MMWR Week", "Age Group"])[
        ["Total Deaths", "COVID-19 Deaths"]
    ].sum()
    df_age_sex = df_age_sex.reset_index()

    fig = plt.figure(figsize=(10, 6))
    for age_group in age_groups:
        df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
        sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"])

    fig.legend(age_groups, loc=(0.85, 0.15))
    plt.show()

    fig = plt.figure(figsize=(10, 6))
    for age_group in age_groups:
        df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
        sns.lineplot(
            df_age["MMWR Week"], df_age["COVID-19 Deaths"] / df_age["Total Deaths"]
        )

    fig.legend(age_groups, loc=(0.85, 0.15))
    plt.show()

    fig = plt.figure(figsize=(10, 6))
    for age_group in age_groups[3:]:
        df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
        max_deaths = df_age["COVID-19 Deaths"].max()
        sns.lineplot(df_age["MMWR Week"], df_age["COVID-19 Deaths"] / max_deaths)

    fig.legend(age_groups[3:], loc=(0.85, 0.15))
    plt.show()

    df_age_totals = df_age_sex.groupby("Age Group")["COVID-19 Deaths"].sum().to_dict()

    fig = plt.figure(figsize=(10, 6))
    for age_group in age_groups:
        df_age = df_age_sex[df_age_sex["Age Group"] == age_group]
        cum_totals = df_age["COVID-19 Deaths"].cumsum()
        sns.lineplot(df_age["MMWR Week"], cum_totals / df_age_totals[age_group])

    fig.legend(age_groups, loc=(0.85, 0.15))
    plt.show()


 def death_risk():
    url = "https://data.cdc.gov/api/views/y5bj-9g5w/rows.csv?accessType=DOWNLOAD&bom=true&format=true%20target="
    df = pd.read_csv(url)

    comp_states = [
        "New York City",
        "New York",
        "New Jersey",
        "Louisiana",
        "Massachusetts",
        "Mississippi",
        "Florida",
        "Alabama",
    ]
    comp_years = [2019, 2020]
    comp_age_groups = [
        "25-44 years",
        "65-74 years",
        "75-84 years",
        "85 years and older",
    ]
    comp_age_groups = df["Age Group"].unique()
    data = []
    for comp_state in comp_states:
        print(f"\nJurisdiction: {comp_state}")
        row = {"state": comp_state}
        for comp_age_group in comp_age_groups:
            print(f"\nAge Group: {comp_age_group}")
            death_data = []
            for comp_year in comp_years:
                deaths = df[
                    (df["Jurisdiction"] == comp_state)
                    & (df["Year"] == comp_year)
                    & (df["Week Ending Date"] < "10")
                    & (df["Age Group"] == comp_age_group)
                    & (df["Type"] == "Unweighted")
                ]["Number of Deaths"].sum()
                if comp_state == "New York":
                    deaths += df[
                        (df["Jurisdiction"] == "New York City")
                        & (df["Year"] == comp_year)
                        & (df["Week Ending Date"] < "10")
                        & (df["Age Group"] == comp_age_group)
                        & (df["Type"] == "Unweighted")
                    ]["Number of Deaths"].sum()

                death_data.append(deaths)
                print(f"{comp_year}: {int(deaths)}")

            per_inc = 100 * (death_data[1] / death_data[0] - 1)
            col = f"{comp_age_group}"
            row[col] = per_inc
            print(f"Percent Increase: {per_inc:.2f}")

        data.append(row)

    df_deaths = pd.DataFrame(data)
    df_melted = df_deaths.melt(
        "state", var_name="age_group", value_name="deaths % inc."
    )
    sns.barplot(x="state", y="deaths % inc.", hue="age_group", data=df_melted)
    plt.show()


 def obesity():
    us_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_United%20States%20of%20America.csv"
    de_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Germany.csv"
    vn_url = "http://www.ncdrisc.org/downloads/bmi/adult_country/NCD_RisC_Lancet_2017_BMI_age_standardised_Viet%20Nam.csv"
    urls = [us_url, de_url, vn_url]
    dfs = []
    for url in urls:
        dfs.append(pd.read_csv(url, encoding="ISO-8859-1"))

    df = pd.concat(dfs)
    keep_columns = [
        column
        for column in df.columns
        if (("Prevalence" in column) and ("interval" not in column))
    ][2:]
    country_col = "Country/Region/World"
    keep_columns.append(country_col)
    df_2016 = df[(df["Sex"] == "Men") & (df["Year"] == 2016)][keep_columns]

    cols = {"bmi": [], "percent": [], "country": []}
    for (row_idx, row) in df_2016.iterrows():
        country = row[country_col]
        for col in keep_columns:
            if col == country_col:
                continue

            bmi_str = re.search("BMI.*m²", col).group()
            bmi_str = bmi_str.replace("BMI ", "")
            bmi_str = bmi_str.replace("BMI", "")
            bmi_str = bmi_str.replace(" kg/m²", "")
            cols["bmi"].append(bmi_str)
            cols["percent"].append(row[col])
            cols["country"].append(country)

    df = pd.DataFrame(cols)
    sns.barplot(x="bmi", y="percent", hue="country", data=df)
    plt.show()
No results found