Last active
August 12, 2021 16:45
-
-
Save lucasrodes/0183e55d87f21544cd0bb4d4c4431445 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Checking if `people_fully_vaccinated` metric (from Our World in Data vaccination dataset) follows Benford's Law. | |
Reference: https://twitter.com/lucasrodesg/status/1425770193993744386?s=20 | |
""" | |
import os | |
from datetime import datetime, timedelta | |
import pandas as pd | |
import numpy as np | |
from plotly.offline import plot | |
import plotly.graph_objs as go | |
os.makedirs("images") | |
# theoretical | |
x = np.arange(1, 10) | |
f_the = np.log10(1+1/x) | |
def entities_accepted(): | |
"""Only return country names (exclude continents, aggregates, etc).""" | |
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/scripts/input/un/population_2020.csv" | |
df_pop = pd.read_csv(url).dropna() | |
entities_accepted = df_pop[-df_pop.iso_code.apply(lambda x: "OWID" in x and x!="OWID_KOS")].entity | |
return entities_accepted | |
def load_data(metric="people_fully_vaccinated"): | |
# Load data | |
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv" | |
df = pd.read_csv(url) | |
df = df[["location", "date", metric]].dropna() | |
df = df[df[metric]!=0] | |
df = df[df.location.isin(entities_accepted())] | |
return df | |
# Experimental | |
def create_chart(df, date_str, metric="people_fully_vaccinated"): | |
df = df[df.date<date_str].copy() | |
df = df.assign(first_digit=df[metric].apply(lambda x: str(x)[0])) | |
f_exp = df.first_digit.value_counts(normalize=True) | |
# Plot | |
trace = go.Scatter(x=x, y=f_the, marker_color="yellow", mode='markers', marker_line_width=2, marker_size=30, name="Benford's law") | |
fig = go.Figure(data=[trace], layout={"title": f"Distribution of the first digit of '{metric}' values ({date_str})"}) | |
fig.add_bar(x=x, y=f_exp, marker_color='rgb(55, 83, 109)', name="Experiment") | |
fig.write_image(f"images/{metric}-benford-{date_str}.png") | |
return fig# plot(fig, filename=f"{metric}-benford-{date_str}.html") | |
# Generate charts for several dates | |
base_date = datetime(2021, 1, 1) | |
base_increment = timedelta(days=14) | |
df = load_data() | |
for i in range(30): | |
date_raw = (base_date + i*base_increment) | |
if base_date + i*base_increment > datetime.now(): | |
break | |
date_str = date_raw.strftime("%Y-%m-%d") | |
print(date_str) | |
_ = create_chart(df, date_str) | |
date_str = datetime.now().strftime("%Y-%m-%d") | |
_ = create_chart(df, date_str) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment