Skip to content

Instantly share code, notes, and snippets.

@lucasrodes
Last active August 12, 2021 16:45
Show Gist options
  • Save lucasrodes/0183e55d87f21544cd0bb4d4c4431445 to your computer and use it in GitHub Desktop.
Save lucasrodes/0183e55d87f21544cd0bb4d4c4431445 to your computer and use it in GitHub Desktop.
"""Checking if `people_fully_vaccinated` metric (from Our World in Data vaccination dataset) follows Benford's Law.
Reference: https://twitter.com/lucasrodesg/status/1425770193993744386?s=20
"""
import os
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from plotly.offline import plot
import plotly.graph_objs as go
os.makedirs("images")
# theoretical
x = np.arange(1, 10)
f_the = np.log10(1+1/x)
def entities_accepted():
"""Only return country names (exclude continents, aggregates, etc)."""
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/scripts/input/un/population_2020.csv"
df_pop = pd.read_csv(url).dropna()
entities_accepted = df_pop[-df_pop.iso_code.apply(lambda x: "OWID" in x and x!="OWID_KOS")].entity
return entities_accepted
def load_data(metric="people_fully_vaccinated"):
# Load data
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv"
df = pd.read_csv(url)
df = df[["location", "date", metric]].dropna()
df = df[df[metric]!=0]
df = df[df.location.isin(entities_accepted())]
return df
# Experimental
def create_chart(df, date_str, metric="people_fully_vaccinated"):
df = df[df.date<date_str].copy()
df = df.assign(first_digit=df[metric].apply(lambda x: str(x)[0]))
f_exp = df.first_digit.value_counts(normalize=True)
# Plot
trace = go.Scatter(x=x, y=f_the, marker_color="yellow", mode='markers', marker_line_width=2, marker_size=30, name="Benford's law")
fig = go.Figure(data=[trace], layout={"title": f"Distribution of the first digit of '{metric}' values ({date_str})"})
fig.add_bar(x=x, y=f_exp, marker_color='rgb(55, 83, 109)', name="Experiment")
fig.write_image(f"images/{metric}-benford-{date_str}.png")
return fig# plot(fig, filename=f"{metric}-benford-{date_str}.html")
# Generate charts for several dates
base_date = datetime(2021, 1, 1)
base_increment = timedelta(days=14)
df = load_data()
for i in range(30):
date_raw = (base_date + i*base_increment)
if base_date + i*base_increment > datetime.now():
break
date_str = date_raw.strftime("%Y-%m-%d")
print(date_str)
_ = create_chart(df, date_str)
date_str = datetime.now().strftime("%Y-%m-%d")
_ = create_chart(df, date_str)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment