Skip to content

Instantly share code, notes, and snippets.

@rejsmont
Last active March 20, 2020 20:20
Show Gist options
  • Save rejsmont/aa06757ebfa4c92e35685bd3e1a602a8 to your computer and use it in GitHub Desktop.
Save rejsmont/aa06757ebfa4c92e35685bd3e1a602a8 to your computer and use it in GitHub Desktop.
Fetch COVID-19 data for a country from Wikipedia
import warnings

import pandas as pd
import requests
def get_wikipedia_data(country, timeout=30):
    """Fetch the COVID-19 medical cases chart of one country from Wikipedia.

    Requests the latest wikitext of the English Wikipedia page
    ``Template:2019–20_coronavirus_pandemic_data/<country>_medical_cases_chart``
    through the MediaWiki API and keeps every line that starts with ``{{``
    and contains ``Medical cases chart/Row``. Each such line is split on
    ``|``; fields 1 to 4 become the date and the three case columns.

    Args:
        country: Country name exactly as it appears in the template title.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A DataFrame indexed by the parsed ``Date`` with the columns
        ``Dead``, ``Recovered`` and ``Confirmed``. Cell values are the raw
        strings found in the wikitext (``None`` where a row has fewer
        fields). Rows whose date is empty or missing are dropped.

    Raises:
        requests.RequestException: The request failed, timed out or
            returned an HTTP error status.
        KeyError, IndexError: The API response carries no revision content
            (for example because the page does not exist).
        ValueError: No chart rows were found; errors raised by
            ``pd.to_datetime`` for unparseable dates also propagate.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": "Template:2019–20_coronavirus_pandemic_data/" + country + "_medical_cases_chart",
        "prop": "revisions",
        "format": "json",
        "rvslots": "*",
        "rvprop": "content",
        "formatversion": "2",
    }
    # The context manager releases the connection pool; the timeout keeps a
    # stalled connection from blocking forever.
    with requests.Session() as session:
        response = session.get(url=url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    content = payload['query']['pages'][0]['revisions'][0]['slots']['main']['content']

    columns = ['Date', 'Dead', 'Recovered', 'Confirmed']
    rows = []
    for line in content.split("\n"):
        if line.startswith("{{") and "Medical cases chart/Row" in line:
            # Field 0 is the template name; the next four are the data.
            fields = line.strip("{}").split("|")[1:5]
            # Pad short rows so every row has one value per column.
            fields.extend([None] * (len(columns) - len(fields)))
            rows.append(fields)
    if not rows:
        raise ValueError(
            "no 'Medical cases chart/Row' entries found for {!r}".format(country)
        )

    df = pd.DataFrame(rows, columns=columns).set_index('Date')
    df.index = pd.to_datetime(df.index)
    # A boolean mask drops NaT rows without duplicating rows that share a
    # date, which label-based selection would do on a non-unique index.
    return df[df.index.notna()]
def get_wikipedia_countries(timeout=30):
    """List the countries that have a medical cases chart on Wikipedia.

    Requests the latest wikitext of the English Wikipedia page
    ``Template:2019–20 coronavirus pandemic`` through the MediaWiki API and
    scans it for lines that mention ``medical cases chart``. The country
    name is what remains of such a line after stripping the list/link
    markup and removing the template prefix and the chart suffix.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of country names, without duplicates, in order of first
        appearance in the template.

    Raises:
        requests.RequestException: The request failed, timed out or
            returned an HTTP error status.
        KeyError, IndexError: The API response carries no revision content.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": "Template:2019–20 coronavirus pandemic",
        "prop": "revisions",
        "format": "json",
        "rvslots": "*",
        "rvprop": "content",
        "formatversion": "2",
    }
    # The context manager releases the connection pool; the timeout keeps a
    # stalled connection from blocking forever.
    with requests.Session() as session:
        response = session.get(url=url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    content = payload['query']['pages'][0]['revisions'][0]['slots']['main']['content']

    # A dict keeps insertion order and gives constant-time de-duplication.
    countries = {}
    for line in content.split("\n"):
        if "medical cases chart" not in line:
            continue
        # str.strip treats its argument as a set of characters, so this
        # removes any leading/trailing '*', ' ', '[' and ']'.
        country = (
            line.strip("** [[ ]]")
            .replace("Template:2019–20 coronavirus pandemic data/", "")
            .replace(" medical cases chart|chart", "")
        )
        countries.setdefault(country, None)
    return list(countries)
# Build one wide table: rows are dates, columns are (Country, Cases) pairs.
# `df` stays None if no country could be loaded.
df = None
countries = get_wikipedia_countries()
for country in countries:
    try:
        data = get_wikipedia_data(country)
    except Exception as err:
        # Best effort: a country whose chart cannot be fetched or parsed is
        # skipped, but the reason is reported instead of being hidden.
        # Catching Exception (not a bare except) lets Ctrl-C still work.
        warnings.warn("Skipping {!r}: {!r}".format(country, err))
        continue
    # Put the country name above the case columns so frames from different
    # countries never collide on column names.
    cols = pd.MultiIndex.from_product([[country], data.columns], names=['Country', 'Cases'])
    data.columns = cols
    if df is None:
        df = data
    else:
        # The outer join keeps every date seen for any country.
        df = df.join(data, how="outer", rsuffix="_duplicate")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment