Last active
February 11, 2021 18:35
-
-
Save jsundram/530324b4dbdaf83692d4997967d41bdc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from dateutil import parser | |
import csv | |
import json | |
import requests | |
def _title_from_page(page):
    """Return the percent-decoded article title for a wikipedia URL or slug."""
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    marker = '/wiki/'
    if marker in page:
        # Keep everything after /wiki/ so titles that contain '/' (AC/DC)
        # are not truncated to their last path segment.
        slug = page.split(marker, 1)[1]
        # In a URL, '#' and '?' start the fragment / query string, which
        # are not part of the title (literal ones are percent-encoded).
        slug = slug.split('#', 1)[0].split('?', 1)[0]
    else:
        # Not an article URL: fall back to the last '/'-separated piece,
        # which is the whole string for a bare title.
        slug = page.split('/')[-1]
    # Decode here because requests re-encodes the value passed via params=;
    # leaving it encoded would double-encode titles such as Erd%C5%91s.
    return unquote(slug)


def id_for_page(page, timeout=10):
    """Uses the wikipedia api to find the wikidata id for a page.

    Args:
        page: URL of an English wikipedia article (or a bare page title).
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The value of the page's 'wikibase_item' page property.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if the API result carries no 'wikibase_item' for the page.
    """
    api = "https://en.wikipedia.org/w/api.php"
    title = _title_from_page(page)
    # Let requests build and escape the query string instead of
    # interpolating the title into it by hand.
    params = {
        'action': 'query',
        'prop': 'pageprops',
        'titles': title,
        'format': 'json',
    }
    reply = requests.get(api, params=params, timeout=timeout)
    reply.raise_for_status()
    response = json.loads(reply.content)
    # Assume we got 1 page result and it is correct.
    page_info = list(response['query']['pages'].values())[0]
    try:
        return page_info['pageprops']['wikibase_item']
    except KeyError:
        # Same exception type as before, but with a message that names
        # the page instead of just the missing dictionary key.
        raise KeyError("no wikidata item found for wikipedia page %r" % title)
def lifespan_for_id(wikidata_id, timeout=10):
    """Uses the wikidata API to retrieve birth and death dates for an id.

    Args:
        wikidata_id: wikidata entity id, interpolated into the
            Special:EntityData URL.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        A two-element list [birth, death]; each element is whatever
        get_claim_as_time returns for claims P569 and P570 respectively
        (None when the claim is absent).

    Raises:
        requests.HTTPError: if wikidata answers with an error status.
    """
    data_url = "https://www.wikidata.org/wiki/Special:EntityData/%s.json"
    reply = requests.get(data_url % wikidata_id, timeout=timeout)
    # Fail on HTTP errors here rather than feeding an error page to the
    # JSON decoder below.
    reply.raise_for_status()
    page = json.loads(reply.content)
    # The entities mapping is assumed to hold exactly one entry.
    claims = list(page['entities'].values())[0]['claims']
    # P569 (birth) and P570 (death) ... not everyone has died yet.
    return [get_claim_as_time(claims, cid) for cid in ['P569', 'P570']]
def get_claim_as_time(claims, claim_id):
    """Helper function to work with data returned from wikidata api.

    Args:
        claims: mapping of claim id -> list of claim dicts, as found under
            an entity's 'claims' key.
        claim_id: the claim to read, e.g. 'P569'.

    Returns:
        The parsed time of the first claim with that id, or None when the
        claim (or its value) is missing, or when the time carries a leading
        '-' sign and so cannot be returned without dropping that sign.

    Raises:
        ValueError: if the claim's datavalue is not of type 'time'.
    """
    try:
        claim = claims[claim_id][0]['mainsnak']['datavalue']
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if claim['type'] != 'time':
            raise ValueError("Expecting time data type")
        raw_time = claim['value']['time']
    except (KeyError, IndexError) as e:
        # Missing claim, empty claim list, or a snak without a datavalue:
        # report it and treat the date as unknown.
        print(e)
        return None
    if raw_time.startswith('-'):
        # Stripping the sign would silently turn this into a positive-year
        # date, so report it as unknown instead.
        print("cannot represent negative-year time %s" % raw_time)
        return None
    # dateparser chokes on leading '+', thanks wikidata.
    # NOTE(review): wikidata may use 00 for month/day on low-precision
    # dates; confirm how dateutil handles such strings.
    return parser.parse(raw_time.lstrip('+'))
def main(page='https://en.wikipedia.org/wiki/Albert_Einstein'):
    """Print the birth and death dates for the person on a wikipedia page.

    Args:
        page: URL of the wikipedia article to look up; defaults to the
            Albert Einstein article.
    """
    # 1. use the wikipedia api to find the wikidata id for this page
    wikidata_id = id_for_page(page)
    # 2. use the wikidata id to get the birth and death dates
    span = lifespan_for_id(wikidata_id)
    for label, dt in zip(["birth", "death"], span):
        # A date is None when the claim is absent (e.g. no death date for
        # a living person); formatting None would raise TypeError.
        shown = dt.strftime("%b %d, %Y") if dt is not None else "unknown"
        print(label, " = ", shown)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment