Last active
June 9, 2016 21:20
-
-
Save harej/1ab12a75f8ed4e755af2 to your computer and use it in GitHub Desktop.
Generates list of items and properties used on NPG-related Wikidata entries and assesses existence of labels in other languages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Step 1: Get list of any Wikidata item with an NPG ID and anything that is a subclass of chemical hazard
# Step 2: Iterate through each item for invoked items and properties
#         (for claim in claims; for subclaim in claim:
#          'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
#          and subclaim['mainsnak']['property'] where subclaim['mainsnak']['datatype'] == 'wikibase-item')
# Step 3: De-duplicate to generate an exhaustive list of each item/property of interest to NIOSH
# Step 4: Check labels: en, es, zh, fr, de
# Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
# Step 6: Take percentages of coverage in each language; save to a timestamped log
import requests | |
def wdqs(encoded_query):
    """Run a SPARQL query against the Wikidata Query Service.

    Args:
        encoded_query: URL-encoded SPARQL query. Every result row must bind
            a variable named ``item`` holding an entity URI.

    Returns:
        List of entity identifiers (e.g. ``"Q42"``), one per result row, in
        the order the service returned them.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer in time.
    """
    base_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={0}&format=json"
    entity_prefix = "http://www.wikidata.org/entity/"

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(base_url.format(encoded_query), timeout=60)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    return [
        binding['item']['value'].replace(entity_prefix, "")
        for binding in response.json()['results']['bindings']
    ]
def entitydata(identifier):
    """Fetch the Special:EntityData record for one Wikidata entity.

    Args:
        identifier: Wikidata identifier such as ``"Q42"`` or ``"P31"``.

    Returns:
        Dictionary for that entity, i.e. the value stored under
        ``entities`` in the Special:EntityData JSON blob (with keys such as
        ``labels`` and ``claims``).

    Raises:
        requests.HTTPError: If Wikidata answers with an error status.
        requests.Timeout: If Wikidata does not answer in time.
    """
    url = "https://www.wikidata.org/wiki/Special:EntityData/{0}.json".format(identifier)
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    entities = response.json()['entities']
    if identifier in entities:
        return entities[identifier]
    # NOTE(review): when the identifier is a redirect the blob appears to be
    # keyed by the redirect target instead; fall back to the only record
    # present. Confirm against the Special:EntityData documentation.
    return next(iter(entities.values()))
def linked_on_page(blob):
    """List the items and properties linked from a Wikidata entity.

    Only statements whose main snak has datatype ``wikibase-item`` are
    considered. For each of them the property ID and, when the statement
    has a concrete value, the target item ID are collected.

    Args:
        blob: EntityData dictionary for a single entity. The raw
            ``{"entities": {id: record}}`` envelope is also accepted, in
            which case its first record is used.

    Returns:
        List of identifiers (``"P..."`` and ``"Q..."``) without duplicates,
        in order of first appearance.
    """
    if 'entities' in blob:
        blob = next(iter(blob['entities'].values()), {})

    # Wikibase serialises an empty claims map as [] rather than {}.
    claims = blob.get('claims') or {}

    linked = []
    seen = set()
    for statements in claims.values():
        for statement in statements:
            mainsnak = statement.get('mainsnak', {})
            if mainsnak.get('datatype') != 'wikibase-item':
                continue

            candidates = [mainsnak.get('property')]
            # "novalue"/"somevalue" snaks carry no datavalue at all.
            value = mainsnak.get('datavalue', {}).get('value')
            if value:
                # Older blobs only carry the numeric ID; newer ones also
                # carry the full ID.
                candidates.append(value.get('id') or 'Q' + str(value['numeric-id']))

            for entity_id in candidates:
                if entity_id and entity_id not in seen:
                    seen.add(entity_id)
                    linked.append(entity_id)
    return linked
def other_language_labels(blob, language_codes):
    """Look up an entity's label in each requested language.

    Args:
        blob: EntityData dictionary for a single entity. The raw
            ``{"entities": {id: record}}`` envelope is also accepted, in
            which case its first record is used.
        language_codes: List of ISO language codes (e.g. ``['en', 'de']``).

    Returns:
        Dictionary of language code -> label, with ``None`` for every
        language in which the entity has no label.
    """
    if 'entities' in blob:
        blob = next(iter(blob['entities'].values()), {})

    # Wikibase serialises an empty labels map as [] rather than {}.
    labels = blob.get('labels') or {}

    return {code: labels.get(code, {}).get('value') for code in language_codes}
def gap_analysis(manifest):
    """Compute label coverage per language.

    Args:
        manifest: Dictionary of dictionaries ``{item -> {language: label}}``
            where a missing label is recorded as ``None``.

    Returns:
        Dictionary of language -> percent covered (float between 0 and 100).
        The denominator is the number of entries in ``manifest``, so an
        entry that lacks a language key altogether counts as not covered.
        An empty manifest yields an empty dictionary.
    """
    if not manifest:
        return {}

    # Gather every language mentioned anywhere, keeping first-seen order.
    languages = []
    for labels in manifest.values():
        for language in labels:
            if language not in languages:
                languages.append(language)

    total = len(manifest)
    report = {}
    for language in languages:
        covered = sum(
            1 for labels in manifest.values() if labels.get(language) is not None
        )
        report[language] = 100.0 * covered / total
    return report
def web_page_generator(manifest):
    """Create the two report web pages (not implemented yet).

    Args:
        manifest: Dictionary of dictionaries ``{item -> {language: label}}``.

    Returns:
        Nothing; the function is meant to create two web pages.

    Raises:
        NotImplementedError: Always, until the page layout and output
            locations are decided.
    """
    # TODO: build the HTML table of items/properties, highlighting cells
    # where a label is missing, and write out the two pages.
    raise NotImplementedError("web_page_generator has not been written yet")
def main(chemicals_and_exposures_query=None):
    """Build the label manifest for chemical/exposure items and measure coverage.

    Queries for the chemical/exposure items, fetches each one together with
    every item and property it links to, records their labels in the
    languages of interest and computes per-language coverage.

    Args:
        chemicals_and_exposures_query: URL-encoded SPARQL query selecting
            the chemical/exposure items.
            TODO: put URL-encoded query here as the default.

    Returns:
        The dictionary produced by ``gap_analysis`` for the collected labels.

    Raises:
        ValueError: If no query was supplied.
    """
    if not chemicals_and_exposures_query:
        raise ValueError("No URL-encoded SPARQL query was supplied")

    language_codes = ['en', 'es', 'zh', 'fr', 'de']

    print("Querying for list of chemical/exposure items...")
    chemicals_and_exposures = wdqs(chemicals_and_exposures_query)

    # Maps each item/property ID to its {language: label} dictionary.
    # Keying by ID is also what de-duplicates the entries.
    master_list = {}

    for item in chemicals_and_exposures:
        print("Processing chemical/exposure item: " + item)
        blob = entitydata(item)
        master_list[item] = other_language_labels(blob, language_codes)

        for link in linked_on_page(blob):
            # Skip entities already fetched to avoid repeat requests.
            if link in master_list:
                continue
            print("Processing linked entity: " + link)
            master_list[link] = other_language_labels(entitydata(link), language_codes)

    gap_report = gap_analysis(master_list)
    return gap_report
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment