harej · June 9, 2016 21:20
diff --git a/npg_gap_analysis.py b/npg_gap_analysis.py
 # Step 1: Get list of any Wikidata item with NPG ID and anything that is a subclass of chemical hazard
 # Step 2: Iterate through each item for invoked items and properties
 #         (for claim in claims; for subclaim in claim: 'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
 #          and subclaim['mainsnak']['property'] where claim[0]['mainsnak']['datatype'] == 'wikibase-item')
 # Step 3: De-duplicate to generate exhaustive list of each item/property of interest to NIOSH
 # Step 4: Check labels: en, es, zh, fr, de
 # Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
 # Step 6: Take percentages of coverage in each language; save to a timestamped log

 import requests

 def wdqs(encoded_query):
    """Run a SPARQL query against the Wikidata Query Service.

    encoded_query -- SPARQL query text that the caller has already
                     URL-encoded; it is inserted into the request URL as-is.

    Returns a list with one string per result row: the value bound to ?item
    with the "http://www.wikidata.org/entity/" prefix removed.

    Raises requests.HTTPError when the service answers with an error status,
    and KeyError when a result row has no ?item binding.
    """
    base_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={0}&format=json"
    r = requests.get(base_url.format(encoded_query))
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    blob = r.json()
    entity_prefix = "http://www.wikidata.org/entity/"
    return [row['item']['value'].replace(entity_prefix, "")
            for row in blob['results']['bindings']]

 def entitydata(identifier):
    """Fetch one Wikidata entity from Special:EntityData.

    identifier -- Wikidata identifier of the entity to download.

    Returns the entity's own record taken from the JSON response, i.e. the
    dictionary stored under the response's "entities" key for this entity,
    not the outer wrapper.

    Raises requests.HTTPError when the request is answered with an error
    status.
    """
    url = "https://www.wikidata.org/wiki/Special:EntityData/{0}.json".format(identifier)
    r = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    entities = r.json()['entities']
    if identifier in entities:
        return entities[identifier]
    # NOTE(review): a redirected identifier is presumably keyed under its
    # target's ID; the response describes one entity, so return it -- confirm.
    return next(iter(entities.values()))

 def linked_on_page(blob):
    """List the items and properties linked from an entity's item-valued claims.

    blob -- EntityData dictionary: either a single entity record (with a
            "claims" key) or a raw {"entities": {...}} wrapper around one.

    Returns a list of entity IDs without duplicates, in order of first
    appearance.  For every statement whose main snak has datatype
    'wikibase-item', the list holds the statement's property ID and, when the
    snak carries a value, the ID of the item it points to.
    """
    if 'entities' in blob:
        # Unwrap the raw response shape so both input forms are accepted.
        blob = next(iter(blob['entities'].values()), {})
    # "or {}" also covers an empty claims collection serialized as a list.
    claims = blob.get('claims') or {}
    linked = []
    seen = set()
    for statements in claims.values():
        for statement in statements:
            mainsnak = statement['mainsnak']
            if mainsnak.get('datatype') != 'wikibase-item':
                continue
            found = [mainsnak['property']]
            # A snak without a concrete value has no datavalue to follow.
            if 'datavalue' in mainsnak:
                value = mainsnak['datavalue']['value']
                # Prefer a ready-made 'id'; otherwise build it from 'numeric-id'.
                found.append(value.get('id') or 'Q' + str(value['numeric-id']))
            for entity_id in found:
                if entity_id not in seen:
                    seen.add(entity_id)
                    linked.append(entity_id)
    return linked

 def other_language_labels(blob, language_codes):
    """Look up an entity's label in each requested language.

    blob           -- EntityData dictionary: either a single entity record
                      (with a "labels" key) or a raw {"entities": {...}}
                      wrapper around one.
    language_codes -- list of ISO language codes (e.g. ['en', 'de']).

    Returns a dictionary of language code -> label, with None for every
    requested language the entity has no label in.
    """
    if 'entities' in blob:
        # Unwrap the raw response shape so both input forms are accepted.
        blob = next(iter(blob['entities'].values()), {})
    # "or {}" also covers an empty labels collection serialized as a list.
    labels = blob.get('labels') or {}
    return {code: labels[code]['value'] if code in labels else None
            for code in language_codes}

 def gap_analysis(manifest):
    """Compute label coverage per language.

    manifest -- dictionary of dictionaries {item -> {language: label}}; a
                label of None (or an empty string) counts as missing.

    Returns a dictionary of language -> percent covered, as a float between
    0.0 and 100.0.  An item whose inner dictionary lacks a language counts as
    uncovered for that language.  An empty manifest yields an empty
    dictionary.
    """
    total = len(manifest)
    if total == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return {}
    covered = {}
    for labels in manifest.values():
        for language, label in labels.items():
            covered.setdefault(language, 0)
            if label:
                covered[language] += 1
    return {language: 100.0 * count / total
            for language, count in covered.items()}

 def web_page_generator(manifest):
    """Create the web pages that present the label manifest.

    manifest -- dictionary of dictionaries {item -> {language: label}}.

    Intended to return nothing and create two web pages.  The pages' content
    and destination are not specified yet, so this raises NotImplementedError
    rather than silently doing nothing.
    """
    raise NotImplementedError("web_page_generator has not been written yet")

 def main(chemicals_and_exposures_query=None):
    """Collect labels for every entity of interest and measure coverage.

    chemicals_and_exposures_query -- URL-encoded SPARQL query that selects
        the chemical/exposure items as ?item.  No default query exists yet.

    Returns the dictionary produced by gap_analysis for the collected labels.

    Raises ValueError when no query is supplied.
    """
    language_codes = ['en', 'es', 'zh', 'fr', 'de']

    # TODO: put URL-encoded query here as the default value
    if not chemicals_and_exposures_query:
        raise ValueError("No URL-encoded SPARQL query for chemical/exposure items was supplied")

    print("Querying for list of chemical/exposure items...")
    chemicals_and_exposures = wdqs(chemicals_and_exposures_query)

    master_list = {}
    for item in chemicals_and_exposures:
        print("Processing chemical/exposure item: " + item)
        blob = entitydata(item)
        # Store the whole {language: label} dictionary, as gap_analysis expects;
        # looping over it would only keep the last language code.
        master_list[item] = other_language_labels(blob, language_codes)
        for link in linked_on_page(blob):
            if link in master_list:
                continue  # already fetched; skip the duplicate request
            print("Processing linked entity: " + link)
            master_list[link] = other_language_labels(entitydata(link), language_codes)

    # Dictionary keys are already unique, so no de-duplication pass is needed
    # (converting to a set would throw the labels away).
    return gap_analysis(master_list)
	# Step 1: Get list of any Wikidata item with NPG ID and anything that is a subclass of chemical hazard
	# Step 2: Iterate through each item for invoked items and properties
	# (for claim in claims; for subclaim in claim: 'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
	# and subclaim['mainsnak']['property'] where claim[0]['mainsnak']['datatype'] == 'wikibase-item')
	# Step 3: De-duplicate to generate exhaustive list of each item/property of interest to NIOSH
	# Step 4: Check labels: en, es, zh, fr, de
	# Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
	# Step 6: Take percentages of coverage in each language; save to a timestamped log

	import requests

	def wdqs(encoded_query):
	    """Run a SPARQL query against the Wikidata Query Service.

	    encoded_query -- SPARQL query text that the caller has already
	                     URL-encoded; it is inserted into the request URL as-is.

	    Returns a list with one string per result row: the value bound to ?item
	    with the "http://www.wikidata.org/entity/" prefix removed.

	    Raises requests.HTTPError when the service answers with an error status,
	    and KeyError when a result row has no ?item binding.
	    """
	    base_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={0}&format=json"
	    r = requests.get(base_url.format(encoded_query))
	    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
	    r.raise_for_status()
	    blob = r.json()
	    entity_prefix = "http://www.wikidata.org/entity/"
	    return [row['item']['value'].replace(entity_prefix, "")
	            for row in blob['results']['bindings']]

	def entitydata(identifier):
	    """Fetch one Wikidata entity from Special:EntityData.

	    identifier -- Wikidata identifier of the entity to download.

	    Returns the entity's own record taken from the JSON response, i.e. the
	    dictionary stored under the response's "entities" key for this entity,
	    not the outer wrapper.

	    Raises requests.HTTPError when the request is answered with an error
	    status.
	    """
	    url = "https://www.wikidata.org/wiki/Special:EntityData/{0}.json".format(identifier)
	    r = requests.get(url)
	    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
	    r.raise_for_status()
	    entities = r.json()['entities']
	    if identifier in entities:
	        return entities[identifier]
	    # NOTE(review): a redirected identifier is presumably keyed under its
	    # target's ID; the response describes one entity, so return it -- confirm.
	    return next(iter(entities.values()))

	def linked_on_page(blob):
	    """List the items and properties linked from an entity's item-valued claims.

	    blob -- EntityData dictionary: either a single entity record (with a
	            "claims" key) or a raw {"entities": {...}} wrapper around one.

	    Returns a list of entity IDs without duplicates, in order of first
	    appearance.  For every statement whose main snak has datatype
	    'wikibase-item', the list holds the statement's property ID and, when the
	    snak carries a value, the ID of the item it points to.
	    """
	    if 'entities' in blob:
	        # Unwrap the raw response shape so both input forms are accepted.
	        blob = next(iter(blob['entities'].values()), {})
	    # "or {}" also covers an empty claims collection serialized as a list.
	    claims = blob.get('claims') or {}
	    linked = []
	    seen = set()
	    for statements in claims.values():
	        for statement in statements:
	            mainsnak = statement['mainsnak']
	            if mainsnak.get('datatype') != 'wikibase-item':
	                continue
	            found = [mainsnak['property']]
	            # A snak without a concrete value has no datavalue to follow.
	            if 'datavalue' in mainsnak:
	                value = mainsnak['datavalue']['value']
	                # Prefer a ready-made 'id'; otherwise build it from 'numeric-id'.
	                found.append(value.get('id') or 'Q' + str(value['numeric-id']))
	            for entity_id in found:
	                if entity_id not in seen:
	                    seen.add(entity_id)
	                    linked.append(entity_id)
	    return linked

	def other_language_labels(blob, language_codes):
	    """Look up an entity's label in each requested language.

	    blob           -- EntityData dictionary: either a single entity record
	                      (with a "labels" key) or a raw {"entities": {...}}
	                      wrapper around one.
	    language_codes -- list of ISO language codes (e.g. ['en', 'de']).

	    Returns a dictionary of language code -> label, with None for every
	    requested language the entity has no label in.
	    """
	    if 'entities' in blob:
	        # Unwrap the raw response shape so both input forms are accepted.
	        blob = next(iter(blob['entities'].values()), {})
	    # "or {}" also covers an empty labels collection serialized as a list.
	    labels = blob.get('labels') or {}
	    return {code: labels[code]['value'] if code in labels else None
	            for code in language_codes}

	def gap_analysis(manifest):
	    """Compute label coverage per language.

	    manifest -- dictionary of dictionaries {item -> {language: label}}; a
	                label of None (or an empty string) counts as missing.

	    Returns a dictionary of language -> percent covered, as a float between
	    0.0 and 100.0.  An item whose inner dictionary lacks a language counts as
	    uncovered for that language.  An empty manifest yields an empty
	    dictionary.
	    """
	    total = len(manifest)
	    if total == 0:
	        # Nothing to measure; also avoids dividing by zero below.
	        return {}
	    covered = {}
	    for labels in manifest.values():
	        for language, label in labels.items():
	            covered.setdefault(language, 0)
	            if label:
	                covered[language] += 1
	    return {language: 100.0 * count / total
	            for language, count in covered.items()}

	def web_page_generator(manifest):
	    """Create the web pages that present the label manifest.

	    manifest -- dictionary of dictionaries {item -> {language: label}}.

	    Intended to return nothing and create two web pages.  The pages' content
	    and destination are not specified yet, so this raises NotImplementedError
	    rather than silently doing nothing.
	    """
	    raise NotImplementedError("web_page_generator has not been written yet")

	def main(chemicals_and_exposures_query=None):
	    """Collect labels for every entity of interest and measure coverage.

	    chemicals_and_exposures_query -- URL-encoded SPARQL query that selects
	        the chemical/exposure items as ?item.  No default query exists yet.

	    Returns the dictionary produced by gap_analysis for the collected labels.

	    Raises ValueError when no query is supplied.
	    """
	    language_codes = ['en', 'es', 'zh', 'fr', 'de']

	    # TODO: put URL-encoded query here as the default value
	    if not chemicals_and_exposures_query:
	        raise ValueError("No URL-encoded SPARQL query for chemical/exposure items was supplied")

	    print("Querying for list of chemical/exposure items...")
	    chemicals_and_exposures = wdqs(chemicals_and_exposures_query)

	    master_list = {}
	    for item in chemicals_and_exposures:
	        print("Processing chemical/exposure item: " + item)
	        blob = entitydata(item)
	        # Store the whole {language: label} dictionary, as gap_analysis expects;
	        # looping over it would only keep the last language code.
	        master_list[item] = other_language_labels(blob, language_codes)
	        for link in linked_on_page(blob):
	            if link in master_list:
	                continue  # already fetched; skip the duplicate request
	            print("Processing linked entity: " + link)
	            master_list[link] = other_language_labels(entitydata(link), language_codes)

	    # Dictionary keys are already unique, so no de-duplication pass is needed
	    # (converting to a set would throw the labels away).
	    return gap_analysis(master_list)