Skip to content

Instantly share code, notes, and snippets.

@putnik
Last active January 9, 2022 10:38
Show Gist options
  • Save putnik/7dcb92c0168392ec8b1c718093e36d1a to your computer and use it in GitHub Desktop.
Save putnik/7dcb92c0168392ec8b1c718093e36d1a to your computer and use it in GitHub Desktop.
Generate a sorted list of awards from Wikidata
# Copyright 2018-2022 Sergey Leschina <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import deque
from requests import post
from SPARQLWrapper import SPARQLWrapper, JSON, POSTDIRECTLY
from urllib.parse import quote
def append_value(graph, key, value):
    """Add the directed edge ``key -> value`` to an adjacency-list graph.

    ``graph`` maps a node to the list of nodes it points to and is updated
    in place. An edge that is already present is not added again, and a
    call where either endpoint is ``None`` is ignored.

    Raises:
        ValueError: if either endpoint is ``'Q19798647'``. Judging by the
            message text, this id stands for a "No value" item that was
            entered where a proper novalue marker was expected; the other
            endpoint is included in the message.
    """
    if key == 'Q19798647':
        raise ValueError("'No value' instead of novalue: %s" % value)
    if value == 'Q19798647':
        raise ValueError("'No value' instead of novalue: %s" % key)
    if key is None or value is None:
        return
    # A plain membership test replaces the former index()/except-Exception
    # probe, which would also have hidden unrelated errors.
    targets = graph.setdefault(key, [])
    if value not in targets:
        targets.append(value)
def connected_components(graph):
    """Split a directed adjacency-list graph into connected components.

    Connectivity ignores edge direction: a working copy of ``graph`` is
    extended with the reverse of every edge and then traversed. Each
    returned component is a dict that maps its member nodes to their
    original outgoing edge lists from ``graph`` (an empty list for nodes
    that only occur as edge targets).

    The member order inside a component follows ``set.pop()`` and is
    therefore arbitrary.
    """
    # Undirected view of the graph: every edge also stored backwards.
    undirected = copy.deepcopy(graph)
    for source, targets in graph.items():
        for target in targets:
            append_value(undirected, target, source)

    visited = set()

    def collect(start):
        """Return every node reachable from ``start`` in the undirected view."""
        pending = {start}
        members = []
        while pending:
            node = pending.pop()
            visited.add(node)
            pending.update(set(undirected.get(node, ())) - visited)
            members.append(node)
        return members

    result = []
    for node in undirected:
        if node in visited:
            continue
        members = collect(node)
        result.append({member: graph.get(member, []) for member in members})
    return result
def error_message(v, path):
    """Build the human-readable report for a cycle found in the graph.

    A graph-view SPARQL query covering ``v`` and every node in ``path`` is
    encoded into a ``query.wikidata.org`` URL. The URL is then shortened
    through the ``shortenurl`` API on meta.wikimedia.org. Shortening is
    best-effort: on any failure the reason is printed and the long URL is
    used instead, so the cycle report itself is never lost.

    Args:
        v: the node at which the cycle was detected.
        path: the other nodes on the cycle, in traversal order; may be
            empty when ``v`` links to itself.

    Returns:
        A two-line string naming the cycle and linking to the graph query.
    """
    # Only `post` is imported at file level; the exception base class is
    # needed here to tell request failures apart from programming errors.
    from requests import RequestException

    # Render each node exactly once so an empty path does not leave a
    # dangling "wd:" token in the VALUES clauses.
    values = " ".join("wd:%s" % node for node in [v, *path])
    query = '''
#defaultView:Graph
SELECT ?item ?itemLabel ?linkTo ?linkToLabel WHERE {
{
VALUES ?item { %s }.
?item p:P3730 ?pHigher .
?pHigher ps:P3730 ?linkTo .
FILTER NOT EXISTS { ?pHigher pq:P582 ?end . }
} UNION {
VALUES ?linkTo { %s }.
?linkTo p:P3729 ?pLower .
?pLower ps:P3729 ?item .
FILTER NOT EXISTS { ?pLower pq:P582 ?end . }
}
SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
}
}
'''.strip() % (values, values)
    sparql_url = 'https://query.wikidata.org/#%s' % quote(query)
    api_url = 'https://meta.wikimedia.org/w/api.php?action=shortenurl&format=json'

    url = sparql_url  # fallback whenever shortening does not succeed
    try:
        json_response = post(api_url, {'url': sparql_url}, timeout=30).json()
    except (RequestException, ValueError) as err:
        # Network failure, timeout, or a reply that is not valid JSON.
        print("URL Shortener error: %s" % (err,))
    else:
        try:
            url = json_response['shortenurl']['shorturl']
        except (KeyError, TypeError):
            # Prefer the API's own error text; fall back to the raw reply
            # when it does not have the expected error structure either.
            try:
                reason = json_response['error']['info']
            except (KeyError, TypeError):
                reason = json_response
            print("URL Shortener error: %s" % (reason,))
    return "Cycle (%s): %s\nGraph: %s" % (v, " -> ".join(path), url)
def topological_sort(graph):
    """Return the nodes of ``graph`` in depth-first post-order.

    ``graph`` maps each node to a list of the nodes it points to. A node is
    emitted only after every node it points to, so edge targets always
    precede their sources in the result. Nodes that appear only as targets
    are included as well.

    The traversal keeps an explicit stack instead of recursing, so long
    chains cannot exceed the interpreter's recursion limit.

    Returns:
        A ``deque`` of nodes.

    Raises:
        ValueError: if the graph contains a cycle; the message is produced
            by ``error_message``.
    """
    VISITED, FINISHED = 0, 1
    order = deque()
    state = {}

    for root in graph:
        if root in state:
            continue
        state[root] = VISITED
        stack = [root]  # nodes on the current DFS path
        # One live edge iterator per stack entry, so each node resumes
        # where it left off after a child has been fully explored.
        pending = [iter(graph.get(root, ()))]
        while stack:
            v = stack[-1]
            for u in pending[-1]:
                if state.get(u) == VISITED:
                    # u is still on the current path: v -> u closes a cycle.
                    # v itself is passed separately, so leave it out.
                    path = stack[stack.index(u):-1]
                    raise ValueError(error_message(v, path))
                if u not in state:
                    state[u] = VISITED
                    stack.append(u)
                    pending.append(iter(graph.get(u, ())))
                    break
            else:
                # Every outgoing edge of v has been handled.
                state[v] = FINISHED
                order.append(v)
                stack.pop()
                pending.pop()
    return order
def raw_value(value):
    """Return the ``"value"`` field of a result binding without the entity URL.

    ``None`` is passed through unchanged. Otherwise every occurrence of the
    Wikidata entity URL prefix is removed from ``value["value"]``.
    """
    entity_prefix = "http://www.wikidata.org/entity/"
    return None if value is None else value["value"].replace(entity_prefix, "")
def load_data():
    """Query the Wikidata SPARQL endpoint and return one dict per result row.

    The query selects items whose P31/P279* chain reaches Q618779 or
    Q38033430 and that carry a P3729 (bound to ``?lower``) or P3730 (bound
    to ``?higher``) statement without a P582 qualifier.

    Returns:
        A list of dicts with the keys ``"item"``, ``"lower"`` and
        ``"higher"``. Values are passed through ``raw_value``; ``"lower"``
        and ``"higher"`` are ``None`` when the row has no such binding.
    """
    query = """
SELECT DISTINCT ?item ?lower ?higher ?country
WHERE {
{ ?item wdt:P31/wdt:P279* wd:Q618779 }
UNION
{ ?item wdt:P31/wdt:P279* wd:Q38033430 }
.
{
?item p:P3729 ?pLower .
?pLower ps:P3729 ?lower .
FILTER NOT EXISTS { ?pLower pq:P582 ?end . }
} UNION {
?item p:P3730 ?pHigher .
?pHigher ps:P3730 ?higher .
FILTER NOT EXISTS { ?pHigher pq:P582 ?end . }
}
.
OPTIONAL { ?item wdt:P17 ?country . }
}
ORDER BY ?country
"""
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(query)
    sparql.setRequestMethod(POSTDIRECTLY)
    sparql.setReturnFormat(JSON)

    bindings = sparql.query().convert()["results"]["bindings"]
    return [
        {
            "item": raw_value(binding['item']),
            "lower": raw_value(binding.get('lower')),
            "higher": raw_value(binding.get('higher')),
        }
        for binding in bindings
    ]
def construct_graph():
    """Build the adjacency-list graph from the rows returned by ``load_data``.

    Each row contributes up to two edges, ``lower -> item`` and
    ``item -> higher``, so every edge points from a lower entry to a higher
    one. ``append_value`` skips an edge whose endpoint is ``None``.
    """
    edges = {}
    for entry in load_data():
        item = entry["item"]
        append_value(edges, entry["lower"], item)
        append_value(edges, item, entry["higher"])
    return edges
# Script body: load the graph, sort every connected component and print the
# combined order as a tab-indented `"data": [...]` fragment in which each id
# sits in its own one-element array. A ValueError (e.g. a reported cycle) is
# printed instead of the list.
try:
    graph = construct_graph()
    components = connected_components(graph)
    sorted_list = []
    for component in components:
        sorted_list.extend(topological_sort(component))
    prefix = '\t"data": [\n\t\t[\n\t\t\t"'
    body = '"\n\t\t],\n\t\t[\n\t\t\t"'.join(sorted_list)
    postfix = '"\n\t\t]\n\t]'
    print("".join((prefix, body, postfix)))
except ValueError as error:
    print(error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment