Script that saves some of the pages (subjects) connected to a dbpedia.org resource as a tree in a JSON file.
python resource_tree.py --limit=100 --file_path=data.json Internet
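The command above queries DBpedia for pages related to the Internet resource and writes the resulting tree to data.json. Based on Node.to_json in the script, each node carries an id (the resource URI), a name (its rdfs:label), and a children list (null for leaves). The values below are only illustrative; the actual content depends on live DBpedia data:

{"id": "<http://dbpedia.org/resource/Internet>", "name": "Internet", "children": [{"id": "<http://dbpedia.org/resource/World_Wide_Web>", "name": "World Wide Web", "children": null}]}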
click==5.1
SPARQLWrapper==1.6.4
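Assuming the two pinned dependencies above are saved as a requirements file (the filename is not shown here; requirements.txt is the usual choice), they can be installed before running the script with:

pip install -r requirements.txt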
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import namedtuple
import json
import random
import sys

import click

# lightweight record for a related resource: its URI and human-readable label
Obj = namedtuple('Obj', 'id,label')
class Node(object):
    def __init__(self, id, label):
        self.id = id
        self.label = label
        self.children = []

    def to_json(self):
        return {
            'id': self.id,
            'name': self.label,
            'children': [c.to_json() for c in self.children] if self.children else None
        }
class TreeFetch(object):
    def __init__(self, counter_limit=50, depth=5):
        self.visited = {}
        self.counter = 0
        self.COUNTER_LIMIT = counter_limit
        self.depth_limit = depth

    def get_related(self, subject):
        # fetch up to 15 resources linked to or from the subject through a small
        # set of "related" predicates, together with their English labels
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX dbr: <http://dbpedia.org/resource/>

            SELECT * {
                {
                    SELECT (?obj as ?name) ?label
                    WHERE {
                        %s ?predicate ?obj.
                        FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject)) .
                        ?obj rdfs:label ?label.
                        FILTER( lang(?label) = "en" || lang(?label) = "" ) .
                    }
                }
                UNION
                {
                    SELECT (?subject as ?name) ?label
                    WHERE {
                        ?subject ?predicate %s.
                        FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject)) .
                        ?subject rdfs:label ?label.
                        FILTER( lang(?label) = "en" || lang(?label) = "" ) .
                    }
                }
            }
            LIMIT 15
        """ % (subject, subject))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        # import ipdb; ipdb.set_trace()
        # print('collect_all, subject={}, {} results'.format(subject, len(results["results"]["bindings"])))
        values = [Obj(r['name']['value'], r['label']['value']) for r in results["results"]["bindings"]]
        return values
    def select_depth(self, depth_limit):
        # randomly select a max depth for each branch, so the graph becomes more interesting
        if not depth_limit:
            depth_limit = random.randrange(self.depth_limit // 2, self.depth_limit) + 1
            print('selected depth_limit={}'.format(depth_limit))
        return depth_limit
    def collect_all(self, subject, label, depth=1, depth_limit=None):
        if subject in self.visited:
            return
        max_depth = depth_limit or self.depth_limit
        if depth >= max_depth:
            return
        # self.counter += 1
        # if self.counter > self.COUNTER_LIMIT:
        #     return
        self.visited[subject] = True
        node = Node(subject, label)
        related = self.get_related(subject)
        for item in related:
            if depth + 1 >= max_depth:
                break
            branch_depth = self.select_depth(depth_limit)
            item_children = self.collect_all('<{}>'.format(item.id), item.label, depth + 1, branch_depth)
            if item_children:
                node.children.append(item_children)
            self.counter += 1
            if self.counter > self.COUNTER_LIMIT:
                break
        print('collect_all={}, depth={}, counter={}'.format(subject, depth, self.counter))
        return node
@click.command()
@click.option('--limit', default=100, help='Number of nodes to generate')
@click.option('--max-depth', default=5, help='Max depth of the tree')
@click.option('--file_path', default=None, help='File to save to.')
@click.argument('resource')
def main(limit, max_depth, file_path, resource):
    # seed the RNG with the resource name so repeated runs produce the same tree
    random.seed(resource)
    fetch = TreeFetch(counter_limit=limit, depth=max_depth)
    res = fetch.collect_all("<http://dbpedia.org/resource/{}>".format(resource), resource)
    f = open(file_path, 'w') if file_path else sys.stdout
    f.write(json.dumps(res.to_json()))
    if file_path:
        f.close()
    # print(res.to_json())
    # print('done')


if __name__ == "__main__":
    main()