@jprante
Last active August 29, 2015 14:17
Crawling dewey.info
#!/usr/bin/env python
"""
cralws triples from dewey.info and writes them to RDF N3 file
"""
import rdflib
def crawl(uri, file):
    # load the RDF graph behind the URI and queue every URI we have not seen yet
    g = rdflib.ConjunctiveGraph()
    g.load(uri)
    for s in g.subjects():
        if s not in seen:
            seen.add(s)
            uris.append(s)
    for o in g.objects():
        # only follow object URIs, skip literals
        if isinstance(o, rdflib.URIRef) and o not in seen:
            seen.add(o)
            uris.append(o)
    # append the crawled triples to the output file in N3 notation
    file.write(g.serialize(format='n3'))
seen = set()
# start with the ten top-level DDC classes 0-9
uris = ['http://dewey.info/class/%s/' % n for n in range(0, 10)]
file = open("dewey.n3", "a")
while len(uris) > 0:
    uri = uris.pop(0)
    # stay inside dewey.info
    if 'http://dewey.info' not in uri:
        print "skipping: %s" % uri
        continue
    try:
        print "crawling: %s" % uri
        crawl(uri, file)
    except KeyboardInterrupt:
        break
    except Exception, e:
        print e
# missing in first version
file.flush()
file.close()
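
As a quick sanity check (a sketch, not part of the original gist), the resulting dewey.n3 file can be loaded back into an rdflib graph to confirm it parses and to count the collected triples:

import rdflib

# parse the crawled N3 file (assumes dewey.n3 is in the working directory)
g = rdflib.Graph()
g.parse("dewey.n3", format="n3")
print "parsed %d triples" % len(g)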