Skip to content

Instantly share code, notes, and snippets.

@gicmo
Last active March 10, 2017 03:48
Show Gist options
  • Save gicmo/bc3cbe4a8b9e32448d9f to your computer and use it in GitHub Desktop.
Save gicmo/bc3cbe4a8b9e32448d9f to your computer and use it in GitHub Desktop.
Textfile 2 RDF
#!/usr/bin/env python
# Generate the list of file URLs with: find `pwd` -type f | sed -e "s/^/file:\/\/`hostname`/" > list.txt
from __future__ import print_function
import argparse
from urlparse import urlparse
import rdflib
import sys
import mimetypes
# Initialise the mimetypes tables once, before convert_line() calls guess_type().
mimetypes.init()
# Short aliases for the rdflib names used by the functions below.
rdf = rdflib.RDF
uri = rdflib.URIRef
# Ontology namespaces used as the vocabulary for the generated triples
# (the nao, nie and nfo ontologies, as named in their URLs).
nao = rdflib.Namespace("http://www.semanticdesktop.org/ontologies/2007/08/15/nao#")
nie = rdflib.Namespace("http://www.semanticdesktop.org/ontologies/2007/01/19/nie#")
nfo = rdflib.Namespace("http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#")
# Module-wide cache of tags already declared in the graph: label -> tag URI.
# convert_line() passes it to tag(), which fills it in.
all_tags = {}
def parse_prefix(prefix):
    """Split a ``name:namespace`` specification at its first colon.

    Only the first colon separates the two parts, so the namespace may itself
    contain colons (e.g. ``"lkf:file://localhost#"``).

    Args:
        prefix: String of the form ``name:namespace``.

    Returns:
        A ``(name, namespace)`` tuple.

    Raises:
        ValueError: If ``prefix`` contains no colon. Previously such input was
            silently split at the wrong position.
    """
    name, separator, namespace = prefix.partition(":")
    if not separator:
        raise ValueError(
            "prefix %r must have the form 'name:namespace'" % (prefix,))
    return name, namespace
def setup_graph(prefixes=None):
    """Create the output graph and register the namespace prefixes on it.

    The graph is created with the identifier ``"files"``. The ``nao``, ``nie``
    and ``nfo`` prefixes are always bound; any extra ``prefixes`` are bound
    after them.

    Args:
        prefixes: Optional iterable of ``(name, namespace)`` pairs to bind in
            addition to the built-in ones. ``None`` (the default) binds
            nothing extra.

    Returns:
        The newly created ``rdflib.Graph``.
    """
    g = rdflib.Graph(identifier="files")
    g.bind("nao", str(nao))
    g.bind("nie", str(nie))
    g.bind("nfo", str(nfo))
    # ``prefixes`` defaults to None; fall back to an empty tuple so the
    # default call no longer fails when iterating.
    for entry in prefixes or ():
        g.bind(entry[0], entry[1])
    return g
def tag(label, url, known_tags, g, prefix):
    """Attach the tag named ``label`` to ``url`` and return the tag's URI.

    The tag URI is built as ``"<prefix>:<label>"``. The first time a label is
    seen (i.e. it is not a key of ``known_tags``) the tag is declared in the
    graph with its type and preferred label, a notice is printed to stderr,
    and the label is recorded in ``known_tags``. An ``isTagFor`` triple
    linking the tag to ``url`` is added on every call.

    Args:
        label: Text of the tag.
        url: Graph node the tag applies to.
        known_tags: Dict mapping labels to tag URIs; updated in place.
        g: Graph receiving the triples.
        prefix: Prefix name placed before the label in the tag URI.

    Returns:
        The URI reference of the tag.
    """
    tag_ref = uri("%s:%s" % (prefix, label))
    already_declared = label in known_tags
    if not already_declared:
        print("new Tag %s" % str(tag_ref), file=sys.stderr)
        declaration = (
            (tag_ref, rdf.type, nao['Tag']),
            (tag_ref, nao['prefLabel'], rdflib.Literal(label)),
        )
        for triple in declaration:
            g.add(triple)
        known_tags[label] = tag_ref
    g.add((tag_ref, nao['isTagFor'], url))
    return tag_ref
def convert_line(g, line, tag_prefix):
    """Add the triples describing one file URL to the graph.

    The last non-empty segment of the URL path is used as the file name, and
    every preceding segment becomes a tag on the file (via ``tag``, using the
    module-level ``all_tags`` cache). A MIME type triple is added only when
    ``mimetypes.guess_type`` can guess one from ``line``.

    Lines whose URL has no path segments (for example blank lines) are
    skipped; previously they raised ``IndexError``.

    Args:
        g: Graph receiving the triples.
        line: One file URL, without its trailing newline.
        tag_prefix: Prefix name used when building tag URIs.
    """
    parsed = urlparse(line)
    # Build a real list: the segments are indexed and sliced below, which
    # does not work when ``filter`` returns an iterator.
    components = [part for part in parsed.path.split('/') if part]
    if not components:
        # No path segments means there is no file name to describe.
        return
    filename = components[-1]
    u = uri(line)
    g.add((u, rdf.type, nfo['RemoteDataObject']))
    for comp in components[:-1]:
        g.add((u, nao['hasTag'], tag(comp, u, all_tags, g, tag_prefix)))
    mtype, _ = mimetypes.guess_type(line)
    if mtype is not None:
        g.add((u, nie['mimeType'], rdflib.Literal(mtype)))
    g.add((u, nfo['fileName'], rdflib.Literal(filename)))
    g.add((u, nfo['fileUrl'], u))
def main(args):
    """Convert the URL list named by ``args`` to RDF and write it to stdout.

    Reads ``args.textfile`` line by line, converts each line with
    ``convert_line`` and serialises the resulting graph to ``sys.stdout`` in
    ``args.format``. The chosen format is also reported on stderr.

    Args:
        args: Parsed command-line arguments providing ``textfile``,
            ``format``, ``max_lines``, ``prefix`` and ``tag_prefix``.
    """
    prefixes = [parse_prefix(spec) for spec in (args.tag_prefix, args.prefix)]
    g = setup_graph(prefixes)
    # The tag prefix is parsed first, so its name is the first element of
    # the first pair.
    tag_prefix_name = prefixes[0][0]
    maxl = args.max_lines
    with open(args.textfile, mode='r') as fd:
        for i, line in enumerate(fd):
            # Stop reading as soon as the limit is reached instead of
            # scanning the rest of the file.
            if maxl is not None and i >= maxl:
                break
            convert_line(g, line.strip('\n'), tag_prefix_name)
    print("Format: %s" % args.format, file=sys.stderr)
    g.serialize(sys.stdout, format=args.format)
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser(description='text 2 rdf converter')
    parser.add_argument('textfile',
                        help='text file with one file URL per line')
    parser.add_argument('--format', default='turtle',
                        help='serialization format passed to the graph')
    # The short and long spellings must be separate option strings. Written
    # as the single string '-N,--max-lines', the long form was never
    # recognised.
    parser.add_argument('-N', '--max-lines', dest='max_lines', type=int,
                        default=None,
                        help='only convert the first N lines of the file')
    parser.add_argument('--prefix', default="lkf:file://localhost#",
                        help='extra "name:namespace" prefix bound in the graph')
    parser.add_argument('--tag-prefix', default="lkt:file://localhost/tags",
                        help='"name:namespace" prefix bound in the graph; '
                             'its name is used to build tag URIs')
    arguments = parser.parse_args()
    main(arguments)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment