Skip to content

Instantly share code, notes, and snippets.

@tommyettinger
Forked from nlacasse/dbpedia.clj
Last active January 1, 2016 19:49
Show Gist options
  • Save tommyettinger/8192768 to your computer and use it in GitHub Desktop.
Save tommyettinger/8192768 to your computer and use it in GitHub Desktop.
(ns opal.dbpedia
(:use [clojure.tools.logging :only [log]])
(:require [clojure.java.io :as io])
(:import [uk.ac.manchester.cs.owl.owlapi.turtle.parser TurtleParser]
[org.neo4j.graphdb Label]
[org.neo4j.index.lucene.unsafe.batchinsert LuceneBatchInserterIndexProvider]
[org.neo4j.unsafe.batchinsert BatchInserters]
[org.neo4j.graphdb DynamicRelationshipType]))
;; PARSING METHODS
(defn get-next-tuple
  "Reads tokens from `parser` until a terminator token is seen and returns
  them as a vector.

  Each token is obtained with `(.toString (.getNextToken parser))`. Reading
  stops after the first token equal to \".\" or \"\"; that terminator is kept
  as the last element of the returned vector.

  Returns nil when the very first token read is \"\" (per the original
  author's note, .getNextToken yields \"\" once the parser is out of data)."
  [parser]
  ;; Accumulate into a persistent vector via loop/recur. The previous version
  ;; called conj! on a transient and ignored the return value, which the
  ;; transient contract does not allow.
  (loop [tuple []]
    (let [token (-> parser
                    (.getNextToken)
                    (.toString))
          tuple (conj tuple token)]
      (if (or (= "." token)
              (= "" token))
        ;; An empty first token means there was nothing left to read.
        (when-not (empty? (first tuple))
          tuple)
        (recur tuple)))))
(defn seq-of-parser
  "Returns a sequence of the tuples produced by repeatedly calling
  get-next-tuple on `parser`, or nil when the first call yields nil.

  The first tuple is read immediately; each following tuple is read only
  when the rest of the sequence is realized."
  [parser]
  (when-let [head (get-next-tuple parser)]
    (cons head
          (lazy-seq (seq-of-parser parser)))))
(defn parse-file
  "Opens `filename` as an input stream, wraps it in a TurtleParser and returns
  the tuple sequence produced by seq-of-parser.

  NOTE(review): the input stream is never closed here; because the returned
  sequence is consumed lazily it cannot simply be wrapped in with-open."
  [filename]
  (-> filename
      io/input-stream
      (TurtleParser.)
      seq-of-parser))
;; BATCH UPSERT METHODS
(defn -main
  "Loads every tuple from each of `files` into the Neo4j batch store at
  `graph-path`.

  For each tuple, the first and third elements are upserted as nodes carrying
  a \"resource\" property, and a relationship named after the second element
  is created from the first node to the second. Progress is logged at :debug
  level every 10000 tuples. The inserter is always shut down, even when
  loading fails part-way through."
  [graph-path & files]
  (let [inserter (BatchInserters/inserter graph-path)
        ;; resource string -> node id. A mutable java.util.HashMap replaces the
        ;; previous transient map, which was mutated with assoc! while the
        ;; return value was discarded; once a transient map grows, entries
        ;; added that way are no longer visible through the original
        ;; reference, so resources were looked up as missing and re-created.
        id-map (java.util.HashMap.)
        insert-resource-node! (fn
                                [inserter res]
                                (if-let [id (.get id-map res)]
                                  ; If the resource has already been added, just return the id.
                                  id
                                  ; Otherwise, add a node for the resource, and remember its id for later.
                                  (let [id (.createNode inserter
                                                        {"resource" res}
                                                        (make-array org.neo4j.graphdb.Label 0))]
                                    (.put id-map res id)
                                    id)))
        connect-resource-nodes! (fn
                                  [inserter node1 node2 label]
                                  (let [relationship (DynamicRelationshipType/withName label)]
                                    (.createRelationship inserter node1 node2 relationship nil)))
        insert-tuple! (fn
                        [inserter tuple]
                        ; Get the resource and label names out of the tuple.
                        (let [[resource-1 label resource-2 & _] tuple
                              ; Upsert the resource nodes.
                              node-1 (insert-resource-node! inserter resource-1)
                              node-2 (insert-resource-node! inserter resource-2)]
                          ; Connect the nodes with an edge.
                          (connect-resource-nodes! inserter node-1 node-2 label)))]
    (try
      (doseq [file files]
        (log :debug (str "Loading file: " file))
        (let [c (atom 0)]
          (doseq [tuple (parse-file file)]
            ; Report progress before inserting tuple number 0, 10000, 20000, ...
            (when (zero? (mod @c 10000))
              (log :debug (str file ": " @c)))
            (swap! c inc)
            (insert-tuple! inserter tuple))))
      (log :debug "Loading complete.")
      (finally
        ; Shut the inserter down on every exit path, including failures.
        (log :debug "Shutting down.")
        (.shutdown inserter)
        (log :debug "Shutdown complete!")))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment