-
-
Save tommyettinger/8192768 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns opal.dbpedia | |
(:use [clojure.tools.logging :only [log]]) | |
(:require [clojure.java.io :as io]) | |
(:import [uk.ac.manchester.cs.owl.owlapi.turtle.parser TurtleParser] | |
[org.neo4j.graphdb Label] | |
[org.neo4j.index.lucene.unsafe.batchinsert LuceneBatchInserterIndexProvider] | |
[org.neo4j.unsafe.batchinsert BatchInserters] | |
[org.neo4j.graphdb DynamicRelationshipType])) | |
;; PARSING METHODS | |
(defn get-next-tuple
  "Reads tokens from `parser` (via `.getNextToken`) until a terminator token
  is seen, and returns them as a vector of strings, terminator included.

  A terminator is either \".\" (end of a statement) or \"\" (which, per the
  original author's note, is what `.getNextToken` yields once the parser is
  out of data).

  Returns nil when the very first token read is \"\", i.e. there was nothing
  left to read. If the input runs out part-way through a statement, the
  partial vector (ending in \"\") is returned."
  [parser]
  ;; A persistent vector threaded through loop/recur replaces the previous
  ;; transient whose `conj!` result was thrown away; transients must always
  ;; be used through the value their mutating operations return.
  (loop [tuple []]
    (let [token (-> parser
                    (.getNextToken)
                    (.toString))
          tuple (conj tuple token)]
      (if (or (= "." token)
              (= "" token))
        ;; An empty first token means we hit end-of-data before reading
        ;; anything useful, so signal \"no more tuples\" with nil.
        (when-not (empty? (first tuple))
          tuple)
        (recur tuple)))))
(defn seq-of-parser
  "Returns a sequence of the tuples produced by repeatedly calling
  `get-next-tuple` on `parser`.

  The first tuple is read immediately; every following tuple is read only
  when the sequence is advanced. Returns nil as soon as `get-next-tuple`
  returns nil."
  [parser]
  (when-let [head (get-next-tuple parser)]
    ;; Defer the recursive call so the parser is only advanced on demand.
    (cons head
          (lazy-seq (seq-of-parser parser)))))
(defn parse-file
  "Opens `filename` as an input stream, wraps it in a `TurtleParser`, and
  returns the sequence of tuples that `seq-of-parser` produces from it."
  [filename]
  ;; NOTE(review): the input stream opened here is never explicitly closed
  ;; by this function.
  (-> filename
      io/input-stream
      (TurtleParser.)
      seq-of-parser))
;; BATCH UPSERT METHODS | |
(defn -main
  "Batch-loads the given files into the graph store at `graph-path`.

  Every tuple yielded by `parse-file` is read as
  [resource-1 label resource-2 & rest]. A node carrying a \"resource\"
  property is created for each distinct resource string, and a relationship
  named `label` is created from the first resource's node to the second's.

  Progress is logged at :debug level every 10000 tuples per file. The batch
  inserter is always shut down, even if loading fails part-way through."
  [graph-path & files]
  (let [inserter (BatchInserters/inserter graph-path)
        ;; resource string -> node id. Held as a persistent map in an atom:
        ;; the previous transient map was mutated with `assoc!` while ignoring
        ;; its return value, which loses entries once the transient grows
        ;; past its small array-backed representation.
        id-map (atom {})
        insert-resource-node! (fn [inserter res]
                                (if-let [id (get @id-map res)]
                                  ;; If the resource has already been added, just return its id.
                                  id
                                  ;; Otherwise, create a node for the resource and remember its id for later.
                                  (let [id (.createNode inserter
                                                        {"resource" res}
                                                        (make-array org.neo4j.graphdb.Label 0))]
                                    (swap! id-map assoc res id)
                                    id)))
        connect-resource-nodes! (fn [inserter node1 node2 label]
                                  (let [relationship (DynamicRelationshipType/withName label)]
                                    ;; nil: the relationship carries no properties.
                                    (.createRelationship inserter node1 node2 relationship nil)))
        insert-tuple! (fn [inserter tuple]
                        ;; Get the resource and label names out of the tuple.
                        (let [[resource-1 label resource-2 & _] tuple
                              ;; Upsert the resource nodes.
                              node-1 (insert-resource-node! inserter resource-1)
                              node-2 (insert-resource-node! inserter resource-2)]
                          ;; Connect the nodes with an edge.
                          (connect-resource-nodes! inserter node-1 node-2 label)))]
    (try
      (doseq [file files]
        (log :debug (str "Loading file: " file))
        ;; `idx` is the number of tuples already inserted from this file.
        (doseq [[idx tuple] (map-indexed vector (parse-file file))]
          (when (zero? (mod idx 10000))
            (log :debug (str file ": " idx)))
          (insert-tuple! inserter tuple)))
      (log :debug "Loading complete.")
      (finally
        ;; Shut down unconditionally so the store is closed even when an
        ;; insert throws.
        (log :debug "Shutting down.")
        (.shutdown inserter)
        (log :debug "Shutdown complete!")))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment