Skip to content

Instantly share code, notes, and snippets.

@nlacasse
Created January 24, 2013 20:35
Show Gist options
  • Save nlacasse/4627414 to your computer and use it in GitHub Desktop.
Save nlacasse/4627414 to your computer and use it in GitHub Desktop.
dbpedia batch inserter
(ns opal.dbpedia
(:use [clojure.tools.logging :only [log]])
(:require [clojure.java.io :as io])
(:import [uk.ac.manchester.cs.owl.owlapi.turtle.parser TurtleParser]
[org.neo4j.unsafe.batchinsert BatchInserters
LuceneBatchInserterIndexProvider]
[org.neo4j.graphdb DynamicRelationshipType]))
;; PARSING METHODS
(defn get-next-tuple
  "Reads tokens from `parser` until a terminator token is seen and returns
  them as a vector, terminator included.

  A terminator is either \".\" (end of statement) or \"\" (what
  `.getNextToken` yields once the input is exhausted). Returns nil when the
  very first token read is empty, i.e. there is no more data."
  [parser]
  (loop [tokens []]
    (let [token  (.toString (.getNextToken parser))
          tokens (conj tokens token)]
      (if (or (= "." token) (= "" token))
        ;; An empty first token means we hit end-of-data before reading anything.
        (when (seq (first tokens))
          tokens)
        (recur tokens)))))
(defn seq-of-parser
  "Returns a sequence of the tuples produced by repeatedly calling
  `get-next-tuple` on `parser`, or nil when the parser has no tuples.

  The first tuple is read immediately; every following tuple is read only
  when the sequence is advanced."
  [parser]
  (when-let [tuple (get-next-tuple parser)]
    (cons tuple
          (lazy-seq (seq-of-parser parser)))))
(defn parse-file
  "Opens `filename`, feeds it to a TurtleParser and returns the sequence of
  token tuples produced by `seq-of-parser`.

  The underlying input stream is closed automatically once the returned
  sequence has been consumed to its end. If the caller abandons the sequence
  early, the stream stays open until it is garbage collected."
  [filename]
  (let [^java.io.Closeable stream (io/input-stream filename)
        parser (TurtleParser. stream)]
    (concat (seq-of-parser parser)
            ;; Reached only after the last tuple has been handed out:
            ;; release the file handle and contribute no further elements.
            (lazy-seq
              (.close stream)
              nil))))
;; BATCH UPSERT METHODS
(def id-map
  "Atom holding a persistent map from resource name to the id of the node
  that was created for it."
  ;; A persistent map is used on purpose: transients must not be shared
  ;; through an atom, and functions passed to swap! have to be free of side
  ;; effects because swap! may retry them.
  (atom {}))
(defn upsert-resource-node!
  "Returns the node id for resource `res`, creating the node through
  `inserter` the first time the resource is seen.

  `inserter` must respond to `.createNode` with a property map.
  `idx` is accepted for signature compatibility but is not used here.
  Newly created ids are remembered in `id-map`."
  [inserter idx res]
  (if-let [id (get @id-map res)]
    ; If the resource has already been added, just return the id.
    id
    ; Otherwise, create the node for the resource and remember its id for later.
    (let [id (.createNode inserter {"resource" res})]
      (swap! id-map assoc res id)
      id)))
(defn upconnect-resource-nodes!
  "Creates a relationship from `node1` to `node2` through `inserter`.

  The relationship type is a DynamicRelationshipType named `label`; no
  properties are attached (nil is passed as the property map)."
  [inserter node1 node2 label]
  (.createRelationship inserter
                       node1
                       node2
                       (DynamicRelationshipType/withName label)
                       nil))
(defn upsert-tuple!
  "Stores one parsed tuple: upserts a node for its first and third elements
  and links them with a relationship named after the second element.

  Any elements after the third are ignored."
  [inserter idx tuple]
  (let [[subject predicate object] tuple
        ;; Nodes are upserted subject first, then object.
        subject-id (upsert-resource-node! inserter idx subject)
        object-id  (upsert-resource-node! inserter idx object)]
    (upconnect-resource-nodes! inserter subject-id object-id predicate)))
(defn -main
  "Entry point: batch-loads every file in `files` into the store at
  `graph-path`.

  Each file is parsed with `parse-file` and every tuple is written with
  `upsert-tuple!`; progress is logged every 10000 tuples. The index provider
  and the batch inserter are always shut down, even when loading fails, and
  any exception is rethrown after the shutdown."
  [graph-path & files]
  (let [inserter (BatchInserters/inserter graph-path)]
    (try
      (let [idx-provider (LuceneBatchInserterIndexProvider. inserter)]
        (try
          (let [idx (.nodeIndex idx-provider "lookup-index" {"type" "exact"})]
            (doseq [file files]
              (log :debug (str "Loading file: " file))
              (doseq [[n tuple] (map-indexed vector (parse-file file))]
                ;; n is the number of tuples already processed for this file.
                (when (zero? (mod n 10000))
                  (log :debug (str file ": " n)))
                (upsert-tuple! inserter idx tuple)))
            (log :debug "Loading complete."))
          (finally
            (log :debug "Shutting down.")
            (.shutdown idx-provider))))
      (finally
        ;; The inserter is shut down last, and even if the index provider
        ;; could not be created or failed to shut down.
        (.shutdown inserter)
        (log :debug "Shutdown complete!")))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment