Created
January 24, 2013 20:35
-
-
Save nlacasse/4627414 to your computer and use it in GitHub Desktop.
dbpedia batch inserter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns opal.dbpedia | |
(:use [clojure.tools.logging :only [log]]) | |
(:require [clojure.java.io :as io]) | |
(:import [uk.ac.manchester.cs.owl.owlapi.turtle.parser TurtleParser] | |
[org.neo4j.unsafe.batchinsert BatchInserters | |
LuceneBatchInserterIndexProvider] | |
[org.neo4j.graphdb DynamicRelationshipType])) | |
;; PARSING METHODS
(defn get-next-tuple
  "Reads tokens from `parser` until a statement terminator is seen and
  returns them as a vector of strings.

  Tokens are pulled with `.getNextToken` and converted with `.toString`.
  Reading stops after the first token equal to \".\" (end of statement) or
  \"\" (the parser is out of data); that final token is included in the
  returned vector.

  Returns nil when the very first token read is empty, i.e. there was no
  more data. If the input ends in the middle of a statement, the partial
  vector (ending in \"\") is returned as-is."
  [parser]
  ;; Accumulate in a loop binding instead of mutable atoms: the state never
  ;; escapes this function.
  (loop [tokens []]
    (let [token  (.toString (.getNextToken parser))
          tokens (conj tokens token)]
      (if (or (= "." token) (= "" token))
        ;; .getNextToken returns "" once you are out of data
        (when-not (empty? (first tokens))
          tokens)
        (recur tokens)))))
(defn seq-of-parser
  "Returns a sequence of the tuples produced by repeatedly calling
  `get-next-tuple` on `parser`, or nil when the first call yields nothing.

  The first tuple is read immediately; every later tuple is read only when
  the sequence is walked that far."
  [parser]
  (when-let [tuple (get-next-tuple parser)]
    (cons tuple
          (lazy-seq (seq-of-parser parser)))))
(defn parse-file
  "Opens `filename`, wraps it in a TurtleParser and returns a lazy sequence
  of the tuples read from it (see `seq-of-parser`).

  The underlying input stream is closed once the sequence has been walked
  to its end, or immediately if the parser cannot be constructed. Nothing
  is read from the file until the sequence is first realized. A sequence
  that is abandoned before being fully consumed leaves the stream open."
  [filename]
  (let [stream (io/input-stream filename)
        parser (try
                 (TurtleParser. stream)
                 (catch Throwable t
                   ;; Do not leak the stream when construction fails.
                   (.close stream)
                   (throw t)))]
    ;; The second segment contributes no elements; it exists only so that
    ;; reaching the end of the tuples closes the stream.
    (lazy-cat (seq-of-parser parser)
              (do (.close stream) nil))))
;; BATCH UPSERT METHODS
;; Maps each resource string to the node id returned by the inserter when
;; its node was created. Kept as a persistent map: a transient must not be
;; stored in an atom, because swap! may re-run its function and transients
;; are mutated in place.
(def id-map (atom {}))

(defn upsert-resource-node!
  "Returns the node id for resource `res`, creating the node on first use.

  If `res` is already present in `id-map`, its recorded id is returned and
  nothing is created. Otherwise a node with the single property
  \"resource\" = `res` is created through `inserter`, the new id is
  recorded in `id-map`, and that id is returned.

  `idx` is accepted but not used by this function.
  NOTE(review): presumably the new node was meant to be added to `idx` as
  well; confirm before relying on index lookups."
  [inserter idx res]
  (if-let [id (get @id-map res)]
    ;; If the resource has already been added, just return the id.
    id
    ;; Otherwise, add the node for the resource, and remember its id for later.
    (let [id (.createNode inserter {"resource" res})]
      (swap! id-map assoc res id)
      id)))
(defn upconnect-resource-nodes!
  "Creates a relationship from `node1` to `node2` through `inserter`.

  The relationship type is a DynamicRelationshipType named `label`; nil is
  passed as the relationship's property map. Returns whatever
  `.createRelationship` returns."
  [inserter node1 node2 label]
  (.createRelationship inserter
                       node1
                       node2
                       (DynamicRelationshipType/withName label)
                       nil))
(defn upsert-tuple!
  "Inserts one parsed tuple into the graph.

  The first three items of `tuple` are taken as source resource, edge label
  and target resource; anything after them is ignored. Both resources are
  upserted as nodes (source first, then target) and an edge named by the
  label is created from the source node to the target node."
  [inserter idx tuple]
  (let [[source edge-label target] tuple
        ;; mapv is eager and ordered, so the source node is upserted before
        ;; the target node.
        [source-id target-id] (mapv #(upsert-resource-node! inserter idx %)
                                    [source target])]
    (upconnect-resource-nodes! inserter source-id target-id edge-label)))
(defn -main
  "Entry point: batch-loads every file in `files` into the graph store at
  `graph-path`.

  A batch inserter and a Lucene batch index provider are opened, each file
  is parsed with `parse-file`, and every tuple is written with
  `upsert-tuple!`. Progress is logged at debug level every 10000 tuples.

  The index provider and the inserter are shut down in `finally` blocks so
  that they are released even when loading throws."
  [graph-path & files]
  (let [inserter (BatchInserters/inserter graph-path)]
    (try
      (let [idx-provider (LuceneBatchInserterIndexProvider. inserter)]
        (try
          (let [idx (.nodeIndex idx-provider "lookup-index" {"type" "exact"})]
            (doseq [file files]
              (log :debug (str "Loading file: " file))
              ;; n counts the tuples of this file, starting from 0.
              (doseq [[n tuple] (map-indexed vector (parse-file file))]
                (when (zero? (mod n 10000))
                  (log :debug (str file ": " n)))
                (upsert-tuple! inserter idx tuple)))
            (log :debug "Loading complete."))
          (finally
            (log :debug "Shutting down.")
            (.shutdown idx-provider))))
      (finally
        ;; Shut the inserter down last, after the index provider built on it.
        (.shutdown inserter)
        (log :debug "Shutdown complete!")))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment