Skip to content

Instantly share code, notes, and snippets.

@calebphillips
Created April 1, 2013 18:53
Show Gist options
  • Save calebphillips/5286858 to your computer and use it in GitHub Desktop.
Save calebphillips/5286858 to your computer and use it in GitHub Desktop.
(ns nouns
(:require [clojure.java.io :as io]
[clojure.data.generators :as gen]
[datomic.api :as d])
(:import [java.io PushbackReader]))
;; Knobs
;; Tunable sizes for the data-generation run.
;; Number of generated items grouped into each batch by db-batch-transact.
(def transaction-size 1000)
;; Running counter of batches; db-batch-transact increments and prints it.
(def transaction-count (atom 0))
;; How many document entities generate-data! creates.
(def document-count 200000)
;; How many noun-phrase entities generate-data! creates.
(def phrase-count 5000000)
;; Schema transaction data, transacted by init-db!. Installs three
;; attributes into :db.part/db. The first is :document/file-name, an
;; indexed, cardinality-one string.
(def schema
'[{:db/id #db/id[:db.part/db]
:db/ident :document/file-name
:db/valueType :db.type/string
:db/cardinality :db.cardinality/one
:db/index true
:db.install/_attribute :db.part/db}
;; :noun-phrase/phrase -- indexed, cardinality-one string holding the phrase text.
{:db/id #db/id[:db.part/db]
:db/ident :noun-phrase/phrase
:db/valueType :db.type/string
:db/cardinality :db.cardinality/one
:db/index true
:db.install/_attribute :db.part/db}
;; :noun-phrase/document -- cardinality-one ref from a noun phrase to
;; another entity (gen-noun-phrase stores a document entity id here).
{:db/id #db/id[:db.part/db]
:db/ident :noun-phrase/document
:db/valueType :db.type/ref
:db/cardinality :db.cardinality/one
;; :db/isComponent true
:db.install/_attribute :db.part/db}])
(defn init-db!
  "Ensures the database at db-uri exists and returns a connection to it.

  The schema is transacted only when this call actually created the
  database; if the database already existed it is left as-is. In both
  cases a connection is returned (previously an existing database made
  this function return nil)."
  [db-uri]
  (let [created? (d/create-database db-uri)
        conn     (d/connect db-uri)]
    (when created?
      ;; Deref so schema installation has completed (or thrown) before
      ;; the connection is handed to the caller.
      @(d/transact conn schema))
    conn))
(defn db-batch-transact
  "Runs `transaction` (a seq of per-entity tx-data collections) in
  batches of at most `transaction-size` items.

  Each batch is flattened with concat and submitted via
  d/transact-async; submissions are driven through pmap, and every
  result is dereferenced in order so failures surface here. The global
  `transaction-count` is incremented and printed once per batch.

  Returns the dereferenced result of the last batch, or nil when
  `transaction` is empty."
  [conn transaction]
  (reduce (fn [_ res]
            (prn {:tx (swap! transaction-count inc)})
            ;; Block until this batch has completed; its result becomes
            ;; the return value if it is the last batch.
            @res)
          nil
          (pmap #(d/transact-async conn (apply concat %))
                ;; partition-all keeps the trailing partial batch that
                ;; plain partition would silently drop.
                (partition-all transaction-size transaction))))
;; The lowercase letters \a through \z, in order, as a vector.
(def word-chars
  (mapv char (range (int \a) (inc (int \z)))))
(defn word-char
  "Returns one character chosen at random (via gen/rand-nth) from
  word-chars."
  []
  (gen/rand-nth word-chars))
;; Alphabet for generated file names: word-chars plus \- \_ and \. .
;; Built once here rather than on every call to file-name-chars.
(def ^:private file-name-alphabet
  (conj word-chars \- \_ \.))

(defn file-name-chars
  "Pick a random file name character: one of word-chars, or one of
  \\- \\_ \\. ."
  []
  (gen/rand-nth file-name-alphabet))
(defn- gen-document
  "Builds the transaction data for a single document entity: a
  one-element vector holding a map with :db/id and a random
  :document/file-name. The name's characters come from file-name-chars
  and its length from (gen/uniform 1 25).

  With no arguments a fresh tempid in :db.part/user is used. The result
  is meant to be concatenated with others like it into one transaction."
  ([]
   (gen-document (d/tempid :db.part/user)))
  ([tempid]
   (let [file-name (gen/string file-name-chars
                               (gen/uniform 1 25))]
     [{:db/id              tempid
       :document/file-name file-name}])))
(defn- gen-phrase
  "Generate a phrase of between 1 and 5 words (inclusive) joined by
  single spaces, each word containing between 1 and 10 characters
  (inclusive) drawn from word-char. Every word gets its own
  independently chosen length."
  []
  ;; gen/uniform excludes its upper bound, hence 11 and 6 to make the
  ;; documented maxima of 10 and 5 reachable. The length is sampled
  ;; inside the fn so it is re-drawn for each word instead of being
  ;; fixed once for the whole phrase.
  (let [gen-word #(gen/string word-char (gen/uniform 1 11))]
    (apply str
           (interpose " "
                      (gen/list gen-word (gen/uniform 1 6))))))
(defn- gen-noun-phrase
  "Builds the transaction data for a single noun-phrase entity that
  points at doc-id: a one-element vector holding a map with :db/id,
  a freshly generated :noun-phrase/phrase and :noun-phrase/document.

  When no tempid is supplied a fresh one in :db.part/user is used. The
  result is meant to be concatenated with others like it into one
  transaction."
  ([doc-id]
   (gen-noun-phrase doc-id (d/tempid :db.part/user)))
  ([doc-id tempid]
   (let [phrase-text (gen-phrase)]
     [{:db/id                tempid
       :noun-phrase/phrase   phrase-text
       :noun-phrase/document doc-id}])))
(defn generate-data!
  "Populates the database behind conn: first document-count documents,
  then phrase-count noun phrases, each attached to a randomly chosen
  existing document. After each phase an index job is requested and
  storage garbage collection is invoked with the current time. Progress
  messages are printed with prn."
  [conn]
  (letfn [(reindex-and-gc! []
            (d/request-index conn)
            (d/gc-storage conn (java.util.Date.)))]
    (prn {:msg "Generating documents"})
    (db-batch-transact conn (repeatedly document-count gen-document))
    (reindex-and-gc!)
    (prn {:msg "Generating noun phrases"})
    ;; Collect the entity ids of every document so phrases can pick
    ;; one at random.
    (let [query-result (d/q '[:find ?e
                              :where [?e :document/file-name]]
                            (d/db conn))
          doc-ids      (mapv first query-result)]
      (db-batch-transact conn
                         (repeatedly phrase-count
                                     #(gen-noun-phrase (gen/rand-nth doc-ids)))))
    (reindex-and-gc!)
    (prn {:msg "..done."})))
;;;; Analysis helpers
(defn pbr
  "Opens f with io/reader (so f may be anything io/reader accepts) and
  wraps the result in a java.io.PushbackReader."
  [f]
  (-> f
      io/reader
      PushbackReader.))
(defn form-seq
  "Returns a lazy seq of all forms read from PushbackReader rdr, ending
  at end of stream. Forms that read as nil or false are included rather
  than being mistaken for end of input.

  Forms are read with clojure.core/read, so only use this on trusted
  input. The seq is lazy: it must be fully realized before rdr is
  closed."
  [^java.io.PushbackReader rdr]
  ;; A unique sentinel object distinguishes end-of-stream from any
  ;; value that could legitimately be read, including nil and false.
  (let [eof (Object.)]
    ((fn step []
       (lazy-seq
        (let [form (read rdr false eof)]
          (when-not (identical? form eof)
            (cons form (step)))))))))
(defmacro ms
  "Evaluates expr once, discards its value, and returns the elapsed
  wall-clock time in milliseconds as a double (measured with
  System/nanoTime)."
  [expr]
  `(let [began# (System/nanoTime)]
     ~expr
     (let [elapsed-ns# (- (System/nanoTime) began#)]
       (/ (double elapsed-ns#) 1000000.0))))
;; Probabilities at which quantiles evaluates quantile; each one also
;; becomes a key of the resulting map (via str, then keyword).
(def probs [0 0.25 0.5 0.75 0.9 0.95 0.99])
(defn quantile
  "From https://gist.github.com/scottdw/2960070

  Estimates the p-th quantile of the numbers in vs by linear
  interpolation between neighbouring order statistics, using position
  p * (n + 1) in the sorted values.

  Two arities:
    [p vs]          - sorts vs and derives count, min and max itself.
    [p c svs mn mx] - c is the count, svs the already-sorted values,
                      mn and mx their minimum and maximum.

  Positions at or below the first element yield mn; positions at or
  beyond the last element yield mx (so p = 1 returns the maximum
  instead of throwing). For an empty vs the result is nil."
  ([p vs]
   (let [svs (sort vs)]
     (quantile p (count vs) svs (first svs) (last svs))))
  ([p c svs mn mx]
   (let [pic (* p (inc c))
         k   (int pic)
         d   (- pic k)]
     (cond
       (<= k 0) mn
       (>= k c) mx
       ;; Only look up neighbours once k is known to be in range;
       ;; doing it eagerly indexed past the end when k = c + 1.
       :else (let [ndk (nth svs (dec k))]
               (+ ndk (* d (- (nth svs k) ndk))))))))
(defn quantiles
  "Takes a collection of maps carrying an :ms entry and returns a map
  from each probability in probs (as a keyword, e.g. :0.5) to the
  corresponding quantile of the :ms values."
  [coll]
  (let [msecs (map :ms coll)]
    (into {}
          (for [p probs]
            [(keyword (str p)) (quantile p msecs)]))))
;; REPL scratch area: nothing in this form runs when the namespace is
;; loaded. Evaluate the forms one at a time, top to bottom, to build the
;; data set, run the timing loop and summarise the results.
(comment
;; Storage URI; the discarded (#_) alternative is an in-memory database.
(def uri "datomic:free://localhost:4334/nouns"
#_"datomic:mem://nouns")
;; NOTE(review): as written, init-db! yields nil when the database
;; already exists; the discarded d/connect form is the alternative then.
(def conn (init-db! uri)
#_ (d/connect uri))
(generate-data! conn)
(def repetitions 1000)
(def output-file "nouns.out")
;; Rebind *out* so every prn below writes one map per measurement to
;; output-file.
(with-open [wrtr (io/writer (io/file output-file))]
(binding [*out* wrtr]
(dotimes [i repetitions]
(let [db (d/db conn)
;; Find a random phrase
;; (taken from within the first 50000 :noun-phrase/phrase datoms;
;; NOTE(review): rand yields a floating-point index here -- confirm
;; nth accepts it, or prefer rand-int)
phrase (-> db
(d/datoms :aevt :noun-phrase/phrase)
.iterator
iterator-seq
(nth (rand 50000))
:v)]
;; Query, once "cold", once "warm"
;; (this inner i shadows the outer repetition counter)
(dotimes [i 2]
;; print to the output file
(prn {:phrase phrase
:iter (inc i)
;; The timed expression includes the (d/db conn) call; the db bound
;; above is only used to pick the phrase.
:ms (ms
(d/q '[:find ?doc ?np2 ?phrase
:in $ ?in-phrase
:where
[?np1 :noun-phrase/phrase ?in-phrase]
[?np1 :noun-phrase/document ?doc]
[?np2 :noun-phrase/document ?doc]
[?np2 :noun-phrase/phrase ?phrase]]
(d/db conn)
phrase))}))))))
;; Read the measurement maps back; into [] realizes the lazy form-seq
;; before with-open closes the reader.
(def sample (with-open [rdr (pbr (io/file output-file))]
(into [] (form-seq rdr))))
(quantiles sample)
;; :iter 1 is the first ("cold") run of each phrase, :iter 2 the second
;; ("warm") run.
(def cold (filter #(= 1 (:iter %)) sample))
(def warm (filter #(= 2 (:iter %)) sample))
(quantiles cold)
(quantiles warm)
)
;; Project definition for the nouns benchmark (a defproject form, so
;; presumably a separate Leiningen project.clj file in the gist).
;; Declares the three libraries the nouns namespace relies on and the
;; JVM options to run with.
(defproject nouns "0.1.0-SNAPSHOT"
:description "FIXME: write description"
:url "http://example.com/FIXME"
:license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.5.1"]
[org.clojure/data.generators "0.1.2"]
[com.datomic/datomic-pro "0.8.3861"]]
;; 4 GB maximum heap, server JVM.
:java-opts ["-Xmx4g" "-server"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment