Created
April 1, 2013 18:53
-
-
Save calebphillips/5286858 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns nouns | |
(:require [clojure.java.io :as io] | |
[clojure.data.generators :as gen] | |
[datomic.api :as d]) | |
(:import [java.io PushbackReader])) | |
;; Knobs: sizing parameters for the generated data set.

(def transaction-size
  "Number of generated items that db-batch-transact groups into a single
  Datomic transaction."
  1000)

(def transaction-count
  "Running count of batches submitted so far. db-batch-transact increments
  and prints it as a progress indicator."
  (atom 0))

(def document-count
  "How many documents generate-data! creates."
  200000)

(def phrase-count
  "How many noun phrases generate-data! creates."
  5000000)
(def schema
  "Schema transaction data installed by init-db!.

  Declares three attributes in the :db.part/db partition:
    :document/file-name    - indexed string, cardinality one
    :noun-phrase/phrase    - indexed string, cardinality one
    :noun-phrase/document  - ref, cardinality one (noun phrase -> document)"
  '[;; Documents
    {:db/id                 #db/id[:db.part/db]
     :db/ident              :document/file-name
     :db/valueType          :db.type/string
     :db/cardinality        :db.cardinality/one
     :db/index              true
     :db.install/_attribute :db.part/db}

    ;; Noun phrases
    {:db/id                 #db/id[:db.part/db]
     :db/ident              :noun-phrase/phrase
     :db/valueType          :db.type/string
     :db/cardinality        :db.cardinality/one
     :db/index              true
     :db.install/_attribute :db.part/db}

    ;; Link from a noun phrase to the document it belongs to
    {:db/id                 #db/id[:db.part/db]
     :db/ident              :noun-phrase/document
     :db/valueType          :db.type/ref
     :db/cardinality        :db.cardinality/one
     ;; :db/isComponent true
     :db.install/_attribute :db.part/db}])
(defn init-db!
  "Ensures the database at db-uri exists and returns a connection to it.

  The schema is transacted only when this call actually created the
  database (d/create-database returned a truthy value). When the database
  already existed, the schema load is skipped but a connection is still
  returned, so the result is never nil."
  [db-uri]
  (let [created? (d/create-database db-uri)
        conn     (d/connect db-uri)]
    (when created?
      ;; Deref so we block until the schema transaction has completed
      ;; before handing the connection to the caller.
      @(d/transact conn schema))
    conn))
(defn db-batch-transact
  "Submits transaction in batches and waits for each batch to complete.

  transaction is a sequence of transaction-data collections (e.g. the
  vectors produced by gen-document). Every group of up to transaction-size
  of them is concatenated into one transaction and submitted with
  d/transact-async via pmap. A final group smaller than transaction-size
  is submitted too, so no data is dropped.

  Prints a progress map {:tx n} before waiting on each batch.
  Returns the dereferenced result of the last batch, or nil when
  transaction is empty."
  [conn transaction]
  (reduce (fn [_ pending]
            (prn {:tx (swap! transaction-count inc)})
            ;; Block on this batch; its result becomes the running
            ;; \"last result\" carried through the reduction.
            @pending)
          nil
          (pmap #(d/transact-async conn (apply concat %))
                ;; partition-all (not partition) keeps the trailing
                ;; partial batch.
                (partition-all transaction-size transaction))))
(def word-chars
  "Vector of the 26 lowercase ASCII letters, a through z, in order.
  The alphabet from which random words are built."
  (mapv char (range (int \a) (inc (int \z)))))
(defn word-char
  "Returns one character chosen at random (via gen/rand-nth) from
  word-chars."
  []
  (-> word-chars gen/rand-nth))
(defn file-name-chars
  "Returns one character chosen at random for use in a generated file
  name: any member of word-chars, or one of the characters - _ ."
  []
  (let [alphabet (into word-chars [\- \_ \.])]
    (gen/rand-nth alphabet)))
(defn- gen-document
  "Generates transaction data for a single document: a one-element vector
  holding a map with a random :document/file-name.

  With no arguments a fresh tempid in :db.part/user is used; otherwise the
  supplied tempid becomes the entity's :db/id. The file name is built from
  file-name-chars with a length drawn by (gen/uniform 1 25).

  The result is meant to be concatenated with others like it."
  ([]
   (gen-document (d/tempid :db.part/user)))
  ([tempid]
   (let [name-length (gen/uniform 1 25)
         file-name   (gen/string file-name-chars name-length)]
     [{:db/id              tempid
       :document/file-name file-name}])))
(defn- gen-phrase
  "Generates a random phrase: words made of word-char characters, joined
  by single spaces.

  The word count is drawn by (gen/uniform 1 5) and each word's length is
  drawn independently by (gen/uniform 1 10).
  NOTE(review): gen/uniform is documented as upper-bound exclusive, which
  would make this 1-4 words of 1-9 characters - confirm against the
  data.generators version in use."
  []
  (apply str
         (interpose " "
                    ;; The word generator is a fn so gen/uniform is re-drawn
                    ;; for every word. Passing (gen/uniform 1 10) as an
                    ;; argument to partial evaluated it only once, giving
                    ;; every word in a phrase the same length.
                    (gen/list #(gen/string word-char (gen/uniform 1 10))
                              (gen/uniform 1 5)))))
(defn- gen-noun-phrase
  "Generates transaction data for a single noun phrase attached to the
  document doc-id: a one-element vector holding a map with a random
  :noun-phrase/phrase and a :noun-phrase/document reference.

  With one argument a fresh tempid in :db.part/user is used; otherwise the
  supplied tempid becomes the entity's :db/id.

  The result is meant to be concatenated with others like it."
  ([doc-id]
   (gen-noun-phrase doc-id (d/tempid :db.part/user)))
  ([doc-id tempid]
   (let [phrase (gen-phrase)]
     (vector {:db/id                tempid
              :noun-phrase/document doc-id
              :noun-phrase/phrase   phrase}))))
(defn generate-data!
  "Populates the database behind conn with random data.

  First transacts document-count documents, then queries back their entity
  ids and transacts phrase-count noun phrases, each attached to a randomly
  chosen document. After each phase an index is requested and storage
  garbage older than the current time is collected. Progress messages are
  printed with prn."
  [conn]
  (letfn [(reindex-and-gc! []
            (d/request-index conn)
            (d/gc-storage conn (java.util.Date.)))]
    (prn {:msg "Generating documents"})
    (db-batch-transact conn (repeatedly document-count gen-document))
    (reindex-and-gc!)

    (prn {:msg "Generating noun phrases"})
    (let [db      (d/db conn)
          ;; Each query result is a 1-tuple; keep just the entity id.
          doc-ids (mapv first
                        (d/q '[:find ?e
                               :where [?e :document/file-name]]
                             db))]
      (db-batch-transact conn
                         (repeatedly phrase-count
                                     #(gen-noun-phrase (gen/rand-nth doc-ids)))))
    (reindex-and-gc!)

    (prn {:msg "..done."})))
;;;; Analysis helpers
(defn pbr
  "Opens f with io/reader and wraps the result in a PushbackReader.
  f may be anything io/reader accepts. The caller is responsible for
  closing the returned reader."
  [f]
  (-> f io/reader PushbackReader.))
(defn form-seq
  "Returns a lazy seq of the forms read from PushbackReader rdr, ending
  when the reader reaches end of input.

  End of input is detected with a private sentinel object, so forms that
  read as nil or false are returned as elements instead of terminating
  the seq early.

  NOTE: this uses clojure.core/read, which honours *read-eval*; only use
  it on input you trust."
  [^java.io.PushbackReader rdr]
  (let [eof (Object.)]
    (lazy-seq
      ;; eof-error? false => return the sentinel instead of throwing.
      (let [form (read rdr false eof)]
        (when-not (identical? form eof)
          (cons form (form-seq rdr)))))))
(defmacro ms
  "Evaluates expr, discards its value, and returns the elapsed time in
  milliseconds as a double, measured with System/nanoTime."
  [expr]
  `(let [began# (System/nanoTime)]
     ~expr
     (let [elapsed-ns# (- (System/nanoTime) began#)]
       (/ (double elapsed-ns#) 1000000.0))))
(def probs
  "Probabilities at which quantiles reports a value."
  [0 0.25 0.5 0.75 0.9 0.95 0.99])
(defn quantile
  "Returns the p-quantile of a collection of numbers, interpolating
  linearly between neighbouring order statistics.
  From https://gist.github.com/scottdw/2960070

  Arities:
    [p vs]          - sorts vs and delegates to the 5-arity form.
    [p c svs mn mx] - c is the element count, svs the sorted values,
                      mn and mx the smallest and largest value.

  With pic = p * (c + 1) and k = (int pic):
    k = 0            -> mn
    k = c or c + 1   -> mx  (so p = 1.0 yields the maximum)
    otherwise        -> svs[k-1] + (pic - k) * (svs[k] - svs[k-1])"
  ([p vs]
   (let [svs (sort vs)]
     (quantile p (count vs) svs (first svs) (last svs))))
  ([p c svs mn mx]
   (let [pic (* p (inc c))
         k   (int pic)
         d   (- pic k)]
     (cond
       (zero? k)     mn
       (= c (dec k)) mx
       (= c k)       mx
       ;; Look up the lower neighbour only here: when k = c + 1 the index
       ;; (dec k) is out of bounds, so fetching it before the boundary
       ;; checks made p = 1.0 throw instead of returning mx.
       :else (let [ndk (nth svs (dec k))]
               (+ ndk (* d (- (nth svs k) ndk))))))))
(defn quantiles
  "Summarises the :ms values of the maps in coll.

  Returns a map with one entry per probability in probs: the key is the
  probability turned into a keyword via str (e.g. :0.25), the value is
  the corresponding quantile of the :ms values."
  [coll]
  (let [timings (map :ms coll)]
    (into {}
          (for [p probs]
            [(keyword (str p)) (quantile p timings)]))))
(comment
  ;; REPL scratchpad: build the data set, time the query, then summarise.

  ;; --- Setup ---------------------------------------------------------
  (def uri "datomic:free://localhost:4334/nouns"
    #_"datomic:mem://nouns")

  (def conn (init-db! uri)
    #_ (d/connect uri))

  (generate-data! conn)

  ;; --- Benchmark -----------------------------------------------------
  (def repetitions 1000)
  (def output-file "nouns.out")

  (with-open [wrtr (io/writer (io/file output-file))]
    (binding [*out* wrtr]
      (dotimes [_ repetitions]
        (let [db            (d/db conn)
              ;; Find a random phrase
              phrase-datoms (iterator-seq
                              (.iterator (d/datoms db :aevt :noun-phrase/phrase)))
              phrase        (:v (nth phrase-datoms (rand 50000)))]
          ;; Query, once "cold", once "warm"
          (dotimes [pass 2]
            ;; print to the output file
            (prn {:phrase phrase
                  :iter   (inc pass)
                  :ms     (ms
                            (d/q '[:find ?doc ?np2 ?phrase
                                   :in $ ?in-phrase
                                   :where
                                   [?np1 :noun-phrase/phrase ?in-phrase]
                                   [?np1 :noun-phrase/document ?doc]
                                   [?np2 :noun-phrase/document ?doc]
                                   [?np2 :noun-phrase/phrase ?phrase]]
                                 (d/db conn)
                                 phrase))}))))))

  ;; --- Analysis ------------------------------------------------------
  (def sample (with-open [rdr (pbr (io/file output-file))]
                (into [] (form-seq rdr))))

  (quantiles sample)

  (def cold (filter #(= 1 (:iter %)) sample))
  (def warm (filter #(= 2 (:iter %)) sample))

  (quantiles cold)
  (quantiles warm)
  )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Leiningen project definition for the nouns benchmark.
(defproject nouns "0.1.0-SNAPSHOT"
  :description "FIXME: write description"
  :url "http://example.com/FIXME"
  :license {:name "Eclipse Public License"
            :url  "http://www.eclipse.org/legal/epl-v10.html"}
  ;; Clojure itself, the random data generators, and the Datomic peer library.
  :dependencies [[org.clojure/clojure "1.5.1"]
                 [org.clojure/data.generators "0.1.2"]
                 [com.datomic/datomic-pro "0.8.3861"]]
  ;; 4 GB maximum heap, server JVM.
  :java-opts ["-Xmx4g" "-server"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment