Created
June 13, 2011 19:31
-
-
Save jcromartie/1023496 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns markov | |
(use clojure.test)) | |
;; ---------- basic markov chain utils and API ---------- | |
(defn n-grams | |
"Partition seq s into all sequential groups of n items" | |
[n s] | |
(partition n 1 (repeat nil) s)) | |
(defn scan-grams | |
"Return occurence map from sequeunce of n+1-grams" | |
[grams] | |
(reduce (fn [m gram] | |
(update-in m [(butlast gram)] conj (last gram))) | |
{} | |
grams)) | |
(defn- markov-next | |
"Return the next markov chain based on the occurance structure, | |
n-gram count, and generated chain vector so far" | |
[occ n text-vec] | |
;; (println "text-vec" text-vec) | |
(let [last-n (seq (subvec text-vec (- (count text-vec) n))) | |
next-item (rand-nth (get occ last-n))] | |
(conj text-vec next-item))) | |
(defn markov-seq | |
"Infinite lazy markov chain from occurrence map." | |
([occ init-chain] | |
(let [init-chain-vec (vec init-chain) | |
n (-> occ keys first count) | |
gen-next-chain #(markov-next occ n %) | |
;; drop 1 to skip init-chain | |
chain-seq (drop 1 (iterate gen-next-chain init-chain))] | |
(take-while identity (concat init-chain (map last chain-seq))))) | |
([occ] | |
(let [init (-> occ keys rand-nth)] | |
(markov-seq occ init)))) | |
;; ---------- "brain" API, for smarter text generation ---------- | |
(defn make-brain | |
"Returns empty brain for text generation training." | |
[] | |
{:starts [] :occur {} :originals #{}}) | |
(defn train | |
"Returns brain trained using n-grams of s. n defaults to 3." | |
([brain n s] | |
(let [s-n-grams (n-grams (inc n) s)] | |
(if (empty? s-n-grams) | |
brain | |
(-> brain | |
(update-in [:originals] conj s) | |
(update-in [:starts] conj (butlast (first s-n-grams))) | |
(update-in [:occur] #(merge-with into % (scan-grams s-n-grams))))))) | |
([brain s] | |
(train brain 3 s))) | |
(defn load-brain | |
"Returns brain trained with trigrams of each line in file at path" | |
[path] | |
(reduce train (make-brain) (-> path slurp (.split "\n")))) | |
(defn- speak-1 | |
"Generate string with optional max length n. n defaults to 50" | |
([brain] | |
(let [init (vec (-> brain :starts rand-nth)) | |
char-seq (markov-seq (:occur brain) init) | |
limit (:limit brain)] | |
(apply str (if limit (take limit char-seq) char-seq))))) | |
(defn speak | |
"Returns lazy seq of sentences generated by brain." | |
[brain] | |
(repeatedly #(speak-1 brain))) | |
(defn speak-unique | |
"Returns lazy seq of only sentences not present in original training | |
text." | |
[brain] | |
(remove (get brain :originals #{}) (speak brain))) | |
;; ---------- tests ---------- | |
(deftest simple-text-generation | |
(let [s "this is a test" | |
brain (-> (make-brain) (train s)) | |
text-seq (take 1000 (speak brain))] | |
(are [x] (some #{x} text-seq) | |
"this a test" | |
"this is is a test") | |
(is (some #{s} text-seq) | |
"Original in generated text") | |
(is (nil? (some #{s} (take 1000 (speak-unique brain)))) | |
"Original NOT in generated *unique* text"))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment