Created
March 16, 2014 22:48
-
-
Save MoyTW/9591006 to your computer and use it in GitHub Desktop.
Extremely Primitive Markov Chain
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; Extremely Primitve Markov Chain | |
;;; Doesn't try to handle punctuation, capitalization, or anything; just | |
;;; generates text as-is from the corpus. | |
;;; Corpus I used was: http://www.shakespeares-sonnets.com/all.php | |
;;; Considering how important capitalization, plurals, and punctuation is to | |
;;; sonnets, I think it might work a little better this way...I should check | |
;;; that out, later. | |
;; This is so you can create the same chain multiple times. | |
;; Basically, it fixes the seed. | |
(def ^:dynamic ^java.util.Random *rnd* (java.util.Random. 1)) | |
(defn get-rand [x] | |
(.nextInt *rnd* x)) | |
(def delimiters #{\space \newline}) | |
(defn not-delimiter? [char] | |
(not (delimiters char))) | |
(defn snip-delimiters [coll] | |
(drop-while delimiters coll)) | |
;; Returns a map of maps of counts: | |
;; {"He" {"is" 2}, | |
;; "is" {"a" 2, "sad", 2}} | |
(defn parse-counts [coll] | |
(loop [counts {} corpus coll] | |
(let [[word rest-corpus] (split-with not-delimiter? (snip-delimiters corpus)) | |
follows (take-while not-delimiter? (snip-delimiters rest-corpus))] | |
(if (seq rest-corpus) | |
(recur (update-in counts [word follows] (fnil inc 0)) rest-corpus) | |
counts)))) | |
(defn pick-word [words] | |
(if-let [word-seq (seq words)] | |
(let [keys (vec (map first word-seq)) | |
counts (vec (map second word-seq)) | |
total-counts (reduce + counts) | |
r (get-rand total-counts)] | |
(loop [i 0 sum 0] | |
(if (< r (+ (counts i) sum)) | |
(nth keys i) | |
(recur (inc i) (+ (counts i) sum))))))) | |
;; That's a little convoluted. But basically, it generates a random from zero | |
;; to total. Then, if it's less than the magnitude of the first connection, it | |
;; chooses the first word. If it's not it adds that, then proceeds to the next | |
;; connection and sums that. | |
;; So if you have 10, 15, and 5, what it does is: | |
;; Is random < 10? If so, return first word. | |
;; Is random < 10+15=25? If so, return second word. | |
;; Is random < 25+5=30? If so, return third word. | |
;; You know, this should really take in a count, because currently? It goes | |
;; until it hits an end node. | |
(defn chain [start-word counts] | |
(loop [out [] word start-word] | |
(if-let [next-word (pick-word (counts word))] | |
(recur (conj out next-word) next-word) | |
out))) | |
(defn lazy-chain [start-word counts] | |
(if-let [next-word (pick-word (counts start-word))] | |
(cons start-word (lazy-seq (lazy-chain next-word counts))))) | |
(defn from-word [word n counts] | |
(clojure.string/join | |
\space | |
(take n (map #(apply str %) | |
(lazy-chain (seq word) counts))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment