MoyTW · March 16, 2014 22:48
diff --git a/mchain.clj b/mchain.clj
 ;;; Extremely Primitve Markov Chain
 ;;; Doesn't try to handle punctuation, capitalization, or anything; just
 ;;; generates text as-is from the corpus.
 ;;; Corpus I used was: http://www.shakespeares-sonnets.com/all.php
 ;;; Considering how important capitalization, plurals, and punctuation is to
 ;;; sonnets, I think it might work a little better this way...I should check
 ;;; that out, later.

 ;; This is so you can create the same chain multiple times.
 ;; Basically, it fixes the seed.
 (def ^:dynamic ^java.util.Random *rnd* (java.util.Random. 1))
 (defn get-rand [x]
  (.nextInt *rnd* x))

 (def delimiters #{\space \newline})
 
 (defn not-delimiter? [char]
  (not (delimiters char)))
 
 (defn snip-delimiters [coll]
  (drop-while delimiters coll))

 ;; Returns a map of maps of counts:
 ;; {"He" {"is" 2},
 ;;  "is" {"a" 2, "sad", 2}}
 (defn parse-counts [coll]
  (loop [counts {} corpus coll]
    (let [[word rest-corpus] (split-with not-delimiter? (snip-delimiters corpus))
          follows (take-while not-delimiter? (snip-delimiters rest-corpus))]
      (if (seq rest-corpus)
          (recur (update-in counts [word follows] (fnil inc 0)) rest-corpus)
          counts))))

 (defn pick-word [words]
  (if-let [word-seq (seq words)]
    (let [keys (vec (map first word-seq))
          counts (vec (map second word-seq))
          total-counts (reduce + counts)
          r (get-rand total-counts)]
      (loop [i 0 sum 0]
        (if (< r (+ (counts i) sum))
            (nth keys i)
            (recur (inc i) (+ (counts i) sum)))))))
 ;; That's a little convoluted. But basically, it generates a random from zero
 ;; to total. Then, if it's less than the magnitude of the first connection, it
 ;; chooses the first word. If it's not it adds that, then proceeds to the next
 ;; connection and sums that.
 ;; So if you have 10, 15, and 5, what it does is:
 ;; Is random < 10? If so, return first word.
 ;; Is random < 10+15=25? If so, return second word.
 ;; Is random < 25+5=30? If so, return third word.

 ;; You know, this should really take in a count, because currently? It goes
 ;; until it hits an end node.
 (defn chain [start-word counts]
  (loop [out [] word start-word]
    (if-let [next-word (pick-word (counts word))]
      (recur (conj out next-word) next-word)
      out)))

 (defn lazy-chain [start-word counts]
  (if-let [next-word (pick-word (counts start-word))]
    (cons start-word (lazy-seq (lazy-chain next-word counts)))))

 (defn from-word [word n counts]
  (clojure.string/join
    \space
    (take n (map #(apply str %) 
                  (lazy-chain (seq word) counts)))))
	;;; Extremely Primitve Markov Chain
	;;; Doesn't try to handle punctuation, capitalization, or anything; just
	;;; generates text as-is from the corpus.
	;;; Corpus I used was: http://www.shakespeares-sonnets.com/all.php
	;;; Considering how important capitalization, plurals, and punctuation is to
	;;; sonnets, I think it might work a little better this way...I should check
	;;; that out, later.

	;; This is so you can create the same chain multiple times.
	;; Basically, it fixes the seed.
	(def ^:dynamic ^java.util.Random rnd (java.util.Random. 1))
	(defn get-rand [x]
	(.nextInt rnd x))

	(def delimiters #{\space \newline})

	(defn not-delimiter? [char]
	(not (delimiters char)))

	(defn snip-delimiters [coll]
	(drop-while delimiters coll))

	;; Returns a map of maps of counts:
	;; {"He" {"is" 2},
	;; "is" {"a" 2, "sad", 2}}
	(defn parse-counts [coll]
	(loop [counts {} corpus coll]
	(let [[word rest-corpus] (split-with not-delimiter? (snip-delimiters corpus))
	follows (take-while not-delimiter? (snip-delimiters rest-corpus))]
	(if (seq rest-corpus)
	(recur (update-in counts [word follows] (fnil inc 0)) rest-corpus)
	counts))))

	(defn pick-word [words]
	(if-let [word-seq (seq words)]
	(let [keys (vec (map first word-seq))
	counts (vec (map second word-seq))
	total-counts (reduce + counts)
	r (get-rand total-counts)]
	(loop [i 0 sum 0]
	(if (< r (+ (counts i) sum))
	(nth keys i)
	(recur (inc i) (+ (counts i) sum)))))))
	;; That's a little convoluted. But basically, it generates a random from zero
	;; to total. Then, if it's less than the magnitude of the first connection, it
	;; chooses the first word. If it's not it adds that, then proceeds to the next
	;; connection and sums that.
	;; So if you have 10, 15, and 5, what it does is:
	;; Is random < 10? If so, return first word.
	;; Is random < 10+15=25? If so, return second word.
	;; Is random < 25+5=30? If so, return third word.

	;; You know, this should really take in a count, because currently? It goes
	;; until it hits an end node.
	(defn chain [start-word counts]
	(loop [out [] word start-word]
	(if-let [next-word (pick-word (counts word))]
	(recur (conj out next-word) next-word)
	out)))

	(defn lazy-chain [start-word counts]
	(if-let [next-word (pick-word (counts start-word))]
	(cons start-word (lazy-seq (lazy-chain next-word counts)))))

	(defn from-word [word n counts]
	(clojure.string/join
	\space
	(take n (map #(apply str %)
	(lazy-chain (seq word) counts)))))