Created
March 31, 2016 14:08
-
-
Save telent/ef6cb7b8e4d092c4840762e4e54ff474 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns money-tolkien.core | |
(:require [reaver :as r] | |
[clojure.string :as str] | |
[clojure.xml :as xml] | |
[clojure.zip :as zip])) | |
(def tolkien-text
  "Body of the Stack Exchange tag feed for `tolkien`, read with `slurp`
  when this form is evaluated (i.e. at namespace load time)."
  (slurp "http://scifi.stackexchange.com/feeds/tag?tagnames=tolkien&sort=newest"))

(def money-text
  "Body of the Money Advice Service `mortgages` tag feed, read with `slurp`
  when this form is evaluated (i.e. at namespace load time)."
  (slurp "https://blog.moneyadviceservice.org.uk/tag/mortgages.atom"))
(defn zip-str
  "Parses the XML document held in string `s` and returns a clojure.zip
  zipper positioned at the root element.

  The string is encoded explicitly as UTF-8 before being handed to the
  parser. The no-argument `.getBytes` uses the JVM's platform default
  charset, which corrupts non-ASCII text on platforms whose default is not
  UTF-8 (an XML document without an encoding declaration is read as UTF-8)."
  [s]
  (let [utf8-bytes (.getBytes s "UTF-8")]
    (zip/xml-zip
      (xml/parse (java.io.ByteArrayInputStream. utf8-bytes)))))
(defn entries
  "Returns a lazy seq of every node reachable from `doc` whose :tag is
  :entry. `doc` is wrapped as the :content of a synthetic root node, so it
  is expected to be a collection of nodes; the walk descends through each
  node's :content."
  [doc]
  (let [root   {:content doc}
        entry? (fn [node] (= :entry (:tag node)))]
    (->> (tree-seq identity :content root)
         (filter entry?))))
(defn texts
  "Returns a lazy seq of the :content children of every :title, :summary
  and :content element found at or below `entry`, in document order."
  [entry]
  (let [wanted-tag? #{:title :summary :content}]
    (->> (tree-seq identity :content entry)
         (filter (fn [node] (wanted-tag? (:tag node))))
         (mapcat :content))))
(defn all-text
  "Collects the text fragments of every entry in `atom-doc`, passes each
  fragment through reaver's `parse` and takes the parsed document's `.text`,
  then joins the results with single spaces into one string."
  [atom-doc]
  (->> (entries atom-doc)
       (mapcat texts)
       (map (fn [fragment] (.text (r/parse fragment))))
       (str/join " ")))
;; We want a map in which each key is a pair of adjacent words and its
;; value is a collection (with duplicates) of the words that may follow
;; that pair.
(defn occurrences-map
  "Builds the successor table for a second-order word chain.

  The text of every doc in `docs` (via `all-text`) is joined with spaces and
  tokenized into words. The result maps each pair of adjacent words
  [w1 w2] to a vector of every word that followed that pair, duplicates
  included, so more frequent successors are proportionally more likely to
  be picked.

  Words are maximal runs of non-whitespace characters. Splitting on a single
  literal space would turn leading or consecutive spaces into empty-string
  \"words\" and would not break on tabs or newlines."
  [& docs]
  (let [text  (str/join " " (map all-text docs))
        words (re-seq #"\S+" text)]
    (reduce (fn [m [prev1 prev2 following]]
              (let [k [prev1 prev2]]
                ;; Values must stay vectors: random-from-map indexes into
                ;; them with `get`.
                (assoc m k (conj (get m k []) following))))
            {}
            (partition 3 1 words))))
(defn random-from-map
  "Advances the chain by one step. Looks up the successors recorded in `map`
  for the pair [prev1 prev2], picks one at a uniformly random index, and
  returns the next pair [prev2 picked]. When the pair has no recorded
  successors the picked word is nil."
  [map [prev1 prev2]]
  (let [successors (get map [prev1 prev2])
        index      (-> successors count rand-int)]
    (vector prev2 (get successors index))))
(defn chain
  "Returns a lazy seq of words generated by walking the successor table
  `omap` from the starting pair [word1 word2].

  The first word emitted is `word2`; each later word is chosen by
  `random-from-map` from the successors of the two preceding words.

  The seq ends when the walk reaches a pair with no recorded successor.
  Without that cut-off the walk would yield nil forever from that point on,
  because a nil word can never lead to a known pair again."
  [omap [word1 word2]]
  (->> [word1 word2]
       (iterate (partial random-from-map omap))
       (map second)
       (take-while (complement nil?))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment