Created
January 21, 2012 00:29
-
-
Save willtim/1650422 to your computer and use it in GitHub Desktop.
Building a book index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(use '[clojure.core.match :only [match]]) | |
;; quick and simple trie implementation from the description | |
;; in the book "Purely Functional Data Structures" by Okasaki | |
(def empty-trie [:trie :value nil :edges {}]) | |
(defn lookup [key trie] | |
(match [key trie] | |
[[] [:trie :value nil :edges _]] nil | |
[[] [:trie :value x :edges _]] x | |
[[k & ks] [:trie :value v :edges m]] (lookup ks (get m k)) | |
)) | |
(defn bind [key val trie] | |
(match [key val trie] | |
[[] x [:trie :value nil :edges m]] [:trie :value x :edges m] | |
[[k & ks] x [:trie :value v :edges m]] | |
(let [t (get m k empty-trie) | |
t* (bind ks x t)] | |
[:trie :value v :edges (assoc m k t*)]))) | |
;; test the trie | |
(= 1 (lookup (vec "one") | |
(bind (vec "one") 1 empty-trie))) | |
(= 2 | |
(->> | |
empty-trie | |
(bind (vec "one") 1) | |
(bind (vec "two") 2) | |
(bind (vec "three") 3) | |
(lookup (vec "two")))) | |
;;;;;; using this trie to generate an index for a book | |
;; the book | |
(def example-book ["Page 1. The quick brown fox jumps over the lazy dog" | |
"Page 2. All good things must come to and end" | |
"Page 3. The older the fiddler the sweeter the tune"]) | |
;; the phrases we want to limit the index too | |
(def custom-phrases ["good things" "lazy dog" "dog" "brown fox" "fox" "fiddler"]) | |
;;;;;; build the tries from the pages | |
(defn tokenize [s] (seq (.split #" " s))) | |
(defn index-page [trie count page] | |
(let [words (tokenize page) | |
phrases (map (fn [[a b]] (apply str a " " b)) | |
(partition 2 (interleave words (rest words)))) | |
trie* (reduce | |
(fn [trie word] (bind (vec word) count trie)) | |
trie words) | |
trie** (reduce | |
(fn [trie phrase] (bind (vec phrase) count trie)) | |
trie* phrases)] | |
trie**)) | |
(defn index-book [pages] | |
(let [[trie _] | |
(reduce (fn [[trie count] page] | |
[(index-page trie count page) (inc count)]) | |
[empty-trie 1] | |
pages)] | |
trie)) | |
;; test | |
(= 1 (lookup (vec "lazy dog") (index-book example-book))) | |
(= 3 (lookup (vec "fiddler") (index-book example-book))) | |
;; finally we build the custom index | |
(defn build-custom-index [pages phrases] | |
(let [full-index (index-book pages)] | |
(map (fn [item] [item (lookup (vec item) full-index)]) (sort phrases)))) | |
;; test | |
(= [ ["brown fox" 1] ["dog" 1] ["fiddler" 3] | |
["fox" 1] ["good things" 2] ["lazy dog" 1]] | |
(build-custom-index example-book custom-phrases)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment