Created
January 22, 2013 17:51
-
-
Save samuelclay/4596630 to your computer and use it in GitHub Desktop.
A Radix Trie (aka PATRICIA trie) implemented in Clojure. For my JavaScript implementation, see this interactive JS fiddle: http://jsfiddle.net/jCYAw/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns radix | |
(:require [clojure.string :as string])) | |
(use 'clojure.java.io) | |
(use 'clojure.pprint) | |
(println "Loading names... ") | |
(time (def names | |
(with-open | |
[rdr (reader | |
"/usr/share/dict/ProperNames")] | |
(map string/lower-case | |
(doall | |
(take 30000 (line-seq rdr))))))) | |
(println "Loading words... ") | |
(time (def words | |
(with-open | |
[rdr (reader | |
"/usr/share/dict/words")] | |
(map string/lower-case | |
(doall | |
(line-seq rdr)))))) | |
;; From closure.contrib, but missing from clojure 1.4.0 for some reason. | |
(defn dissoc-in | |
"Dissociates an entry from a nested associative structure returning a new | |
nested structure. keys is a sequence of keys. Any empty maps that result | |
will not be present in the new structure." | |
[m [k & ks :as keys]] | |
(if ks | |
(if-let [nextmap (get m k)] | |
(let [newmap (dissoc-in nextmap ks)] | |
(if (seq newmap) | |
(assoc m k newmap) | |
(dissoc m k))) | |
m) | |
(dissoc m k))) | |
(defn word-seq | |
[word] | |
(map-indexed (fn [i l] (subs word 0 (inc i))) word)) | |
(defn lookup-tree | |
[T word & {:keys [parentKeys] :or {parentKeys []}}] | |
(if-let | |
[matchedPrefix (first (filter (fn [prefix] (get T prefix)) (word-seq word)))] | |
(let [subT (get T matchedPrefix)] | |
; (println "Match: " matchedPrefix (get T matchedPrefix) (conj parentKeys matchedPrefix)) | |
(lookup-tree subT | |
(subs word (count matchedPrefix)) | |
:parentKeys (conj parentKeys matchedPrefix))) | |
(do ;(println "No match: " word) | |
[T (conj parentKeys word)]))) | |
(defn common-prefix-length | |
[word1 word2] | |
(loop [i 0] | |
(if (= (get word1 i) (get word2 i)) | |
(recur (inc i)) | |
i))) | |
(defn common-sibling | |
[subT parentKeys] | |
; (println "Common siblings? " (keys subT) parentKeys) | |
(if-let [siblingKeys (keys subT)] | |
(if-let [sibling (first (filter | |
(fn [key] (< 0 (common-prefix-length key (last parentKeys)))) | |
siblingKeys))] | |
sibling))) | |
(defn insert | |
[T word] | |
(let [[subT parentKeys] (lookup-tree T word)] | |
; (println "\nInserting " word parentKeys subT) | |
(if-let [sibling (common-sibling subT parentKeys)] | |
(let [commonPrefixLength (common-prefix-length (last parentKeys) sibling) | |
taillessParentKeys (vec (take (dec (count parentKeys)) parentKeys)) | |
newKeys (conj taillessParentKeys | |
(subs (last parentKeys) 0 commonPrefixLength)) | |
oldKeys (conj taillessParentKeys | |
sibling)] | |
; (println "New keys: " newKeys oldKeys sibling) | |
(assoc-in (dissoc-in T oldKeys) newKeys { | |
(subs (last parentKeys) commonPrefixLength) {} | |
(subs sibling commonPrefixLength) {} | |
})) | |
(assoc-in T parentKeys {})))) | |
(defn assemble-nodes | |
[tree prefix] | |
(flatten (map (fn [t] | |
(if (empty? (tree t)) | |
(apply str (concat prefix t)) | |
(assemble-nodes (tree t) (concat prefix t)))) | |
(keys tree)))) | |
(defn lookup | |
[T word & {:keys [limit] :or {limit 10}}] | |
(println "\nLooking up: " word) | |
(let [word (string/lower-case word) | |
tree (get (lookup-tree T word) 0)] | |
(take limit (reverse (assemble-nodes tree word))))) | |
(println "Building names trie... ") | |
(time (def N (reduce insert {} names))) | |
(println "Building words trie... ") | |
(time (def W (reduce insert {} words))) | |
(pprint (lookup N "Al")) | |
(pprint (lookup N "Bo")) | |
(pprint (lookup W "Intell" :limit 3)) | |
(pprint (lookup W "Ear" :limit 3)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment