Created
July 8, 2013 20:38
-
-
Save ckirkendall/5952290 to your computer and use it in GitHub Desktop.
Clojure version of Norvig's spell checker for LambdaJam.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns spell-checker.speller
  (:require [clojure.java.io]))
;; Every run of a-z characters found in the lower-cased contents of the
;; corpus file, in order of appearance. defonce means re-evaluating this
;; namespace does not re-read the file once the var has a value.
(defonce training-data
  (->> "/home/ckirkendall/Development/clojure/spelling-jam/data/big.txt"
       slurp
       .toLowerCase
       (re-seq #"[a-z]+")))

;; Atom holding a map of word -> number of occurrences in training-data.
(defonce NWORDS
  (atom (frequencies training-data)))
(defn deletes
  "Returns a lazy seq of the strings obtained by removing exactly one
  character from `word`, one result per character position, left to right."
  [word]
  (for [idx (range (count word))]
    (apply str (concat (take idx word)
                       (drop (inc idx) word)))))
(defn transposes
  "Returns a lazy seq of the strings obtained by swapping each pair of
  adjacent characters in `word`, one result per adjacent pair, left to right.
  Words shorter than two characters yield an empty seq."
  [word]
  (for [idx (range (dec (count word)))
        :let [head (take idx word)
              [x y & tail] (drop idx word)]]
    (apply str (concat head [y x] tail))))
(defn replaces
  "Returns a lazy seq of the strings obtained by overwriting one character of
  `word` with a letter a-z. Results are grouped by letter, then by position,
  and include `word` itself whenever the letter matches the original."
  [word]
  (let [letters   (vec word)
        positions (range (count letters))]
    (mapcat (fn [letter]
              (map #(apply str (assoc letters % letter)) positions))
            "abcdefghijklmnopqrstuvwxyz")))
(defn inserts
  "Returns a lazy seq of the strings obtained by inserting one letter a-z
  into `word` at every possible position, including before the first
  character and after the last. Results are grouped by letter, then by
  position, so a word of n characters yields 26 * (n + 1) strings."
  [word]
  (for [ch "abcdefghijklmnopqrstuvwxyz"
        ;; inc: there are n + 1 insertion points; without it a letter is
        ;; never appended after the final character.
        i  (range (inc (count word)))]
    (let [[pre post] (split-at i word)]
      (apply str (concat pre (cons ch post))))))
(defn edits1
  "Returns the set of distinct strings produced by applying each of
  deletes, transposes, replaces and inserts to `word`."
  [word]
  (into #{}
        (mapcat (fn [edit] (edit word))
                [deletes transposes replaces inserts])))
(defn edits2
  "Returns the set of distinct strings produced by applying edits1 to every
  member of (edits1 word)."
  [word]
  (->> (edits1 word)
       (mapcat edits1)
       set))
(defn known
  "Returns a lazy seq of the members of `words` that have an entry in the
  NWORDS map. NWORDS is dereferenced once, when known is called."
  [words]
  (let [freqs @NWORDS]
    (filter #(get freqs %) words)))
(defn correct
  "Returns the candidate corrections for `word`, most frequent first, so
  that (first (correct word)) is the best guess.

  Candidates are the first non-empty of: `word` itself if known, the known
  members of (edits1 word), the known members of (edits2 word), and finally
  `word` on its own when nothing known was found."
  [word]
  (let [freqs      @NWORDS
        candidates (or (seq (known [word]))
                       (seq (known (edits1 word)))
                       (seq (known (edits2 word)))
                       [word])]
    ;; Descending by frequency: callers take `first` as the answer, and an
    ;; ascending sort would hand them the rarest candidate. The 0 default
    ;; keeps an unknown fallback word from producing a nil sort key.
    (sort-by #(get freqs % 0) > candidates)))
(defn- read-test-data
  "Reads the file at `path`, skips its first line, and returns a fully
  realized seq with one entry per remaining line: the seq of a-z runs found
  on that line (nil when the line has none)."
  [path]
  ;; with-open closes the reader when done; doall forces the lazy line-seq
  ;; while the reader is still open.
  (with-open [rdr (clojure.java.io/reader path)]
    (doall (rest (map #(re-seq #"[a-z]+" %) (line-seq rdr))))))

;; Test sets loaded from the hard-coded data files.
(def test1 (read-test-data "/home/ckirkendall/Development/clojure/spelling-jam/data/test1.txt"))
(def test2 (read-test-data "/home/ckirkendall/Development/clojure/spelling-jam/data/test2.txt"))
(defn testing-speller
  "Scores `correct` against `test-data`. Each entry of `test-data` is a seq
  whose first element is the expected word and whose remaining elements are
  inputs passed to `correct`. Returns [total wrong]: the number of inputs
  checked and the number whose first correction differed from the expected
  word."
  [test-data]
  (reduce (fn [[total wrong] [expected & attempts]]
            (let [n    (count attempts)
                  hits (count (filter #(= expected (first (correct %)))
                                      attempts))]
              [(+ total n) (+ wrong (- n hits))]))
          [0 0]
          test-data))
(defn add-word
  "Records one more occurrence of `word` in NWORDS and returns the updated
  map. A word not seen before ends up with a count of 1; a word already
  present keeps its existing count and is incremented, rather than being
  reset to 1."
  [word]
  ;; fnil supplies 0 for a missing word so inc works on first insertion.
  (swap! NWORDS update-in [word] (fnil inc 0)))
;; Example usage, evaluated in order when this file is loaded.

;; Score the corrector against the second test set; yields [total wrong].
(testing-speller test2)

;; Look up corrections for two sample inputs.
(correct "exstacy")
(correct "hizzy")

;; Register "hizzy" in NWORDS so that later lookups treat it as known.
(add-word "hizzy")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment