Created
July 9, 2013 01:47
-
-
Save stuartsierra/5954022 to your computer and use it in GitHub Desktop.
Example implementation of Norvig's Spellchecker in Clojure, using core.async
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Example implementation of Norvig's Spellchecker in Clojure, | |
;; using core.async | |
;; | |
;; There are probably some bugs in this. | |
;; | |
;; Original problem: https://github.com/ericnormand/spelling-jam | |
;; from Lambda Jam, Chicago, 2013: http://lambdajam.com/ | |
;; | |
;; Clojure core.async introduction: | |
;; http://clojure.com/blog/2013/06/28/clojure-core-async-channels.html | |
(ns spellcheck | |
(:require [clojure.string :as str] | |
[clojure.core.async :refer :all])) | |
(defn word-freqs [words] | |
(let [c (double (count words))] | |
(reduce-kv (fn [m k v] | |
(assoc m k (/ (double v) c))) | |
{} | |
(frequencies words)))) | |
(def alphabet (map char (range (int \a) (inc (int \z))))) | |
;; From Alex Miller, | |
;; https://github.com/puredanger/spelling-corrector/blob/master/src/spelling/core.clj | |
(defn edits1 [word] | |
(let [splits (for [i (range (inc (count word)))] [(subs word 0 i) (subs word i)]) | |
deletes (for [[a b] splits :when (not (str/blank? b))] (str a (subs b 1))) | |
transposes (for [[a b] splits :when (> (count b) 1)] (str a (second b) (first b) (subs b 2))) | |
replaces (for [[a b] splits :when (> (count b) 0) | |
c alphabet] | |
(str a c (subs b 1))) | |
inserts (for [[a b] splits | |
c alphabet] | |
(str a c b))] | |
(set (concat deletes transposes replaces inserts)))) | |
(def max-working-set-size | |
"Hard limit to constrain the size of the search space for possible | |
words. Prevents the lazy sequence from trying forever to find the | |
next possible correction." | |
1000) | |
(defn corrections | |
"Given a dictionary (map of word frequencies) and a collection of | |
corrections being considered, returns an infinite lazy sequence of | |
corrections." | |
[dict ws] | |
(when (< (count ws) max-working-set-size) | |
(lazy-seq | |
(let [es (mapcat edits1 ws) | |
ws (->> es | |
(filter #(contains? dict %)) | |
(sort-by (comp - dict)))] | |
(concat ws (corrections dict es)))))) | |
(defn read-corpus | |
"Returns a sequence of words read from a file." | |
[file] | |
(re-seq #"[a-z]+" (clojure.string/lower-case (slurp file)))) | |
(def corpus-words | |
(delay (read-corpus | |
"big.txt"))) | |
(def corpus-freqs | |
(delay (word-freqs @corpus-words))) | |
(defn correct | |
"Returns a lazy sequence of possible corrections for word." | |
[word] | |
(distinct | |
(concat (when (contains? @corpus-freqs word) [word]) | |
(corrections @corpus-freqs [word])))) | |
(defn chan-seq | |
"Returns a channel on which successive elements of the lazy sequence | |
s will be put." | |
[s] | |
(let [out (chan)] | |
(go (loop [ss s] | |
(when-first [x ss] | |
(>! out x) | |
(recur (rest ss))))) | |
out)) | |
(defn corr-chan | |
"Returns a channel on which possible spelling corrections for word | |
will be written." | |
[word] | |
(chan-seq (correct word))) | |
(defn wait-max | |
"Returns a channel which receives successive values from channel ch. | |
If any element takes longer than msec milliseconds to produce, | |
closes the channel." | |
[msec ch] | |
(let [out (chan)] | |
(go (loop [] | |
(alt! ch ([x] (if x | |
(do (>! out x) (recur)) | |
(close! out))) | |
(timeout msec) (close! out)))) | |
out)) | |
(defn take-pr | |
"Reads and prints (to Java standard output) successive values from | |
channel ch until it is closed." | |
[ch] | |
(go (loop [] | |
(when-let [x (<! ch)] | |
(binding [*out* (java.io.PrintWriter. System/out)] | |
(prn x)) | |
(recur))))) | |
(defn correcting | |
"Searches for possible corrections to word and prints them to Java | |
standard output. Takes at most timeout milliseconds to generate a | |
correction." | |
[timeout word] | |
(take-pr (wait-max timeout (corr-chan word)))) | |
(comment | |
;; Usage examples. | |
;; If you're using this from a REPL client such as Emacs/nREPL, | |
;; remember that the output will appear in the STDOUT of the Java | |
;; process, not your REPL. | |
(correcting 5000 "colour") | |
;; "colour" | |
;; "color" | |
;; "colours" | |
;; "colon" | |
;; "colors" | |
;; "colony" | |
(correcting 5000 "milenium") | |
;; "selenium" | |
;; "millennium" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment