Last active
October 12, 2015 21:08
-
-
Save lojic/922187f2a9cc91bde0e7 to your computer and use it in GitHub Desktop.
First draft of a port of Peter Norvig's spelling corrector to Racket
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
; Thanks to Vincent St-Amour and Sam Tobin-Hochstadt for their tips on #racket | |
(define (words text) | |
(regexp-match* #rx"[a-z]+" (string-downcase text))) | |
(define (train features) | |
(define model (make-hash)) | |
(for ([f features]) | |
(hash-update! model f add1 1)) | |
model) | |
(define nwords | |
(train (words (file->string "spelling-words.txt")))) | |
(define alphabet "abcdefghijklmnopqrstuvwxyz") | |
(define (edits1 word) | |
(let* ([splits (for/list ([i (in-range (+ (string-length word) 1))]) | |
(cons (substring word 0 i) (substring word i)))] | |
[deletes (for/set ([(a b) (in-dict splits)] | |
#:when (> (string-length b) 0)) | |
(string-append a (substring b 1)))] | |
[transposes (for/set ([(a b) (in-dict splits)] | |
#:when (> (string-length b) 1)) | |
(string-append a (substring b 1 2) (substring b 0 1) (substring b 2)))] | |
[replaces (for/set ([(a b) (in-dict splits)] | |
#:when (> (string-length b) 0) | |
[c alphabet]) | |
(string-append a (string c) (substring b 1)))] | |
[inserts (for*/set ([(a b) (in-dict splits)] | |
[c alphabet]) | |
(string-append a (string c) b))]) | |
(set-union deletes transposes replaces inserts))) | |
(define (known-edits2 word) | |
(for*/set ([e1 (edits1 word)] | |
[e2 (edits1 e1)] | |
#:when (hash-has-key? nwords e2)) | |
e2)) | |
(define (known words) | |
(for/set ([w words] #:when (hash-has-key? nwords w)) | |
w)) | |
; If the set argument is non-empty, return it; otherwise, return #f | |
; nes = non-empty set | |
(define (nes set) | |
(if (set-empty? set) | |
#f | |
set)) | |
(define (correct word) | |
(let ([candidates (or (nes (known (list word))) | |
(nes (known (edits1 word))) | |
(nes (known-edits2 word)) | |
(set word))]) | |
(argmax (λ (w) (hash-ref nwords w 1)) (set->list candidates)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment