Last active
January 1, 2016 14:19
-
-
Save BinRoot/8157165 to your computer and use it in GitHub Desktop.
Haskell implementation of Norvig's simplified algorithm to autocorrect a spelling mistake
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # download the code | |
| wget https://gist.github.com/BinRoot/8157165/raw/9109a51173b76c4e233fd00165fb25e62b249aa5/spell.hs | |
| # download big.txt | |
| wget http://norvig.com/big.txt | |
| # run the code | |
| runhaskell spell |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Data.Char (isAlpha, isSpace, toLower) | |
| import Data.List (group, sort, sortBy) | |
| import Data.Map (fromListWith, Map, member, (!)) | |
| main = do | |
| rawText <- readFile "big.txt" -- http://norvig.com/big.txt | |
| let m = train $ getWords rawText | |
| let sentence = "such codez many haskell very spellz so korrect" | |
| print $ autofix m sentence | |
| autofix m sentence = unwords $ map (correct m) (words sentence) | |
| getWords str = words $ filter (\x -> isAlpha x || isSpace x) lower | |
| where lower = map toLower str | |
| train :: Ord a => [a] -> Map a Int | |
| train = fromListWith (+) . (`zip` repeat 1) | |
| edits1 word = unique $ deletes ++ transposes ++ replaces ++ inserts | |
| where splits = [ (take i word', drop i word') | | |
| i <- [0..length word']] | |
| deletes = [ a ++ (tail b) | | |
| (a,b) <- splits, (not.null) b] | |
| transposes = [ a ++ [b !! 1] ++ [head b] ++ (drop 2 b) | | |
| (a,b) <- splits, length b > 1 ] | |
| replaces = [ a ++ [c] ++ (drop 1 b) | | |
| (a,b) <- splits, c <- alphabet, (not.null) b ] | |
| inserts = [a ++ [c] ++ b | | |
| (a,b) <- splits, c <- alphabet ] | |
| alphabet = ['a'..'z'] | |
| word' = map toLower word | |
| editsN word n = last $ take n $ iterate edits1' $ edits1 word | |
| where edits1' ls = unique $ concat $ map edits1 ls | |
| unique = map head.group.sort | |
| known [] _ = [] | |
| known (w:ws) m | |
| | w `member` m = w : known ws m | |
| | otherwise = known ws m | |
| correct m word = head $ sortBy (\a b -> compare (m!b) (m!a)) candidates | |
| where candidates = head $ filter (not.null) | |
| [ known [word] m | |
| , known (edits1 word) m | |
| , known (editsN word 2) m | |
| , [word] ] |
Author
@atondwal good catch! Added a line to download big.txt. curl is typically available on more platforms, but I switched to wget for that GNU karma.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
why not use
wgetinstead ofcurl? (and what aboutbig.txt?)