Created
October 2, 2017 18:18
-
-
Save PandaWhoCodes/aa05c89bdaf2de0a776a0d1e684eb225 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter | |
# Peter Norvigs spell checker | |
def words(text): return re.findall(r'\w+', text.lower()) | |
WORDS = Counter(words(open('big.txt').read())) | |
def P(word, N=sum(WORDS.values())): | |
"Probability of `word`." | |
return WORDS[word] / N | |
def correction(word): | |
"Most probable spelling correction for word." | |
return max(candidates(word), key=P) | |
def candidates(word): | |
"Generate possible spelling corrections for word." | |
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) | |
def known(words): | |
"The subset of `words` that appear in the dictionary of WORDS." | |
return set(w for w in words if w in WORDS) | |
def edits1(word): | |
"All edits that are one edit away from `word`." | |
letters = 'abcdefghijklmnopqrstuvwxyz' | |
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] | |
deletes = [L + R[1:] for L, R in splits if R] | |
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] | |
replaces = [L + c + R[1:] for L, R in splits if R for c in letters] | |
inserts = [L + c + R for L, R in splits for c in letters] | |
return set(deletes + transposes + replaces + inserts) | |
def edits2(word): | |
"All edits that are two edits away from `word`." | |
return (e2 for e1 in edits1(word) for e2 in edits1(e1)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment