Last active
September 24, 2019 11:50
-
-
Save g10guang/c9926b60183690a500d698851aca5543 to your computer and use it in GitHub Desktop.
Correct typo according to http://norvig.com/spell-correct.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% | |
letter = 'abcdefghijklmnopqrstuvwxyz' | |
import requests, re, collections | |
# fetch the big.txt to calculate every words probability. | |
big_text = requests.get("http://norvig.com/big.txt").text | |
#use regex to match all words and then count their frequency | |
statistic = collections.Counter(re.findall("\w+", big_text)) | |
#%% | |
def edit1(word): | |
""" | |
candidate words which edit distance is 1 | |
""" | |
split = [(word[:i], word[i:]) for i in range(len(word)+1)] | |
delete_one = [L + R[1:] for L, R in split if len(R) > 0] | |
add_one = [L + w + R for L, R in split for w in letter] | |
replace_one = [L + w + R[1:] for L, R in split if len(R) > 0 for w in letter] | |
transcode = [L + R[1] + R[0] + R[2:] for L, R in split if len(R) > 2] | |
return delete_one + add_one + replace_one + transcode | |
def P(word): | |
""" | |
Calculate every words frequency | |
""" | |
return statistic[word] / sum(statistic.values()) | |
def edit2(word): | |
""" | |
edit distance is 2, edit2(word) = edit1(edit1(word)) | |
""" | |
return (w for e in edit1(word) for w in edit1(e)) | |
def known(words): | |
""" | |
filter words if not exist in big.txt | |
""" | |
return list(filter(lambda w: w in statistic, words)) | |
def candidates(word): | |
""" | |
1、if word exist, it is the corret spelling. | |
2、edit distance 1 exist word | |
3、edit distance 2 exist word | |
4、unknown word and edit distance 1 & 2 both does not exist, return the word | |
""" | |
# print(set(known(edit1(word)))) | |
# print(set(known(edit2(word)))) | |
return (known([word]) or known(edit1(word)) or known(edit2(word)) or [word]) | |
def correction(word): | |
""" | |
In all candidate just choose the highest probability word. | |
""" | |
return max(candidates(word), key=P) | |
#%% | |
print(correction('worlw')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment