Skip to content

Instantly share code, notes, and snippets.

@g10guang
Last active September 24, 2019 11:50
Show Gist options
  • Save g10guang/c9926b60183690a500d698851aca5543 to your computer and use it in GitHub Desktop.
Save g10guang/c9926b60183690a500d698851aca5543 to your computer and use it in GitHub Desktop.
Correct typo according to http://norvig.com/spell-correct.html
#%%
letter = 'abcdefghijklmnopqrstuvwxyz'
import requests, re, collections
# fetch the big.txt to calculate every words probability.
big_text = requests.get("http://norvig.com/big.txt").text
#use regex to match all words and then count their frequency
statistic = collections.Counter(re.findall("\w+", big_text))
#%%
def edit1(word):
"""
candidate words which edit distance is 1
"""
split = [(word[:i], word[i:]) for i in range(len(word)+1)]
delete_one = [L + R[1:] for L, R in split if len(R) > 0]
add_one = [L + w + R for L, R in split for w in letter]
replace_one = [L + w + R[1:] for L, R in split if len(R) > 0 for w in letter]
transcode = [L + R[1] + R[0] + R[2:] for L, R in split if len(R) > 2]
return delete_one + add_one + replace_one + transcode
def P(word):
"""
Calculate every words frequency
"""
return statistic[word] / sum(statistic.values())
def edit2(word):
"""
edit distance is 2, edit2(word) = edit1(edit1(word))
"""
return (w for e in edit1(word) for w in edit1(e))
def known(words):
"""
filter words if not exist in big.txt
"""
return list(filter(lambda w: w in statistic, words))
def candidates(word):
"""
1、if word exist, it is the corret spelling.
2、edit distance 1 exist word
3、edit distance 2 exist word
4、unknown word and edit distance 1 & 2 both does not exist, return the word
"""
# print(set(known(edit1(word))))
# print(set(known(edit2(word))))
return (known([word]) or known(edit1(word)) or known(edit2(word)) or [word])
def correction(word):
"""
In all candidate just choose the highest probability word.
"""
return max(candidates(word), key=P)
#%%
print(correction('worlw'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment