Last active
June 11, 2020 03:40
-
-
Save TrungNguyen1909/04b8c82ba712620ef3ed604de633e34f to your computer and use it in GitHub Desktop.
Simple spellchecker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import sys | |
import re | |
# Path of the dictionary file; every line of it is treated as one word.
WORDLIST = input("Enter dictionary path:")
# Read the dictionary inside a `with` block so the file handle is closed as
# soon as the counts are built, and skip blank lines so the empty string never
# ends up in WORDS as a "known word".
with open(WORDLIST) as _wordlist_file:
    WORDS = Counter(
        entry
        for entry in (line.strip() for line in _wordlist_file)
        if entry
    )
def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word`: its count in WORDS divided by N.

    N defaults to the total of all counts in WORDS, computed once when this
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N
def correction(word):
    """Most probable spelling correction for word.

    Returns the candidate with the highest P. Among equally probable
    candidates the first one produced by candidates() wins. If there are no
    candidates at all, `word` itself is returned.
    """
    # max() keeps the first maximal element, so ties resolve exactly as a
    # stable descending sort followed by [0] would, without sorting the whole
    # candidate collection. `default` avoids a crash on an empty iterable.
    return max(candidates(word), key=P, default=word)
def candidates(word):
    """Generate possible spelling corrections for word.

    Returns the first non-empty option among: the word itself if it is in
    WORDS, the known words one edit away, the known words two edits away.
    If none of these yields a known word, the word itself is returned as the
    only candidate.
    """
    # The last fallback must be a one-element list, not the bare string:
    # iterating a bare string would offer its individual characters as
    # candidate corrections (and nothing at all for an empty string).
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
def known(words):
    """Return, as a set, those entries of `words` that are keys of WORDS."""
    recognised = set()
    for candidate in words:
        if candidate in WORDS:
            recognised.add(candidate)
    return recognised
def edits1(word):
    """Return the set of strings that are one edit away from `word`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    # Walk every cut position, including the one past the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every cut, even at the very end.
        for ch in alphabet:
            results.add(head + ch + tail)
        if not tail:
            # Nothing to the right of the cut: no delete/swap/replace here.
            continue
        # Deletion of the character right after the cut.
        results.add(head + tail[1:])
        # Transposition needs at least two characters after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
        # Replacement of the character right after the cut.
        for ch in alphabet:
            results.add(head + ch + tail[1:])
    return results
def edits2(word):
    """Lazily yield every string that is two edits away from `word`.

    The result is a generator and may produce the same string more than once.
    """
    # The first round of edits is computed right away; the second round is
    # produced on demand as the generator is consumed.
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))
# A "word" is a run of word characters and/or apostrophes.
patterns = re.compile(r'[\w\']+')
text = input("Enter path to text needed checking:")
with open(text,"r") as f:
    lines = f.readlines()

# Stays True until the first reported correction.
ok = True
for lidx, content in enumerate(lines):
    # Visit every word on the line from left to right.
    for m in patterns.finditer(content):
        word = m.group(0)
        fixed = correction(word)
        # Report only real changes; single-character results are not reported.
        if word != fixed and len(fixed) != 1:
            ok = False
            # Output format: <file>:<1-based line>:<1-based column>: old -> new
            print(F'{f.name}:{lidx+1}:{m.start()+1}: {word} -> {fixed}')
if ok:
    print("Hooray! No spelling mistakes was found!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment