Created
March 20, 2022 13:41
-
-
Save benob/df806ee98120d5f62a5849aa109ef3ee to your computer and use it in GitHub Desktop.
Minimalist implementation of Symmetric Delete Spelling Correction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter, defaultdict | |
# create lexicon with word frequency from big text | |
def words(text): return re.findall(r'\w+', text.lower()) | |
WORDS = Counter(words(open('big.txt').read())) | |
# generate all deletion edits, plus original | |
def edits(word): | |
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] | |
deletes = [L + R[1:] for L, R in splits if R] | |
return deletes + [word] | |
# expand lexicon with edited words | |
EXPANDED = defaultdict(list) | |
for word in WORDS: | |
for deletion in edits(word): | |
EXPANDED[deletion].append(word) | |
# compute word frequency | |
def P(word, N=sum(WORDS.values())): | |
return WORDS[word] / N | |
# match all 1-deletion edits from word to all 1-deletion edits from lexicon | |
def correction(word): | |
candidates = [] | |
for deletion in edits(word): | |
candidates += EXPANDED[deletion] | |
return max(set(candidates), key=P) | |
print('lenght =>', correction('lenght')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment