Last active
June 16, 2017 12:27
-
-
Save PrithivirajDamodaran/5c9bcc58ed6c425b1e343c7973e9f863 to your computer and use it in GitHub Desktop.
Companion code to the post - http://bytecontinnum.com/natural-language-searches-lessons-spellcheck-autocorrect/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # by Prithiviraj Damodaran | |
| # Based on Peter Norvig’s blog post and toy spell correction logic | |
| from __future__ import division | |
| from memo import memo | |
| import re | |
| from collections import Counter | |
| def words(text): return re.findall(r'\w+', text.lower()) | |
| WORDS = Counter(words(open('big.txt').read())) | |
| def stopwordFilter(word): | |
| # Industry strength stopword list :-) | |
| stopwords = ['yes','his','her','is','was','the','no','some','what','does'] | |
| if word in stopwords: | |
| return True | |
| else: | |
| return False | |
| def P(word, N=sum(WORDS.values())): | |
| "Probability of `word`." | |
| #print (word, WORDS[word], N, WORDS[word] / N) | |
| return WORDS[word] / N | |
| def correction(word): | |
| "Most probable spelling correction for word." | |
| output = max(candidates(word), key=P) | |
| if stopwordFilter(output) == False: | |
| return output | |
| def candidates(word): | |
| "Generate possible spelling corrections for word." | |
| return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) | |
| def known(words): | |
| "The subset of `words` that appear in the dictionary of WORDS." | |
| return set(w for w in words if w in WORDS) | |
| def edits1(word): | |
| "All edits that are one edit away from `word`." | |
| letters = 'abcdefghijklmnopqrstuvwxyz' | |
| splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] | |
| deletes = [L + R[1:] for L, R in splits if R] | |
| transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] | |
| replaces = [L + c + R[1:] for L, R in splits if R for c in letters] | |
| inserts = [L + c + R for L, R in splits for c in letters] | |
| return set(deletes + transposes + replaces + inserts) | |
| def edits2(word): | |
| "All edits that are two edits away from `word`." | |
| return (e2 for e1 in edits1(word) for e2 in edits1(e1)) | |
| def splits(text, L=5): | |
| "Return a list of all possible (first, rem) pairs, len(first)<=L." | |
| splitsa = [(text[:i+1], text[i+1:]) | |
| for i in range(max(len(text), L))] | |
| #print splitsa | |
| return splitsa | |
| def segmentW(text): | |
| out = segment(text) | |
| output = [] | |
| for o in range(len(out)): | |
| output.append(correction(out[o])) | |
| print out | |
| return output | |
| @memo | |
| def segment(text): | |
| "Return a list of words that is the best segmentation of text." | |
| if not text: return [] | |
| candidates = ([first]+[rem] for first,rem in splits(text)) | |
| return max(candidates, key=Pwords) | |
| def Pwords(words): | |
| "The Naive Bayes probability of a sequence of words." | |
| prod = 0.0 | |
| prod = product(P(correction(w)) for w in words) | |
| return prod | |
| def product(nums): | |
| "Multiply the numbers together. (Like `sum`, but with multiplication.)" | |
| result = 1 | |
| for x in nums: | |
| result *= x | |
| return result | |
| if __name__ == '__main__': | |
| #print(segmentW('victirusecreet')) | |
| print(segmentW('runningsooes')) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment