Skip to content

Instantly share code, notes, and snippets.

@PrithivirajDamodaran
Last active June 16, 2017 12:27
Show Gist options
  • Save PrithivirajDamodaran/5c9bcc58ed6c425b1e343c7973e9f863 to your computer and use it in GitHub Desktop.
Save PrithivirajDamodaran/5c9bcc58ed6c425b1e343c7973e9f863 to your computer and use it in GitHub Desktop.
# by Prithiviraj Damodaran
# Based on Peter Norvig’s blog post and toy spell correction logic
from __future__ import division
from memo import memo
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('big.txt').read()))
def stopwordFilter(word):
# Industry strength stopword list :-)
stopwords = ['yes','his','her','is','was','the','no','some','what','does']
if word in stopwords:
return True
else:
return False
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
#print (word, WORDS[word], N, WORDS[word] / N)
return WORDS[word] / N
def correction(word):
"Most probable spelling correction for word."
output = max(candidates(word), key=P)
if stopwordFilter(output) == False:
return output
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def splits(text, L=5):
"Return a list of all possible (first, rem) pairs, len(first)<=L."
splitsa = [(text[:i+1], text[i+1:])
for i in range(max(len(text), L))]
#print splitsa
return splitsa
def segmentW(text):
out = segment(text)
output = []
for o in range(len(out)):
output.append(correction(out[o]))
print out
return output
@memo
def segment(text):
"Return a list of words that is the best segmentation of text."
if not text: return []
candidates = ([first]+[rem] for first,rem in splits(text))
return max(candidates, key=Pwords)
def Pwords(words):
"The Naive Bayes probability of a sequence of words."
prod = 0.0
prod = product(P(correction(w)) for w in words)
return prod
def product(nums):
"Multiply the numbers together. (Like `sum`, but with multiplication.)"
result = 1
for x in nums:
result *= x
return result
if __name__ == '__main__':
#print(segmentW('victirusecreet'))
print(segmentW('runningsooes'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment