Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Last active October 11, 2015 12:27
Show Gist options
  • Select an option

  • Save language-engineering/3858818 to your computer and use it in GitHub Desktop.

Select an option

Save language-engineering/3858818 to your computer and use it in GitHub Desktop.
import os, collections, nltk
class SpellChecker(object):
    '''Spelling corrector that searches for known words one edit away from a token.

    Candidates are built from single-character deletions, transpositions,
    replacements and insertions. Among the candidates that occur in the
    training vocabulary, the one with the highest probability is returned.
    '''

    # Default location of the gutenberg training text. When working from home,
    # pass *training_path* to the constructor rather than editing this value.
    DEFAULT_TRAINING_PATH = os.path.join(
        't:\\', 'Departments', 'Informatics', 'LanguageEngineering',
        'data', 'gutenberg', 'spelling.txt')

    def __init__(self, probability_distribution=None, training_path=None):
        '''Prepare the vocabulary and the probabilities used to rank corrections.

        probability_distribution -- optional object offering ``samples()`` and
            ``prob(word)``; when supplied it is used as-is and no file is read.
        training_path -- optional path to a whitespace-separated training text.
            Only consulted when no distribution is supplied; defaults to
            ``DEFAULT_TRAINING_PATH``.
        '''
        # Compare against None explicitly so that a supplied distribution is
        # never ignored merely because it evaluates as false.
        if probability_distribution is not None:
            self.probabilities = probability_distribution
        else:
            if training_path is None:
                training_path = self.DEFAULT_TRAINING_PATH
            with open(training_path) as fh:
                samples = fh.read().split()
            fd = nltk.probability.FreqDist(samples)
            self.probabilities = nltk.probability.LidstoneProbDist(fd, 0.001)
        # Stored as a frozenset so that known() costs time proportional to the
        # number of candidate words instead of scanning the whole vocabulary.
        self.NWORDS = frozenset(self.probabilities.samples())
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def edits1(self, word):
        '''Generate all tokens of an edit-distance of 1 away from *word*.

        Returns a set of strings. Because a character may be replaced by
        itself, *word* is part of the result whenever it is non-empty.
        '''
        # Every way of cutting *word* into a (head, tail) pair, including the
        # cuts with an empty head or an empty tail.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def known(self, words):
        '''Return the set of tokens in *words* that appear in our training data.'''
        return set(words).intersection(self.NWORDS)

    def correct(self, word):
        '''Given a word, spellcheck it.

        Known words and non-alphabetic tokens are returned unchanged. Otherwise
        the most probable known word one edit away is returned, or *word*
        itself when no such word exists.
        '''
        if self.known([word]) or not word.isalpha():
            return word
        # Keep only those one-edit variants that were seen in training;
        # most generated variants are nonsense words.
        known_edits1 = self.known(self.edits1(word))
        if not known_edits1:
            # No replacement was found, so give up and return the original.
            return word
        return max(known_edits1, key=self.probabilities.prob)
# Example usage: spellcheck a handful of misspelt tokens and print each
# original token next to the correction chosen for it.
s = SpellChecker()
example_tokens = ["neve", "ascrybe", "to", "malise", ",", "tat", "whic", "can", "be", "explianed", "by", "incompetense"]
for word in example_tokens:
    # The parenthesised single-argument form prints the same text whether
    # print is a statement (Python 2) or a function (Python 3).
    print("%s --> %s" % (word, s.correct(word)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment