Last active
October 11, 2015 12:27
-
-
Save language-engineering/3858818 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os, collections, nltk | |
class SpellChecker(object):
    '''Single-edit spelling corrector driven by a word probability distribution.

    A token is left alone when it is already a known word or is not purely
    alphabetic. Otherwise every string one edit away (delete, transpose,
    replace, insert) is generated, and the known candidate with the highest
    probability is returned.
    '''

    # Default location of the training text. Pass *training_path* to the
    # constructor to train from a copy stored somewhere else.
    DEFAULT_TRAINING_PATH = os.path.join(
        't:\\', 'Departments', 'Informatics', 'LanguageEngineering',
        'data', 'gutenberg', 'spelling.txt')

    def __init__(self, probability_distribution=None, training_path=None):
        '''Set up the checker from a ready-made distribution or a training file.

        probability_distribution -- optional object providing ``samples()``
            (the known words) and ``prob(word)``. When given, no file is read.
        training_path -- optional path to a whitespace-separated training
            text; only used when no distribution is supplied. Defaults to
            ``DEFAULT_TRAINING_PATH``.
        '''
        # Compare against None explicitly so that a supplied distribution that
        # happens to evaluate as false is still honoured.
        if probability_distribution is not None:
            self.probabilities = probability_distribution
        else:
            if training_path is None:
                training_path = self.DEFAULT_TRAINING_PATH
            with open(training_path) as fh:
                samples = fh.read().split()
            fd = nltk.probability.FreqDist(samples)
            self.probabilities = nltk.probability.LidstoneProbDist(fd, 0.001)
        # A frozenset gives constant-time membership tests and lets
        # set.intersection() iterate over the smaller operand instead of
        # scanning the whole vocabulary on every lookup.
        self.NWORDS = frozenset(self.probabilities.samples())
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def edits1(self, word):
        '''Generate all tokens of an edit-distance of 1 away from *word*.

        Returns a set of strings; most of them are not real words.
        '''
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def known(self, words):
        '''Return the set of tokens in *words* that appear in our training data.'''
        return set(words).intersection(self.NWORDS)

    def correct(self, word):
        '''Given a word, spellcheck it and return the suggested token.

        Known words and non-alphabetic tokens (punctuation, numbers, the empty
        string) are returned unchanged, as is any word for which no known
        candidate exists one edit away.
        '''
        if not word.isalpha() or word in self.NWORDS:
            return word
        # Keep only the one-edit candidates that were seen in training.
        known_edits1 = self.known(self.edits1(word))
        if not known_edits1:
            # No replacement found: give up and hand back the original.
            return word
        # Choose the candidate the distribution considers most probable.
        return max(known_edits1, key=self.probabilities.prob)
# Example usage: build a checker from the default training data, then print
# each sample token next to the correction suggested for it.
s = SpellChecker()
example_tokens = ["neve", "ascrybe", "to", "malise", ",", "tat", "whic", "can", "be", "explianed", "by", "incompetense"]
for word in example_tokens:
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("%s --> %s" % (word, s.correct(word)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment