Created
July 11, 2013 07:40
-
-
Save joehillen/5973352 to your computer and use it in GitHub Desktop.
A crappy spellchecker implementation I wrote for a job interview. I didn't get the job =P
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Written by Joe Hillenbrand <[email protected]> 2012 | |
All rights reserved! | |
""" | |
import string | |
import re | |
import sys | |
def spellcheck1(word): | |
''' | |
The following is my first attempt. | |
The idea was to generate a regular expression for the given word and return | |
the first match found. | |
Unfortnately, this would give bad results for already correct words or less | |
than ideal matches. For example: | |
> sheeeeep | |
shap | |
> sheeple | |
shiploa | |
> jjoooobb | |
jab | |
> FOOD | |
fad | |
This method gives no way to compare possible matches and determine the best | |
one. | |
I'm leaving this here to demonstrate my thought process. | |
The following are tests using `doctest`. To run the tests do: | |
python spellcheck.py -t | |
TESTS | |
>>> spellcheck1('sandwich') | |
'sandwich' | |
>>> spellcheck1('little') | |
'little' | |
>>> spellcheck1('sheeeeep') | |
'sheep' | |
>>> spellcheck1('peepple') | |
'people' | |
>>> spellcheck1('sheeple') | |
'NO SUGGESTION' | |
>>> spellcheck1('inSIDE') | |
'inside' | |
>>> spellcheck1('jjoobbb') | |
'jab' | |
>>> spellcheck1('weke') | |
'wake' | |
>>> spellcheck1('CUNsperrICY') | |
'conspiracy' | |
>>> spellcheck1('ffoaoaoaoaoaoaaoaoaoaoaoadd') | |
'food' | |
>>> spellcheck1('FOOD') | |
'food' | |
PATHOLOGICAL TESTS | |
>>> spellcheck1('') | |
'NO SUGGESTION' | |
>>> spellcheck1('1337') | |
'NO SUGGESTION' | |
>>> spellcheck1('woot!') | |
'NO SUGGESTION' | |
>>> spellcheck1('two words') | |
'NO SUGGESTION' | |
>>> spellcheck1(u'ಠ_ಠ') | |
'NO SUGGESTION' | |
>>> spellcheck1('fo0bar') | |
'NO SUGGESTION' | |
''' | |
vowels = 'aeiou' # let's not consider 'y' a vowel in this case | |
# generate all the consonants | |
consonants = [c for c in string.ascii_lowercase if c not in vowels] | |
''' | |
The plan is to generate to a regular expression (wordre) that can be used to | |
search the dictionary. | |
''' | |
wordre = '' | |
prev_ltr = '' | |
word = word.lower() | |
for ltr in word: | |
ltr = ltr.lower() | |
if ltr in consonants: | |
if ltr != prev_ltr: | |
wordre = ''.join([wordre, ltr, '+']) | |
prev_ltr = ltr | |
elif ltr in vowels: | |
if prev_ltr not in vowels or prev_ltr == '': | |
wordre = ''.join([wordre, '[', vowels, ']+']) | |
prev_ltr = ltr | |
else: | |
return 'NO SUGGESTION' | |
if len(wordre) <= 0: | |
return 'NO SUGGESTION' | |
regex = re.compile(wordre) | |
with open('/usr/share/dict/words','r') as words: | |
for w in words: | |
m = regex.match(w) | |
if m is not None: | |
return m.group(0) | |
return 'NO SUGGESTION' | |
class SpellCheck: | |
''' | |
This is my second attempt. | |
It preloads the dictionary into a set for faster searching and uses | |
a breadth first search algorithm. | |
I am happier with its answers, but it can consume endless memory on | |
pathologically long cases that have lots of vowels. For example, | |
wwoweyuoaweoaueaowewasoiue | |
I'd love to work on this some more, but I've used up all my time. | |
I think I would like to try a solution that combines the 2 methods next, | |
or a version of the first attempt that grabs all the valid words and | |
determines the best one. | |
''' | |
def __init__(self): | |
''' | |
Pre-generate a word list so we don't do it at every attempt. | |
Use a set() because it has constant (O(1)) lookup time. | |
http://wiki.python.org/moin/PythonSpeed#Use_the_best_algorithms_and_fastest_tools | |
''' | |
self.words = set() | |
with open('/usr/share/dict/words','r') as f: | |
for word in f: | |
self.words.add(word.strip()) | |
def gen_possible(self, word): | |
''' | |
Generator for all the possible misspellings of "word". | |
This is implemented as a breadth first search. | |
This should also count as the extra credit. | |
''' | |
# First try the word itself | |
yield word | |
# Next, try it in all lowercase | |
word = word.lower() | |
vowels = 'aeiou' # let's not consider 'y' a vowel in this case | |
revowel = re.compile(r'[aeiou]') | |
repeat = re.compile(r'([a-z])\1') | |
visited = set() | |
possible = [word] | |
while len(possible) > 0: | |
word = possible.pop() | |
yield word | |
if word not in visited: | |
# Now try replacing vowels | |
matches = revowel.finditer(word) | |
for m in matches: # m is for "match" | |
s = word[:m.start(0)] # s is for "start" | |
e = word[m.end(0):] # e is for "end" | |
for v in vowels: # v is for "vowel" | |
w = ''.join([s, v, e]) # w is for "word" | |
if w not in possible: | |
possible.append(w) | |
# Try removing duplicates | |
matches = repeat.finditer(word) | |
for m in matches: | |
s = word[:m.start(0)] # s is for "start" | |
e = word[m.end(0)-1:] # e is for "end" | |
w = ''.join([s, e]) # w is for "word" | |
if w not in possible: | |
possible.append(w) | |
visited.add(word) | |
return | |
def spellcheck(self, word): | |
''' | |
The following are tests using `doctest`. To run the tests do: | |
python spellcheck.py -t | |
TESTS | |
>>> s.spellcheck('sandwich') | |
'sandwich' | |
>>> s.spellcheck('little') | |
'little' | |
>>> s.spellcheck('sheeeeep') | |
'sheep' | |
>>> s.spellcheck('peepple') | |
'people' | |
>>> s.spellcheck('sheeple') | |
'NO SUGGESTION' | |
>>> s.spellcheck('inSIDE') | |
'inside' | |
>>> s.spellcheck('jjoobbb') | |
'job' | |
>>> s.spellcheck('weke') | |
'wiki' | |
>>> s.spellcheck('CUNsperrICY') | |
'conspiracy' | |
>>> s.spellcheck('FOOD') | |
'food' | |
PATHOLOGICAL TESTS | |
>>> s.spellcheck('') | |
'NO SUGGESTION' | |
>>> s.spellcheck('1337') | |
'NO SUGGESTION' | |
>>> s.spellcheck('woot!') | |
'NO SUGGESTION' | |
>>> s.spellcheck('two words') | |
'NO SUGGESTION' | |
>>> s.spellcheck(u'ಠ_ಠ') | |
'NO SUGGESTION' | |
>>> s.spellcheck('fo0bar') | |
'NO SUGGESTION' | |
''' | |
for w in self.gen_possible(word): | |
if w in self.words: | |
return w | |
return 'NO SUGGESTION' | |
if __name__ == '__main__': | |
s = SpellCheck() | |
if len(sys.argv) > 1 and sys.argv[1] == '-t': | |
import doctest | |
doctest.testmod(extraglobs={'s': s }) | |
sys.exit(0) | |
while True: | |
word = raw_input('> ') | |
print s.spellcheck(word) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment