#-------------------------------------------------------------------------------
# Name:        Did you mean 'python spell check'?
# Purpose:     Correct the spelling of an input string.
#
# Author:      Alex Lavin
#
# Created:     17/08/2014
# Copyright:   (c) Lavin 2014
#-------------------------------------------------------------------------------
import re
import string
import operator
from collections import Counter
from functools import reduce  # reduce is not a builtin in Python 3
from math import log10

## Process the input text database
sample_text = open('big.txt').read()

def tokens(text):
    "List all word tokens of text, normalized to lowercase."
    return re.findall('[a-z]+', text.lower())

VOCAB = Counter(tokens(sample_text))
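# For example, punctuation and case are stripped before counting:
#   >>> tokens('This is a test, this IS.')
#   ['this', 'is', 'a', 'test', 'this', 'is']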
## Text analysis functions
def edits_1(word):
    "Return all strings that are one edit away from word."
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]  # all possible splits of word
    insertions = [a+c+b for a,b in splits for c in string.ascii_lowercase]  # insert 'c' at each split point
    deletions = [a+b[1:] for a,b in splits if b]  # at each split, delete the first character of 'b'
    substitutions = [a+c+b[1:] for a,b in splits for c in string.ascii_lowercase if b]  # same as deletion, but additionally insert 'c'
    transpositions = [a+b[1]+b[0]+b[2:] for a,b in splits if len(b)>1]  # swap the first two characters of 'b'
    return set(insertions + deletions + substitutions + transpositions)
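# Sanity check (the counts follow from the 26-letter alphabet above): a word
# of length n yields 26*(n+1) insertions, n deletions, 26*n substitutions,
# and n-1 transpositions -- 54n+25 strings before set() removes duplicates.
#   >>> len(edits_1('at')) <= 54*2 + 25
#   True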
def edits_2(word):  # call the edit function twice -> edit distance two
    "Return all strings that are two edits away from word."
    return set(e2 for e1 in edits_1(word) for e2 in edits_1(e1))

def known(words):
    "Return the subset of words that appear in the dictionary."
    return {w for w in words if w in VOCAB}  # a set, so an empty result is falsy in the 'or' chain below
def correct(word, version=0):
    "Return the best spelling correction of word."
    if version == 1:  # prefer edit distances of 0, 1, then 2, and then default to the original word
        candidates = (known({word}) or
                      known(edits_1(word)) or
                      known(edits_2(word)) or
                      [word])
        return max(candidates, key=VOCAB.get)
    # Default: weigh each candidate by P(edit) * P(word), the noisy-channel model.
    candidates = edits(word).items()
    c, edit = max(candidates, key=lambda ce: Pedit(ce[1]) * P1w(ce[0]))
    return c
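# Usage sketch (the exact corrections depend on the loaded count files, so
# treat these results as illustrative):
#   >>> correct('speling', version=1)   # 'spelling' is one edit away
#   'spelling'
#   >>> correct('speling')              # noisy-channel version; needs P1w/Pedit
#   'spelling'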
def correct_text(text):
    "Return the correct spelling of all words in text."
    return re.sub('[a-zA-Z]+', correct_match, text)

def correct_match(match):
    "Correct the spelling of the matched word: lowercase it, call correct(), then restore the original case."
    word = match.group()
    return case(word)(correct(word.lower()))

def case(text):
    "Return the case function for text."
    return (str.upper if text.isupper() else
            str.lower if text.islower() else
            str.title if text.istitle() else
            str)
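# For example, case() picks the str method that reproduces the input's casing,
# so corrections keep the shape of the original word:
#   >>> case('Teh')('the')
#   'The'
#   >>> case('TEH')('the')
#   'THE'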
## Probability distribution
class Pdist(dict):
    "A probability distribution estimated from counts in a datafile."
    def __init__(self, data=(), N=None, smoothingfn=None):
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.values()))
        self.smoothingfn = smoothingfn or (lambda k, N: 1./N)  # default: unseen keys get probability 1/N
    def __call__(self, key):
        if key in self: return self[key]/self.N
        else: return self.smoothingfn(key, self.N)

def Laplace(counter, c=1):
    "Return an add-c (Laplace) smoothed probability function over counter."
    N = sum(counter.values())
    Nnew = N + c * (len(counter)+1)
    return lambda w: (counter[w]+c) / Nnew
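# A minimal sketch of the two smoothing options (toy counts, not the real data):
#   >>> toy = Pdist([('the', 80), ('cat', 20)])
#   >>> toy('the'), toy('zyzzy')         # seen vs. unseen: 0.8 vs. 1/N = 0.01
#   (0.8, 0.01)
#   >>> P = Laplace(Counter({'the': 80, 'cat': 20}))
#   >>> round(P('zyzzy'), 4)             # add-one smoothing: 1/103
#   0.0097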
def cPw(word, prev):
    "Conditional probability of word, given the previous word."
    bigram = prev + ' ' + word
    # Membership tests, not P2w(...)>0: the smoothed probability of an unseen
    # key is 1/N, which is always positive.
    if bigram in P2w and prev in P1w:
        return P2w(bigram) / P1w(prev)
    else:  # back off: average the unigram estimate and zero
        return P1w(word) / 2

def Pedit(edit):
    "Return the probability of an edit, encoded as '+'-joined single edits."
    if edit == '': return 1. - p_error  # the empty edit: word typed correctly
    return p_error * product(P1edit(e) for e in edit.split('+'))
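# Worked example of the edit-string encoding used by edits() below (a sketch;
# the probabilities depend on count_1edit.txt). Each single edit is a
# 'typed|intended' pair:
#   Pedit('')        == 1 - p_error                               # no edit
#   Pedit('eh|he')   == p_error * P1edit('eh|he')                 # one transposition
#   Pedit('e|i+a|o') == p_error * P1edit('e|i') * P1edit('a|o')   # two edits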
## Segmentation
def memo(f):
    "Memoize function f, whose args must all be hashable."
    cache = {}
    def fmemo(*args):
        if args not in cache:
            cache[args] = f(*args)
        return cache[args]
    fmemo.cache = cache
    return fmemo

@memo
def segment(text, prev='<S>'):
    "Return the best segmentation of text: its words and their logP."
    if not text: return 0.0, []
    candidates = (combine(log10(cPw(first, prev)), first, segment(rest, first))  # log10 to avoid underflow
                  for first, rest in splits(text))
    return max(candidates)
L = max(map(len, VOCAB))  # longest word we've seen
def splits(text, L=20):
    """Return a list of all (first, rest) pairs of the text,
    where 1 <= len(first) <= L+2."""
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text), L+2))]

def combine(Pfirst, first, Prest_rest):
    "Combine first and rest into words/probability."
    Prest, rest = Prest_rest  # Python 3 has no tuple parameters, so unpack here
    return Pfirst + Prest, [first] + rest
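# Hedged example of the segmenter (the split it finds depends entirely on the
# loaded n-gram counts; with typical web counts the natural reading wins):
#   >>> segment('goldengate')
#   (logP, ['golden', 'gate'])    # logP is the summed log10 probability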
## From Norvig
def edits(word, d=2):
    "Return a dict of {correct: edit} pairs within d edits of word."
    results = {}
    def editsR(hd, tl, d, edits):
        def ed(L, R): return edits + [R+'|'+L]
        C = hd + tl
        if C in P1w:
            e = '+'.join(edits)
            if C not in results: results[C] = e
            else: results[C] = max(results[C], e, key=Pedit)
        if d <= 0: return
        extensions = [hd+c for c in string.ascii_lowercase if hd+c in PREFIXES]
        p = (hd[-1] if hd else '<')  ## previous character
        ## Insertion
        for h in extensions:
            editsR(h, tl, d-1, ed(p+h[-1], p))
        if not tl: return
        ## Deletion
        editsR(hd, tl[1:], d-1, ed(p, p+tl[0]))
        for h in extensions:
            if h[-1] == tl[0]:  ## Match
                editsR(h, tl[1:], d, edits)
            else:  ## Replacement
                editsR(h, tl[1:], d-1, ed(h[-1], tl[0]))
        ## Transpose
        if len(tl) >= 2 and tl[0] != tl[1] and hd+tl[1] in PREFIXES:
            editsR(hd+tl[1], tl[0]+tl[2:], d-1,
                   ed(tl[1]+tl[0], tl[0:2]))
    ## Body of edits:
    editsR('', word, d, [])
    return results
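# Sketch of what edits() returns (keys and edit strings depend on P1w and
# PREFIXES, so treat the exact contents as illustrative):
#   >>> edits('teh')
#   {'teh': '', 'the': 'eh|he', 'ten': 'h|n', ...}
# The empty string marks the no-edit candidate; correct() then scores each
# pair with Pedit(edit) * P1w(candidate).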
## Helper functions
def count_data(name, sep='\t'):
    "Yield (key, count) rows from a separated-values datafile."
    with open(name) as f:
        for line in f:
            yield line.split(sep)

def product(numbers):
    "Return the product of a sequence of numbers."
    return reduce(operator.mul, numbers, 1)
## Run program
N = 1024908267229  # token count of the Google Web corpus behind the n-gram count files (not big.txt)
P1w = Pdist(count_data('unigram counts.txt'), N)
P2w = Pdist(count_data('bigram counts.txt'), N)
p_error = 1./10.  # assume a spelling error every 10 words
PREFIXES = set(w[:i] for w in P1w for i in range(len(w) + 1))  # Norvig code
P1edit = Pdist(count_data('count_1edit.txt'))  # probabilities of single edits
tests = ['mimosa brunhc resterants',
         'Where is teh Golden Gtae Birdge?',
         'mimosa brunhc resterants near teh Goldengate Birdge?']
for test in tests:
    new = correct_text(test)
    print('%s -> Did you mean "%s"?' % (test, new))
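# Hedged usage note: the script expects 'big.txt', 'unigram counts.txt',
# 'bigram counts.txt', and 'count_1edit.txt' in the working directory;
# Norvig distributes compatible count files at http://norvig.com/ngrams/.
# With the models loaded, the pieces can also be called directly, e.g.:
#   correct_text('Teh quick brwon fox')
#   logP, words = segment('goldengatebridge')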