""" | |
Python implementation of Viterbi algorithm for word segmentation | |
A clean-up of this: http://norvig.com/ngrams/ch14.pdf | |
- | |
You also need 'unigrams.txt' and 'bigrams.txt' to run the segmentation. The ngrams | |
used in this implementation is from the 'count_1w.txt' and 'count_2w.txt' provided | |
here: http://norvig.com/ngrams/ | |
- | |
Usage: | |
>>> from segment import viterbi | |
>>> viterbi('thisisasentence') | |
(-8.89803279104842, ['this', 'is', 'a', 'sentence']) | |
>>> viterbi('idontthinkyoucandealwiththis') | |
(-15.311752931970325, ['i', 'dont', 'think', 'you', 'can', 'deal', 'with', 'this']) | |
Original code: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6 | |
Author: Mark Dong <[email protected]> | |
""" | |
import math
import functools
class ProbDist(dict):
    """Probability distribution estimated from unigram/bigram count data."""
    def __init__(self, datafile=None, unigram=True, N=1024908267229):
        # Each line of the data file is a tab-separated "token<TAB>count" pair.
        with open(datafile) as f:
            for line in f:
                k, v = line.rstrip().split('\t')
                self[k] = self.get(k, 0) + int(v)
        if unigram:
            # Penalise unknown words by their length, so long character runs
            # are not glued together into a single non-word.
            self.unknownprob = lambda k, N: 10 / (N * 10 ** len(k))
        else:
            self.unknownprob = lambda k, N: 1 / N
        self.N = N

    def __call__(self, key):
        if key in self:
            return self[key] / self.N
        else:
            return self.unknownprob(key, self.N)
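
# A quick sketch of how ProbDist is meant to be fed (hypothetical counts; the
# real numbers come from Norvig's count_1w.txt/count_2w.txt mentioned in the
# module docstring). Each data-file line is "token<TAB>count":
#
#   unigrams.txt:   the<TAB>23135851162
#   bigrams.txt:    of the<TAB>2766332391
#
#   >>> P = ProbDist('unigrams.txt')
#   >>> P('the')    # known key: count / N
#   >>> P('qzxv')   # unknown key: length-penalised fallback probability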
P_unigram = ProbDist('unigrams.txt')
P_bigram = ProbDist('bigrams.txt', False)
def conditionalProb(word_curr, word_prev):
    """Conditional probability of the current word given the previous word."""
    try:
        return P_bigram[word_prev + ' ' + word_curr] / P_unigram[word_prev]
    except KeyError:
        # Unseen bigram (or unseen previous word): back off to the unigram.
        return P_unigram(word_curr)
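
# For example, viterbi() below starts every segmentation with prev='<S>', so
# the first word is scored against an '<S> word' bigram when bigrams.txt
# contains sentence-start entries (Norvig's count_2w.txt does), and backs off
# to the plain unigram probability otherwise:
#
#   >>> conditionalProb('this', '<S>')   # P('<S> this') / P('<S>'), or P('this')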
@functools.lru_cache(maxsize=2**10)
def viterbi(text, prev='<S>', maxlen=20):
    if not text:
        return 0.0, []
    # Try every split point up to maxlen characters for the first word.
    textlen = min(len(text), maxlen)
    splits = [(text[:i + 1], text[i + 1:]) for i in range(textlen)]
    candidates = []
    for first_word, remainder in splits:
        first_prob = math.log10(conditionalProb(first_word, prev))
        # Memoisation via lru_cache turns this recursion into a dynamic
        # programme: each (suffix, previous word) pair is solved at most once.
        remain_prob, remain_words = viterbi(remainder, first_word)
        candidates.append((first_prob + remain_prob, [first_word] + remain_words))
    # Best segmentation = highest total log-probability.
    return max(candidates)
if __name__ == '__main__':
    # Interactive demo; the __main__ guard keeps `from segment import viterbi`
    # (as in the docstring) from starting this loop on import.
    while True:
        v = input('> ')
        v = v.lower().replace(' ', '')
        print('Normalized to:', v)
        z = viterbi(v)
        print('Viterbi:', z)
        print('Result:', ' '.join(z[1]))
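
# Example session, reusing the numbers from the module docstring (the exact
# log-probabilities depend on your unigrams.txt/bigrams.txt):
#
#   > This is a sentence
#   Normalized to: thisisasentence
#   Viterbi: (-8.89803279104842, ['this', 'is', 'a', 'sentence'])
#   Result: this is a sentence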