Created
January 3, 2010 17:40
-
-
Save emk/268055 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
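# Viterbi part-of-speech (POS) tagger using numpy and some libraries I wrote
# (the `corpus` and `train` modules imported below).
#
# To use:
#
#   >>> viterbi.pos_tag('i want to go for a run .')
#   ['PPSS', 'VB', 'TO', 'VB', 'IN', 'AT', 'NN']
#   >>> viterbi.pos_tag('i want to run .')
#   ['PPSS', 'VB', 'TO', 'VB']
#
# NOTE: pos_tag() returns one tag per token, so the sample outputs above show
# one tag fewer than the code produces (the tag for the final '.' appears to
# have been left out).
#
# -Eric Kidd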
import corpus | |
import train | |
from numpy import * | |
# Number of entries in corpus.ID_TAG; used as the width of the per-tag arrays
# built by the tagger.
# NOTE (original author): must be an even number to avoid a numpy crash!
tag_count = len(corpus.ID_TAG)
def pos_tag(sentence):
    """Tag every token of `sentence` with its most likely part of speech.

    Runs the Viterbi algorithm using the tag-to-tag table
    `train.TAG_TAG_PROB` and the word-to-tag table `train.WORD_TAG_PROB`.

    Args:
        sentence: A string of whitespace-separated tokens, e.g.
            'i want to run .'.

    Returns:
        A list holding one `corpus.ID_TAG` entry per token, in sentence
        order.  An empty or whitespace-only sentence yields [].
    """
    # Split our sentence into tokens (primitive).  split() with no argument
    # ignores repeated/leading/trailing whitespace, so no empty tokens are
    # looked up.  The leading '.' is a sentinel: position 0 is pinned to the
    # '.' tag below.
    tokens = ['.'] + sentence.split()
    steps = len(tokens)

    # trellis[i, t] is the (rescaled) score of the best tag path that ends
    # with tag t at token i.  Column 0 has a known probability.
    trellis = zeros((steps, tag_count), dtype=float32)
    trellis[0, corpus.tag_id('.')] = 1.0

    # backtrace[i, t] is the tag at token i-1 on that best path.  A full-width
    # integer is used because uint8 silently wraps for tag ids >= 256.
    backtrace = zeros((steps, tag_count), dtype=int)

    # Dense copy made once; rows are indexed by the previous tag and columns
    # by the current tag (see the axis-0 reductions below).
    tag_tag_probs = array(train.TAG_TAG_PROB.todense())

    # Iterate over the remaining words.
    for i in range(1, steps):
        word_id = corpus.word_id(tokens[i])
        word_tag_probs = array(
            train.WORD_TAG_PROB[word_id, :].todense()).ravel()

        # scaled[p, t] = score of previous tag p * tag_tag_probs[p, t].
        # Broadcasting gives the same values as dot(diag(prev), probs)
        # without building a diagonal matrix or doing a full matrix product.
        prev_tag_probs = trellis[i - 1, :]
        scaled = prev_tag_probs[:, None] * tag_tag_probs

        backtrace[i, :] = argmax(scaled, 0)
        column = amax(scaled, 0) * word_tag_probs

        # Rescale so the best score in the column is 1.0.  Dividing by a
        # positive constant leaves every argmax unchanged, but stops the
        # float32 products from underflowing to zero on long sentences.
        # An all-zero column is stored as-is.
        peak = amax(column)
        if peak > 0:
            column = column / peak
        trellis[i, :] = column

    # Walk back along our backtrace.  We build up 'result' backwards.
    result = []
    tag = int(argmax(trellis[steps - 1, :]))
    for i in range(steps - 1, 0, -1):
        result.append(tag)
        tag = int(backtrace[i, tag])
    result.reverse()
    return [corpus.ID_TAG[t] for t in result]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment