Skip to content

Instantly share code, notes, and snippets.

@emk
Created January 3, 2010 17:40
Show Gist options
  • Save emk/268055 to your computer and use it in GitHub Desktop.
Save emk/268055 to your computer and use it in GitHub Desktop.
# Viterbi POS tagger using numpy and some libraries I wrote.
#
# To use:
#
# >>> viterbi.pos_tag('i want to go for a run .')
# ['PPSS', 'VB', 'TO', 'VB', 'IN', 'AT', 'NN']
# >>> viterbi.pos_tag('i want to run .')
# ['PPSS', 'VB', 'TO', 'VB']
#
# -Eric Kidd
import corpus
import train
from numpy import *
# Number of distinct POS tags, taken from the corpus tag table.
# NOTE(review): the original comment claimed this "must be an even number
# to avoid a numpy crash" — cannot verify that from here; confirm against
# the numpy version in use before relying on it.
tag_count = len(corpus.ID_TAG)
def pos_tag(sentence):
    """Tag a sentence with part-of-speech tags using the Viterbi algorithm.

    ``sentence`` is a space-separated string of tokens.  Returns a list of
    POS tag names (strings from ``corpus.ID_TAG``), one per token.

    Uses ``train.TAG_TAG_PROB`` (tag-transition probabilities) and
    ``train.WORD_TAG_PROB`` (word-emission probabilities), both sparse
    matrices trained elsewhere.
    """
    # Split into tokens (primitive whitespace split) and prepend a
    # sentence-boundary '.' so the first real word has a known previous tag.
    sentence = ['.'] + sentence.split(' ')
    # trellis[i][t]: probability of the best tag sequence for tokens 0..i
    # that ends in tag t.  Column 0 is the boundary '.' with probability 1.
    # NOTE(review): raw probabilities are multiplied without rescaling, so
    # very long sentences may underflow float32 — consider log-space.
    trellis = zeros((len(sentence), tag_count), dtype=float32)
    trellis[0][corpus.tag_id('.')] = 1.0
    # backtrace[i][t]: the tag at position i-1 on the best path ending in
    # tag t at position i.  Use a platform integer rather than the original
    # uint8, which would silently wrap for tag sets with > 256 tags.
    backtrace = zeros((len(sentence), tag_count), dtype=intp)
    tag_tag_probs = array(train.TAG_TAG_PROB.todense())
    # Fill in the remaining columns of the trellis, one token at a time.
    for i in range(1, len(sentence)):
        word_id = corpus.word_id(sentence[i])
        word_tag_probs = array(train.WORD_TAG_PROB[word_id,:].todense())
        # Scale each transition row by the probability of arriving in the
        # corresponding previous tag, then take the best predecessor per tag.
        prev_tag_probs = trellis[i-1,:]
        scaled_tag_tag_probs = dot(diag(prev_tag_probs), tag_tag_probs)
        trellis[i,:] = amax(scaled_tag_tag_probs, 0) * word_tag_probs
        backtrace[i,:] = argmax(scaled_tag_tag_probs, 0)
    # Walk back along the backtrace from the best final tag.  'result' is
    # built backwards, skipping position 0 (the artificial boundary '.').
    result = []
    tag = argmax(trellis[len(sentence)-1,:])
    for i in range(len(sentence)-1, 0, -1):
        result.append(tag)
        tag = backtrace[i,tag]
    result.reverse()
    return [corpus.ID_TAG[t] for t in result]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment