emk · January 3, 2010 17:40
diff --git a/viterbi.py b/viterbi.py
 # Viterbi POS tagger using numpy and some libraries I wrote.
 #
 # To use:
 #
 #   >>> viterbi.pos_tag('i want to go for a run .')
 #   ['PPSS', 'VB', 'TO', 'VB', 'IN', 'AT', 'NN']
 #   >>> viterbi.pos_tag('i want to run .')
 #   ['PPSS', 'VB', 'TO', 'VB']
 #
 # -Eric Kidd

 import corpus
 import train
 from numpy import *

 # Number of entries in corpus.ID_TAG.  pos_tag() uses it as the width of its
 # trellis and backtrace arrays.
 # Must be an even number to avoid a numpy crash!
 # NOTE(review): the crash itself is not described anywhere in this file --
 # confirm whether the constraint still applies.
 tag_count = len(corpus.ID_TAG)

 def pos_tag(sentence):
    """Tag each token of `sentence` with its most likely part of speech.

    Runs a Viterbi search that combines the tag-to-tag table
    train.TAG_TAG_PROB (rows are indexed by the previous tag, columns by
    the current tag) with the per-word tag row of train.WORD_TAG_PROB.

    Args:
        sentence: A string of whitespace-separated tokens.

    Returns:
        A list of tag names taken from corpus.ID_TAG, one per token, in
        sentence order.  An empty or all-whitespace sentence yields [].
    """
    # Split on any run of whitespace so that repeated, leading or trailing
    # blanks never produce empty tokens.  The prepended '.' is the known
    # starting token of the trellis.
    tokens = ['.'] + sentence.split()
    token_count = len(tokens)

    # trellis[i, t] scores the best tag sequence that ends with tag t at
    # token i.  The first column is pinned to the '.' tag.
    trellis = zeros((token_count, tag_count), dtype=float32)
    trellis[0, corpus.tag_id('.')] = 1.0

    # backtrace[i, t] is the tag chosen for token i-1 on that best sequence.
    # It stores tag ids, so it needs a full index type: uint8 would silently
    # wrap as soon as there are more than 256 tags.
    backtrace = zeros((token_count, tag_count), dtype=intp)

    tag_tag_probs = array(train.TAG_TAG_PROB.todense())

    # Iterate over the remaining words.
    for i in range(1, token_count):
        word_id = corpus.word_id(tokens[i])
        word_row = train.WORD_TAG_PROB[word_id, :]
        word_tag_probs = array(word_row.todense()).ravel()

        # Scale row p of the tag-to-tag table by the score of previous tag
        # p.  Broadcasting gives the same numbers as dot(diag(prev), table)
        # without building a diagonal matrix or doing a full matrix product.
        reached = trellis[i - 1, :, newaxis] * tag_tag_probs
        column = amax(reached, 0) * word_tag_probs
        backtrace[i, :] = argmax(reached, 0)

        # Rescale so the best score in the column is 1.  Multiplying a
        # column by a positive constant changes no argmax, but it keeps the
        # float32 trellis from underflowing to zero on long sentences.
        peak = column.max()
        if peak > 0:
            column = column / peak
        trellis[i, :] = column

    # Walk back along the backtrace from the best final tag.  'tags' is
    # built backwards and reversed at the end.
    tags = []
    tag = argmax(trellis[token_count - 1, :])
    for i in range(token_count - 1, 0, -1):
        tags.append(tag)
        tag = backtrace[i, tag]
    tags.reverse()
    return [corpus.ID_TAG[t] for t in tags]
	# Viterbi POS tagger using numpy and some libraries I wrote.
	#
	# To use:
	#
	# >>> viterbi.pos_tag('i want to go for a run .')
	# ['PPSS', 'VB', 'TO', 'VB', 'IN', 'AT', 'NN']
	# >>> viterbi.pos_tag('i want to run .')
	# ['PPSS', 'VB', 'TO', 'VB']
	#
	# -Eric Kidd

	import corpus
	import train
	from numpy import *

	# Number of entries in corpus.ID_TAG.  pos_tag() uses it as the width of its
	# trellis and backtrace arrays.
	# Must be an even number to avoid a numpy crash!
	# NOTE(review): the crash itself is not described anywhere in this file --
	# confirm whether the constraint still applies.
	tag_count = len(corpus.ID_TAG)

	def pos_tag(sentence):
	    """Tag each token of `sentence` with its most likely part of speech.

	    Runs a Viterbi search that combines the tag-to-tag table
	    train.TAG_TAG_PROB (rows are indexed by the previous tag, columns by
	    the current tag) with the per-word tag row of train.WORD_TAG_PROB.

	    Args:
	        sentence: A string of whitespace-separated tokens.

	    Returns:
	        A list of tag names taken from corpus.ID_TAG, one per token, in
	        sentence order.  An empty or all-whitespace sentence yields [].
	    """
	    # Split on any run of whitespace so that repeated, leading or trailing
	    # blanks never produce empty tokens.  The prepended '.' is the known
	    # starting token of the trellis.
	    tokens = ['.'] + sentence.split()
	    token_count = len(tokens)

	    # trellis[i, t] scores the best tag sequence that ends with tag t at
	    # token i.  The first column is pinned to the '.' tag.
	    trellis = zeros((token_count, tag_count), dtype=float32)
	    trellis[0, corpus.tag_id('.')] = 1.0

	    # backtrace[i, t] is the tag chosen for token i-1 on that best sequence.
	    # It stores tag ids, so it needs a full index type: uint8 would silently
	    # wrap as soon as there are more than 256 tags.
	    backtrace = zeros((token_count, tag_count), dtype=intp)

	    tag_tag_probs = array(train.TAG_TAG_PROB.todense())

	    # Iterate over the remaining words.
	    for i in range(1, token_count):
	        word_id = corpus.word_id(tokens[i])
	        word_row = train.WORD_TAG_PROB[word_id, :]
	        word_tag_probs = array(word_row.todense()).ravel()

	        # Scale row p of the tag-to-tag table by the score of previous tag
	        # p.  Broadcasting gives the same numbers as dot(diag(prev), table)
	        # without building a diagonal matrix or doing a full matrix product.
	        reached = trellis[i - 1, :, newaxis] * tag_tag_probs
	        column = amax(reached, 0) * word_tag_probs
	        backtrace[i, :] = argmax(reached, 0)

	        # Rescale so the best score in the column is 1.  Multiplying a
	        # column by a positive constant changes no argmax, but it keeps the
	        # float32 trellis from underflowing to zero on long sentences.
	        peak = column.max()
	        if peak > 0:
	            column = column / peak
	        trellis[i, :] = column

	    # Walk back along the backtrace from the best final tag.  'tags' is
	    # built backwards and reversed at the end.
	    tags = []
	    tag = argmax(trellis[token_count - 1, :])
	    for i in range(token_count - 1, 0, -1):
	        tags.append(tag)
	        tag = backtrace[i, tag]
	    tags.reverse()
	    return [corpus.ID_TAG[t] for t in tags]