HMM training based on gradient descent (Tensorflow version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'

"""Trains an HMM based on gradient descent optimization.

The parameters (theta) of the model are the transition and
emission probabilities, as well as the initial state probabilities.
Given a start solution, the negative log-likelihood of the data given
the model, -log P(obs|theta), is minimized.

Since we cannot guarantee valid probability distributions after a gradient
step, we add an additional layer on top of the trainable variables that turns
them into probability distributions (here softmax is used). The optimized
distributions are then the softmaxed versions of the optimized variables.

Running this should give you output similar to the following lines:

Optimizing
0 - log-likelihood -33.050
100 - log-likelihood -31.512
200 - log-likelihood -31.148
300 - log-likelihood -30.633
400 - log-likelihood -30.304
500 - log-likelihood -30.038
600 - log-likelihood -29.941
700 - log-likelihood -29.916
800 - log-likelihood -29.902
900 - log-likelihood -29.893
A:
[[ 0.855  0.144  0.001  0.   ]
 [ 0.001  0.489  0.509  0.001]
 [ 0.001  0.001  0.001  0.997]
 [ 0.003  0.002  0.125  0.87 ]]
B:
[[ 0.833  0.003  0.995  0.143]
 [ 0.165  0.001  0.003  0.376]
 [ 0.001  0.996  0.002  0.481]]
pi:
[ 0.6  0.1  0.2  0.1]
"""

import tensorflow as tf
import numpy as np

np.set_printoptions(precision=3, suppress=True)


def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges.

    See http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details.
    '''
    AT = tf.transpose(A)

    def forward(i, prev, sumlogc):
        # One step of the scaled forward recursion: propagate the previous
        # state distribution, weight by the emission probabilities of the
        # observed symbol, then rescale and accumulate log(c).
        f = tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)
        c = 1. / tf.reduce_sum(f)
        return tf.add(i, 1), f * c, tf.add(sumlogc, tf.log(c))

    i = tf.constant(0)
    c = lambda i, prev, sumlogc: tf.less(i, obs.shape[0].value)
    b = lambda i, prev, sumlogc: forward(i, prev, sumlogc)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1)), tf.constant(0.)])

    return -r[2]
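

# Illustrative NumPy cross-check of the scaled forward recursion above. This is
# an assumed helper (not used by the training code below); it mirrors
# log_likelihood step by step and can be used to verify the graph version on
# small examples.
def np_log_likelihood(A, B, pi, obs):
    '''Plain NumPy mirror of log_likelihood, for sanity checks only.'''
    f = pi.reshape(4, 1)
    sumlogc = 0.
    for o in obs:
        f = B[o].reshape(4, 1) * (A.T @ f)   # emission-weighted propagation
        c = 1. / f.sum()                     # scaling factor
        f = f * c
        sumlogc += np.log(c)
    return -sumlogc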


def likelihood(A, B, pi, obs):
    '''Computes the likelihood of the HMM given the data, P(obs|theta).

    Uses the forward algorithm to compute P(obs, s_T|theta) and then
    sums over all possible state values at time T to get P(obs|theta).

    Note that this version is not really suitable for optimization as
    it quickly leads to underflows due to the repeated multiplication of
    probabilities < 1. Use log_likelihood instead.
    '''
    AT = tf.transpose(A)

    def forward(i, prev):
        # Unscaled forward recursion; values shrink quickly for long sequences.
        return tf.add(i, 1), tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)

    i = tf.constant(0)
    c = lambda i, prev: tf.less(i, obs.shape[0].value)
    b = lambda i, prev: forward(i, prev)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1))])

    return tf.reduce_sum(r[1])
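

# For intuition on the underflow problem of the unscaled version (illustrative
# numbers): with per-step factors around 0.3, a 200-step sequence has a
# probability on the order of 0.3 ** 200 ~ 2.7e-105, far below the smallest
# positive float32 value, so np.float32(0.3 ** 200) evaluates to 0.0.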


with tf.Graph().as_default() as g:

    # Transition probabilities (unconstrained variable, softmaxed below)
    A_ = tf.get_variable('A_', trainable=True, initializer=tf.constant(
        np.array([
            [0.8, 0.2, 0.0, 0.0],
            [0.1, 0.5, 0.4, 0.0],
            [0.0, 0.4, 0.2, 0.4],
            [0.0, 0.0, 0.4, 0.6],
        ]), dtype=tf.float32
    ))

    # Emission probabilities (unconstrained variable, softmaxed below)
    B_ = tf.get_variable('B_', trainable=True, initializer=tf.constant(
        np.array([
            [0.7, 0.1, 0.2, 0.1],
            [0.15, 0.4, 0.7, 0.3],
            [0.15, 0.5, 0.1, 0.6]
        ]), dtype=tf.float32
    ))

    # Initial state probabilities (kept fixed)
    pi_ = tf.get_variable('pi', trainable=False, initializer=tf.constant(
        np.array([0.6, 0.1, 0.2, 0.1]), dtype=tf.float32
    ))

    # Need to rescale the trained variables in order to get valid probability
    # distributions: softmax over each row of A and each column of B.
    A = tf.nn.softmax(A_, dim=1)
    B = tf.nn.softmax(B_, dim=0)
    pi = pi_

    obs = tf.placeholder(dtype=np.int32, shape=(30,), name='obs')

    ll = log_likelihood(A, B, pi, obs)
    loss = -ll
    step = tf.train.AdamOptimizer(1e-2).minimize(loss)

    with tf.Session().as_default() as sess:
        sess.run(tf.global_variables_initializer())

        # Not ideal, but for the sake of illustration: only one observation
        # sequence. We train on this sequence over and over...
        the_obs = np.array([0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0])

        print('Optimizing')
        for i in range(1000):
            vals = sess.run({'ll': ll, 'step': step}, feed_dict={obs: the_obs})
            if i % 100 == 0:
                print('{} - log-likelihood {:.3f}'.format(i, vals['ll']))

        print('A:')
        print(A.eval())
        print('B:')
        print(B.eval())
        print('pi:')
        print(pi.eval())
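
Note that because of the softmax layer, the initializer values above are not themselves the starting distributions; softmax([0.8, 0.2, 0.0, 0.0]) is roughly [0.41, 0.22, 0.18, 0.18]. A minimal sketch of one way to make the softmaxed start match an intended distribution, if that is desired, is to initialize the unconstrained variable with log-probabilities (illustrative NumPy snippet; the epsilon guards against log(0) and the names are hypothetical):

import numpy as np

A0 = np.array([
    [0.8, 0.2, 0.0, 0.0],
    [0.1, 0.5, 0.4, 0.0],
    [0.0, 0.4, 0.2, 0.4],
    [0.0, 0.0, 0.4, 0.6],
])

# softmax(log(p)) recovers p for a normalized row, so log-probabilities make
# the softmax layer start at (approximately) the intended distributions.
A_init = np.log(A0 + 1e-6)
softmaxed = np.exp(A_init) / np.exp(A_init).sum(axis=1, keepdims=True)
print(np.round(softmaxed, 3))   # ~= A0, up to the epsilon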
MXNet Version: https://gist.github.com/cheind/6ae72effe60ad3c1f56187c4cac2bb4d