HMM training based on gradient descent (Tensorflow version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'
"""Trains a HMM based on gradient descent optimization.
The parameters (theta) of the model are transition and
emission probabilities, as well as the initial state probabilities.
Given a start solution, the negative log likelihood of data given the
model P(obs|theta) is minimized.
Since we cannot guarantee valid probability distributions after a gradient
step, we add an additional layer to the trainable variables that turns
them into probability distributions (here softmax is used). The optimized
distributions are then the softmaxed version of optimized variables.
Running this should give you output similar to the following lines
Optimizing
0 - log-likelihood -33.050
100 - log-likelihood -31.512
200 - log-likelihood -31.148
300 - log-likelihood -30.633
400 - log-likelihood -30.304
500 - log-likelihood -30.038
600 - log-likelihood -29.941
700 - log-likelihood -29.916
800 - log-likelihood -29.902
900 - log-likelihood -29.893
A:
[[ 0.855  0.144  0.001  0.   ]
 [ 0.001  0.489  0.509  0.001]
 [ 0.001  0.001  0.001  0.997]
 [ 0.003  0.002  0.125  0.87 ]]
B:
[[ 0.833  0.003  0.995  0.143]
 [ 0.165  0.001  0.003  0.376]
 [ 0.001  0.996  0.002  0.481]]
pi:
[ 0.6  0.1  0.2  0.1]
"""
import tensorflow as tf
import numpy as np
np.set_printoptions(precision=3, suppress=True)
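
# Illustrative sketch (added here, not part of the original gist): the softmax
# reparameterization described in the module docstring, in plain NumPy. Softmax maps
# an unconstrained real vector onto the probability simplex, so gradient steps on
# the raw variables A_ and B_ below can never produce an invalid distribution.
# The helper name np_softmax is hypothetical.
def np_softmax(x, axis=-1):
    '''Numerically stable softmax along the given axis.'''
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

# e.g. np_softmax(np.array([[0.8, 0.2, 0.0, 0.0]]), axis=1) yields a row summing to 1;
# below, A_ is softmaxed along rows (dim=1) and B_ along columns (dim=0).
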
def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges.

    See http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details.
    '''
    AT = tf.transpose(A)

    def forward(i, prev, sumlogc):
        # One step of the scaled forward algorithm: emission times transitioned
        # message, rescaled to sum to one; the log of the scale is accumulated.
        f = tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)
        c = 1. / tf.reduce_sum(f)
        return tf.add(i, 1), f * c, tf.add(sumlogc, tf.log(c))

    i = tf.constant(0)
    c = lambda i, prev, sumlogc: tf.less(i, obs.shape[0].value)
    b = lambda i, prev, sumlogc: forward(i, prev, sumlogc)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1)), tf.constant(0.)])
    return -r[2]
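
# Cross-check sketch (added here, not part of the original gist): the same scaled
# forward recursion in plain NumPy, handy for verifying log_likelihood on small
# inputs. It assumes A is (num_states, num_states), B is (num_symbols, num_states)
# and pi is (num_states,), matching the tensors defined further below. The helper
# name np_log_likelihood is hypothetical.
def np_log_likelihood(A, B, pi, obs):
    f = np.asarray(pi, dtype=np.float64)
    sumlogc = 0.0
    for o in obs:
        f = B[o] * (A.T @ f)   # unnormalized forward message
        c = 1.0 / f.sum()      # scaling factor, keeps f away from underflow
        f = f * c
        sumlogc += np.log(c)
    return -sumlogc            # log P(obs | theta)
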
def likelihood(A, B, pi, obs):
    '''Computes the likelihood of the HMM given data, P(obs|theta).

    Uses the forward algorithm to compute P(obs, s_t|theta) and then
    sums over all possible state values at the final time step to get
    P(obs|theta).

    Note that this version is not really suitable for optimization as
    it quickly leads to underflows due to repeated multiplication of
    probabilities < 1. Use log_likelihood instead.
    '''
    AT = tf.transpose(A)

    def forward(i, prev):
        # Unscaled forward step: emission probabilities times transitioned message.
        return tf.add(i, 1), tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)

    i = tf.constant(0)
    c = lambda i, prev: tf.less(i, obs.shape[0].value)
    b = lambda i, prev: forward(i, prev)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1))])
    return tf.reduce_sum(r[1])
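
# Quick numeric illustration (added here, not part of the original gist) of the
# underflow the docstring above warns about: repeatedly multiplying per-step
# probabilities of about 0.3 drives the product to exactly zero in float32 well
# before a few hundred steps, while the sum of their logs remains representable.
_p, _logp = np.float32(1.0), 0.0
for _ in range(300):
    _p = np.float32(_p * 0.3)
    _logp += np.log(0.3)
# at this point _p == 0.0 exactly, whereas _logp is about -361.2
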
with tf.Graph().as_default() as g:

    # Transition probabilities (unconstrained; softmaxed below)
    A_ = tf.get_variable('A_', trainable=True, initializer=tf.constant(
        np.array([
            [0.8, 0.2, 0.0, 0.0],
            [0.1, 0.5, 0.4, 0.0],
            [0.0, 0.4, 0.2, 0.4],
            [0.0, 0.0, 0.4, 0.6],
        ]), dtype=tf.float32
    ))

    # Emission probabilities (unconstrained; softmaxed below)
    B_ = tf.get_variable('B_', trainable=True, initializer=tf.constant(
        np.array([
            [0.7, 0.1, 0.2, 0.1],
            [0.15, 0.4, 0.7, 0.3],
            [0.15, 0.5, 0.1, 0.6]
        ]), dtype=tf.float32
    ))

    # Initial state probabilities (kept fixed)
    pi_ = tf.get_variable('pi', trainable=False, initializer=tf.constant(
        np.array([0.6, 0.1, 0.2, 0.1]), dtype=tf.float32
    ))

    # Need to rescale the trained variables in order to get valid probability
    # distributions: each row of A and each column of B becomes a distribution.
    A = tf.nn.softmax(A_, dim=1)
    B = tf.nn.softmax(B_, dim=0)
    pi = pi_

    obs = tf.placeholder(dtype=np.int32, shape=(30,), name='obs')
    ll = log_likelihood(A, B, pi, obs)
    loss = -ll
    step = tf.train.AdamOptimizer(1e-2).minimize(loss)

    with tf.Session().as_default() as sess:
        sess.run(tf.global_variables_initializer())

        # Not ideal, but for the sake of illustration: only one observation sequence.
        # Training will run on this single sequence over and over.
        the_obs = np.array([0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0])

        print('Optimizing')
        for i in range(1000):
            vals = sess.run({'ll': ll, 'step': step}, feed_dict={obs: the_obs})
            if i % 100 == 0:
                print('{} - log-likelihood {:.3f}'.format(i, vals['ll']))

        print(A.eval())
        print(B.eval())
        print(pi.eval())
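
        # Sanity check (added here, not part of the original gist): the graph's
        # log-likelihood should agree with the plain-NumPy reference defined above.
        tf_ll = sess.run(ll, feed_dict={obs: the_obs})
        np_ll = np_log_likelihood(A.eval(), B.eval(), pi.eval(), the_obs)
        print('TF log-likelihood {:.3f} vs NumPy reference {:.3f}'.format(tf_ll, np_ll))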