HMM training based on gradient descent (Tensorflow version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'

"""Trains an HMM based on gradient descent optimization.

The parameters (theta) of the model are the transition and
emission probabilities, as well as the initial state probabilities.
Given a start solution, the negative log-likelihood of the data given
the model, -log P(obs|theta), is minimized.

Since we cannot guarantee valid probability distributions after a gradient
step, we add an additional layer on top of the trainable variables that turns
them into probability distributions (here softmax is used). The optimized
distributions are then the softmaxed versions of the optimized variables.

Running this should give you output similar to the following lines:

Optimizing
0 - log-likelihood -33.050
100 - log-likelihood -31.512
200 - log-likelihood -31.148
300 - log-likelihood -30.633
400 - log-likelihood -30.304
500 - log-likelihood -30.038
600 - log-likelihood -29.941
700 - log-likelihood -29.916
800 - log-likelihood -29.902
900 - log-likelihood -29.893
A:
[[ 0.855  0.144  0.001  0.   ]
 [ 0.001  0.489  0.509  0.001]
 [ 0.001  0.001  0.001  0.997]
 [ 0.003  0.002  0.125  0.87 ]]
B:
[[ 0.833  0.003  0.995  0.143]
 [ 0.165  0.001  0.003  0.376]
 [ 0.001  0.996  0.002  0.481]]
pi:
[ 0.6  0.1  0.2  0.1]
"""

import tensorflow as tf
import numpy as np

np.set_printoptions(precision=3, suppress=True)


def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges.

    See http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details.
    '''
    AT = tf.transpose(A)

    def forward(i, prev, sumlogc):
        # One step of the scaled forward recursion: propagate the previous
        # state distribution, weight by the emission probabilities of the
        # observed symbol, then rescale and accumulate log(c).
        f = tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)
        c = 1. / tf.reduce_sum(f)
        return tf.add(i, 1), f * c, tf.add(sumlogc, tf.log(c))

    i = tf.constant(0)
    c = lambda i, prev, sumlogc: tf.less(i, obs.shape[0].value)
    b = lambda i, prev, sumlogc: forward(i, prev, sumlogc)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1)), tf.constant(0.)])

    return -r[2]
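

# Illustrative NumPy cross-check of the scaled forward recursion above. This is
# an assumed helper (not used by the training code below); it mirrors
# log_likelihood step by step and can be used to verify the graph version on
# small examples.
def np_log_likelihood(A, B, pi, obs):
    '''Plain NumPy mirror of log_likelihood, for sanity checks only.'''
    f = pi.reshape(4, 1)
    sumlogc = 0.
    for o in obs:
        f = B[o].reshape(4, 1) * (A.T @ f)   # emission-weighted propagation
        c = 1. / f.sum()                     # scaling factor
        f = f * c
        sumlogc += np.log(c)
    return -sumlogc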


def likelihood(A, B, pi, obs):
    '''Computes the likelihood of the HMM given the data, P(obs|theta).

    Uses the forward algorithm to compute P(obs, s_T|theta) and then
    sums over all possible state values at time T to get P(obs|theta).

    Note that this version is not really suitable for optimization as
    it quickly leads to underflows due to the repeated multiplication of
    probabilities < 1. Use log_likelihood instead.
    '''
    AT = tf.transpose(A)

    def forward(i, prev):
        # Unscaled forward recursion; values shrink quickly for long sequences.
        return tf.add(i, 1), tf.reshape(B[obs[i]], (4, 1)) * (AT @ prev)

    i = tf.constant(0)
    c = lambda i, prev: tf.less(i, obs.shape[0].value)
    b = lambda i, prev: forward(i, prev)
    r = tf.while_loop(c, b, [i, tf.reshape(pi, (4, 1))])

    return tf.reduce_sum(r[1])
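

# For intuition on the underflow problem of the unscaled version (illustrative
# numbers): with per-step factors around 0.3, a 200-step sequence has a
# probability on the order of 0.3 ** 200 ~ 2.7e-105, far below the smallest
# positive float32 value, so np.float32(0.3 ** 200) evaluates to 0.0.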


with tf.Graph().as_default() as g:

    # Transition probabilities (unconstrained variable, softmaxed below)
    A_ = tf.get_variable('A_', trainable=True, initializer=tf.constant(
        np.array([
            [0.8, 0.2, 0.0, 0.0],
            [0.1, 0.5, 0.4, 0.0],
            [0.0, 0.4, 0.2, 0.4],
            [0.0, 0.0, 0.4, 0.6],
        ]), dtype=tf.float32
    ))

    # Emission probabilities (unconstrained variable, softmaxed below)
    B_ = tf.get_variable('B_', trainable=True, initializer=tf.constant(
        np.array([
            [0.7, 0.1, 0.2, 0.1],
            [0.15, 0.4, 0.7, 0.3],
            [0.15, 0.5, 0.1, 0.6]
        ]), dtype=tf.float32
    ))

    # Initial state probabilities (kept fixed)
    pi_ = tf.get_variable('pi', trainable=False, initializer=tf.constant(
        np.array([0.6, 0.1, 0.2, 0.1]), dtype=tf.float32
    ))

    # Need to rescale the trained variables in order to get valid probability
    # distributions: softmax over each row of A and each column of B.
    A = tf.nn.softmax(A_, dim=1)
    B = tf.nn.softmax(B_, dim=0)
    pi = pi_

    obs = tf.placeholder(dtype=np.int32, shape=(30,), name='obs')

    ll = log_likelihood(A, B, pi, obs)
    loss = -ll
    step = tf.train.AdamOptimizer(1e-2).minimize(loss)

    with tf.Session().as_default() as sess:
        sess.run(tf.global_variables_initializer())

        # Not ideal, but for the sake of illustration: only one observation
        # sequence. We train on this sequence over and over...
        the_obs = np.array([0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0])

        print('Optimizing')
        for i in range(1000):
            vals = sess.run({'ll': ll, 'step': step}, feed_dict={obs: the_obs})
            if i % 100 == 0:
                print('{} - log-likelihood {:.3f}'.format(i, vals['ll']))

        print('A:')
        print(A.eval())
        print('B:')
        print(B.eval())
        print('pi:')
        print(pi.eval())
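
Note that because of the softmax layer, the initializer values above are not themselves the starting distributions; softmax([0.8, 0.2, 0.0, 0.0]) is roughly [0.41, 0.22, 0.18, 0.18]. A minimal sketch of one way to make the softmaxed start match an intended distribution, if that is desired, is to initialize the unconstrained variable with log-probabilities (illustrative NumPy snippet; the epsilon guards against log(0) and the names are hypothetical):

import numpy as np

A0 = np.array([
    [0.8, 0.2, 0.0, 0.0],
    [0.1, 0.5, 0.4, 0.0],
    [0.0, 0.4, 0.2, 0.4],
    [0.0, 0.0, 0.4, 0.6],
])

# softmax(log(p)) recovers p for a normalized row, so log-probabilities make
# the softmax layer start at (approximately) the intended distributions.
A_init = np.log(A0 + 1e-6)
softmaxed = np.exp(A_init) / np.exp(A_init).sum(axis=1, keepdims=True)
print(np.round(softmaxed, 3))   # ~= A0, up to the epsilon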
MXNet Version: https://gist.github.com/cheind/6ae72effe60ad3c1f56187c4cac2bb4d