
@cheind
Created December 17, 2017 10:15
HMM training based on gradient descent (MXNet imperative version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'
"""Trains a HMM based on gradient descent optimization.
The parameters (theta) of the model are transition and
emission probabilities, as well as the initial state probabilities.
Given a start solution, the negative log likelihood of data given the
model P(obs|theta) is minimized.
Since we cannot guarantee valid probability distributions after a gradient
step, we add an additional layer to the trainable variables that turns
them into probability distributions (here softmax is used). The optimized
distributions are then the softmaxed version of optimized variables.
"""
import mxnet as mx
import numpy as np
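
# A minimal sketch of the softmax re-parameterization described in the module
# docstring: any unconstrained row of reals maps to a valid probability
# distribution (non-negative entries that sum to 1), so gradient steps on the
# raw variables can never produce an invalid distribution.
_raw = np.array([[1.2, -0.3, 0.0, 2.0]])
_dist = np.exp(_raw) / np.exp(_raw).sum(axis=1, keepdims=True)
assert np.all(_dist >= 0) and np.allclose(_dist.sum(axis=1), 1.0)
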
def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM, log(P(obs|theta)).

    Uses the scaled forward algorithm: each forward variable f_t is
    normalized by c_t = 1 / sum(f_t), and the log-likelihood is recovered
    as log P(obs|theta) = -sum_t log(c_t). The logarithmic version is
    preferred in optimization as it avoids underflows and scales the loss
    appropriately, so that backward sweeps stay within meaningful ranges.

    See http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details.
    '''
    AT = mx.ndarray.transpose(A)

    def forward(prev, o):
        # One forward step: propagate through the transition model, weight
        # by the emission probability of the observed symbol, then rescale.
        f = B[o].reshape((4, 1)) * mx.ndarray.dot(AT, prev)
        c = 1. / f.sum()
        return f * c, c

    # Keep the forward variable as a column vector so the element-wise
    # product with B[o] stays (4, 1) instead of broadcasting to (4, 4).
    f = pi.reshape((4, 1))
    s = mx.ndarray.zeros(1, ctx=A.context)
    for o in obs:
        f, c = forward(f, o)
        s = s + c.log()
    return -s
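
# A sanity-check sketch for the scaled forward pass above: enumerate every
# state path with plain numpy and sum the joint probabilities. Exponential in
# the sequence length, so only usable on short prefixes, e.g.
# brute_force_log_likelihood(A, B, pi, obs[:5]). The names below are
# illustrative and not part of the original script.
def brute_force_log_likelihood(A, B, pi, obs):
    '''Reference log-likelihood by exhaustive enumeration of state paths.

    Mirrors the model used in log_likelihood: pi is the state distribution
    before the first transition, and each observation is emitted after a
    transition.
    '''
    import itertools
    An, Bn, pin = A.asnumpy(), B.asnumpy(), pi.asnumpy().ravel()
    n_states = An.shape[0]
    total = 0.0
    for path in itertools.product(range(n_states), repeat=len(obs) + 1):
        p = pin[path[0]]
        for t, o in enumerate(obs):
            p *= An[path[t], path[t + 1]] * Bn[o, path[t + 1]]
        total += p
    return np.log(total)
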
np.set_printoptions(precision=3, suppress=True)
model_ctx = mx.gpu(0)
# Transition probabilities (rows: source state, columns: target state;
# softmaxed per row during training)
A_ = mx.ndarray.array(
    np.array([
        [0.8, 0.2, 0.0, 0.0],
        [0.1, 0.5, 0.4, 0.0],
        [0.0, 0.4, 0.2, 0.4],
        [0.0, 0.0, 0.4, 0.6],
    ]), dtype=np.float32, ctx=model_ctx
)
# Emission probabilities (rows: observation symbols, columns: states;
# softmaxed per column during training)
B_ = mx.ndarray.array(
    np.array([
        [0.7, 0.1, 0.2, 0.1],
        [0.15, 0.4, 0.7, 0.3],
        [0.15, 0.5, 0.1, 0.6]
    ]), dtype=np.float32, ctx=model_ctx
)
# Initial state probabilities (kept fixed; not part of the trained parameters)
pi = mx.ndarray.array([0.6, 0.1, 0.2, 0.1], dtype=np.float32, ctx=model_ctx)
params = [A_, B_]
for p in params:
    p.attach_grad()

def step(params, lr):
    '''Plain SGD update applied to the raw, unconstrained variables.'''
    for p in params:
        p[:] = p[:] - lr * p.grad

# Observation sequence (symbols 0, 1, 2)
obs = [0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0]
for i in range(1000):
    with mx.autograd.record():
        # Re-parameterize: softmax turns the raw variables into valid
        # probability distributions (rows of A, columns of B).
        A = mx.ndarray.softmax(A_, axis=1)
        B = mx.ndarray.softmax(B_, axis=0)
        ll = log_likelihood(A, B, pi, obs)
        loss = -ll
    loss.backward()
    step(params, 1e-2)

    if i % 100 == 0:
        print('{} - log-likelihood {}'.format(i, ll.asscalar()))

# Learned transition and emission probabilities
print(mx.ndarray.softmax(A_, axis=1))
print(mx.ndarray.softmax(B_, axis=0))
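
# The module docstring also lists the initial state probabilities among the
# trainable parameters, while the loop above keeps pi fixed. A minimal sketch
# of how pi could be trained the same way (the names pi_, pi_soft and
# params_with_pi are introduced here for illustration): keep an unconstrained
# variable, attach a gradient buffer, softmax it inside the autograd scope and
# include it in the update.
pi_ = mx.ndarray.array([0.6, 0.1, 0.2, 0.1], dtype=np.float32, ctx=model_ctx)
pi_.attach_grad()
params_with_pi = [A_, B_, pi_]
with mx.autograd.record():
    A = mx.ndarray.softmax(A_, axis=1)
    B = mx.ndarray.softmax(B_, axis=0)
    pi_soft = mx.ndarray.softmax(pi_)
    loss = -log_likelihood(A, B, pi_soft, obs)
loss.backward()
step(params_with_pi, 1e-2)  # one gradient step that now also updates pi_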
@kczarn commented Jul 26, 2018

Nice and helpful as an example of a low-level, imperative MXNet implementation. No symbols, just the NDArray API. Thanks!
