Created December 17, 2017 10:15
HMM training based on gradient descent (MXNet imperative version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'
"""Trains a HMM based on gradient descent optimization. | |
The parameters (theta) of the model are transition and | |
emission probabilities, as well as the initial state probabilities. | |
Given a start solution, the negative log likelihood of data given the | |
model P(obs|theta) is minimized. | |
Since we cannot guarantee valid probability distributions after a gradient | |
step, we add an additional layer to the trainable variables that turns | |
them into probability distributions (here softmax is used). The optimized | |
distributions are then the softmaxed version of optimized variables. | |
""" | |

import mxnet as mx
import numpy as np


def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges. See
    http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details. A plain NumPy cross-check is sketched below this function.
    '''
    AT = mx.ndarray.transpose(A)

    def forward(prev, o):
        # One step of the scaled forward algorithm: propagate the state
        # distribution through the transitions, weight it by the emission
        # probabilities of observation o, then renormalize.
        f = B[o] * mx.ndarray.dot(AT, prev)
        c = 1. / f.sum()
        return f * c, c

    f = pi
    s = mx.ndarray.zeros(1, ctx=A.context)
    for o in obs:
        f, c = forward(f, o)
        s = s + c.log()  # accumulate the log scaling factors

    # log(P(obs|theta)) equals the negative sum of the log scaling factors.
    return -s
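

# Optional, illustrative cross-check (a sketch, added for clarity): the same
# quantity can also be computed with an unscaled forward pass in plain NumPy.
# Only suitable for short observation sequences where the unscaled recursion
# does not underflow; pass NumPy arrays, e.g. A.asnumpy(), B.asnumpy(),
# pi.asnumpy().
def log_likelihood_np(A, B, pi, obs):
    '''Reference log(P(obs|theta)) via the unscaled forward algorithm.'''
    alpha = pi
    for o in obs:
        # Same recursion as in log_likelihood above, without rescaling.
        alpha = B[o] * np.dot(A.T, alpha)
    return np.log(alpha.sum())
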

np.set_printoptions(precision=3, suppress=True)
model_ctx = mx.gpu(0)

# Transition probabilities
A_ = mx.ndarray.array(
    np.array([
        [0.8, 0.2, 0.0, 0.0],
        [0.1, 0.5, 0.4, 0.0],
        [0.0, 0.4, 0.2, 0.4],
        [0.0, 0.0, 0.4, 0.6],
    ]), dtype=np.float32, ctx=model_ctx
)

# Emission probabilities (each column holds the distribution over the
# three observation symbols for one hidden state)
B_ = mx.ndarray.array(
    np.array([
        [0.7, 0.1, 0.2, 0.1],
        [0.15, 0.4, 0.7, 0.3],
        [0.15, 0.5, 0.1, 0.6]
    ]), dtype=np.float32, ctx=model_ctx
)

# Initial state probabilities
pi = mx.ndarray.array([0.6, 0.1, 0.2, 0.1], dtype=np.float32, ctx=model_ctx)

params = [A_, B_]
for p in params:
    p.attach_grad()


def step(params, lr):
    # Plain gradient descent update on the raw (unconstrained) variables.
    for p in params:
        p[:] = p[:] - lr * p.grad

# Observed symbol sequence (each entry indexes a row of B_).
obs = [0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0]

for i in range(1000):
    with mx.autograd.record():
        # Map the unconstrained variables to valid probability distributions
        # before evaluating the model.
        A = mx.ndarray.softmax(A_, axis=1)
        B = mx.ndarray.softmax(B_, axis=0)
        ll = log_likelihood(A, B, pi, obs)
        loss = -ll
    loss.backward()
    step(params, 1e-2)
    if i % 100 == 0:
        print('{} - log-likelihood {}'.format(i, ll.asscalar()))

# Final, learned distributions.
print(mx.ndarray.softmax(A_, axis=1))
print(mx.ndarray.softmax(B_, axis=0))
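
# Optional, illustrative final check (a sketch, added for clarity): the
# softmaxed variables are valid probability distributions, and the scaled
# MXNet forward pass should agree with the plain NumPy cross-check defined
# above, up to floating point precision.
A_final = mx.ndarray.softmax(A_, axis=1)
B_final = mx.ndarray.softmax(B_, axis=0)
print(A_final.sum(axis=1))  # every row sums to one
print(B_final.sum(axis=0))  # every column sums to one
print('mxnet log-likelihood {}'.format(log_likelihood(A_final, B_final, pi, obs).asscalar()))
print('numpy log-likelihood {}'.format(log_likelihood_np(A_final.asnumpy(), B_final.asnumpy(), pi.asnumpy(), obs)))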
Nice and helpful as an example of a low-level, imperative MXNet implementation. No Symbol API, just NDArray. Thanks!