Created December 17, 2017 10:15
HMM training based on gradient descent (MXNet imperative version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'
"""Trains a HMM based on gradient descent optimization. | |
The parameters (theta) of the model are transition and | |
emission probabilities, as well as the initial state probabilities. | |
Given a start solution, the negative log likelihood of data given the | |
model P(obs|theta) is minimized. | |
Since we cannot guarantee valid probability distributions after a gradient | |
step, we add an additional layer to the trainable variables that turns | |
them into probability distributions (here softmax is used). The optimized | |
distributions are then the softmaxed version of optimized variables. | |
""" | |

import mxnet as mx
import numpy as np


def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges. See
    http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details. A plain NumPy cross-check is sketched below this function.
    '''
    AT = mx.ndarray.transpose(A)

    def forward(prev, o):
        # One step of the scaled forward algorithm: propagate the state
        # distribution through the transitions, weight it by the emission
        # probabilities of observation o, then renormalize.
        f = B[o] * mx.ndarray.dot(AT, prev)
        c = 1. / f.sum()
        return f * c, c

    f = pi
    s = mx.ndarray.zeros(1, ctx=A.context)
    for o in obs:
        f, c = forward(f, o)
        s = s + c.log()  # accumulate the log scaling factors

    # log(P(obs|theta)) equals the negative sum of the log scaling factors.
    return -s
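

# Optional, illustrative cross-check (a sketch, added for clarity): the same
# quantity can also be computed with an unscaled forward pass in plain NumPy.
# Only suitable for short observation sequences where the unscaled recursion
# does not underflow; pass NumPy arrays, e.g. A.asnumpy(), B.asnumpy(),
# pi.asnumpy().
def log_likelihood_np(A, B, pi, obs):
    '''Reference log(P(obs|theta)) via the unscaled forward algorithm.'''
    alpha = pi
    for o in obs:
        # Same recursion as in log_likelihood above, without rescaling.
        alpha = B[o] * np.dot(A.T, alpha)
    return np.log(alpha.sum())
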

np.set_printoptions(precision=3, suppress=True)
model_ctx = mx.gpu(0)

# Transition probabilities
A_ = mx.ndarray.array(
    np.array([
        [0.8, 0.2, 0.0, 0.0],
        [0.1, 0.5, 0.4, 0.0],
        [0.0, 0.4, 0.2, 0.4],
        [0.0, 0.0, 0.4, 0.6],
    ]), dtype=np.float32, ctx=model_ctx
)

# Emission probabilities (each column holds the distribution over the
# three observation symbols for one hidden state)
B_ = mx.ndarray.array(
    np.array([
        [0.7, 0.1, 0.2, 0.1],
        [0.15, 0.4, 0.7, 0.3],
        [0.15, 0.5, 0.1, 0.6]
    ]), dtype=np.float32, ctx=model_ctx
)

# Initial state probabilities
pi = mx.ndarray.array([0.6, 0.1, 0.2, 0.1], dtype=np.float32, ctx=model_ctx)

params = [A_, B_]
for p in params:
    p.attach_grad()


def step(params, lr):
    # Plain gradient descent update on the raw (unconstrained) variables.
    for p in params:
        p[:] = p[:] - lr * p.grad

# Observed symbol sequence (each entry indexes a row of B_).
obs = [0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0]

for i in range(1000):
    with mx.autograd.record():
        # Map the unconstrained variables to valid probability distributions
        # before evaluating the model.
        A = mx.ndarray.softmax(A_, axis=1)
        B = mx.ndarray.softmax(B_, axis=0)
        ll = log_likelihood(A, B, pi, obs)
        loss = -ll
    loss.backward()
    step(params, 1e-2)
    if i % 100 == 0:
        print('{} - log-likelihood {}'.format(i, ll.asscalar()))

# Final, learned distributions.
print(mx.ndarray.softmax(A_, axis=1))
print(mx.ndarray.softmax(B_, axis=0))
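
# Optional, illustrative final check (a sketch, added for clarity): the
# softmaxed variables are valid probability distributions, and the scaled
# MXNet forward pass should agree with the plain NumPy cross-check defined
# above, up to floating point precision.
A_final = mx.ndarray.softmax(A_, axis=1)
B_final = mx.ndarray.softmax(B_, axis=0)
print(A_final.sum(axis=1))  # every row sums to one
print(B_final.sum(axis=0))  # every column sums to one
print('mxnet log-likelihood {}'.format(log_likelihood(A_final, B_final, pi, obs).asscalar()))
print('numpy log-likelihood {}'.format(log_likelihood_np(A_final.asnumpy(), B_final.asnumpy(), pi.asnumpy(), obs)))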
Nice and helpful as an example of a low-level, imperative MXNet implementation. No Symbol API, just NDArray. Thanks!