__author__ = 'NoSyu'
"""
You are not permitted to modify the constants or functions marked as "GIVEN".
Indeed you are free to implement extra functions as you need.
You must handle data to run EM algorithm
iter means the # of iterations
Pi & Mu are parameters you must find
nData is the # of data
nDim is dimension of data
nClusters is the # of clusters
Clearly comment how to implement your code and meaning of each line only in em4bernoulli function
"""
import numpy
import matplotlib.pyplot
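
# The model fitted below is a mixture of multivariate Bernoulli distributions:
#   p(x_n) = sum_k pi_k * prod_i mu_{k,i}^{x_{n,i}} * (1 - mu_{k,i})^(1 - x_{n,i})
# where x_n is a binary vector of length nDim and k runs over the nClusters components.
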
def em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters):
"""
implement EM algorithm for Bernoulli
:param data:
:param iter:
:param pi:
:param mu:
:param ndata:
:param ndim:
:param nclusters:
:return: parameter pi and mu
"""
    for iter_i in xrange(iter):
        # E-step: compute the responsibility z[n][k] of cluster k for data point n
        z = numpy.zeros((ndata, nclusters))
        for n in range(0, ndata):
            denominator = 0.0
            for m in range(0, nclusters):
                denominator += pi[m] * bernoulli_distribution(data[n], mu[m], ndim)
            for k in range(0, nclusters):
                z[n][k] = (pi[k] * bernoulli_distribution(data[n], mu[k], ndim)) / denominator

        # M-step: re-estimate the parameters from the responsibilities
        # N[m] is the effective number of data points assigned to cluster m
        N = numpy.zeros(nclusters)
        for m in range(0, nclusters):
            for n in range(0, ndata):
                N[m] += z[n][m]

        # x[m] is the responsibility-weighted mean of the data for cluster m
        x = numpy.zeros((nclusters, ndim))
        for m in range(0, nclusters):
            for n in range(0, ndata):
                x[m] += z[n][m] * data[n]
            x[m] = x[m] / N[m]
        mu = x
        pi = N / ndata

        # Compute and print the expected complete-data log likelihood
        log_likelihood = 0.0
        for n in range(0, ndata):
            for k in range(0, nclusters):
                log_likelihood += z[n][k] * (numpy.log(pi[k]) + numpy.log(bernoulli_distribution(data[n], mu[k], ndim)))
        print 'iter: %d\tloglikelihood: %f' % (iter_i, log_likelihood)

    # return parameters pi and mu
    return pi, mu


def bernoulli_distribution(x_n, Mu_k, nDim):
    """
    Calculate the Bernoulli distribution.
    prob is the Bernoulli probability of the binary vector x_n under the parameter Mu_k.
    Do not use any library.
    :param x_n: binary data vector of length nDim
    :param Mu_k: Bernoulli parameter vector of length nDim
    :param nDim: the dimension of the data
    :return: probability of x_n under the multivariate Bernoulli with parameter Mu_k
    """
    # x_n  -> 1 x 256
    # Mu_k -> 1 x 256
    # Compute the Bernoulli pdf:
    #   p(x_n | Mu_k) = prod_i Mu_k[i]^x_n[i] * (1 - Mu_k[i])^(1 - x_n[i])
    prob = numpy.prod((Mu_k ** x_n) * ((1 - Mu_k) ** (1 - x_n)))
    # Add a very small value to the returned probability to avoid underflow downstream
    return prob + 1.0e-200
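

# Optional helper (a sketch only, not used by the code in this file; the assignment allows
# extra functions): working in log space avoids the underflow that the +1.0e-200 term
# guards against, since the product over 256 dimensions becomes a sum of logarithms.
def log_bernoulli_distribution(x_n, Mu_k, nDim):
    """Log-probability of the binary vector x_n under the Bernoulli parameter Mu_k (sketch)."""
    eps = 1.0e-10  # hypothetical small clamp so that log() stays finite when an entry of Mu_k is 0 or 1
    return numpy.sum(x_n * numpy.log(Mu_k + eps) + (1.0 - x_n) * numpy.log(1.0 - Mu_k + eps))
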
"""
GIVEN
"""

def reconstruct_images(filename, mu, nclusters):
    """
    Reconstruct images.
    This shows the distribution of mu_(ki).
    Draw a 16x16 matrix for each cluster.
    :param filename: name of the image file to save
    :param mu: Bernoulli parameters of shape (nclusters, 256)
    :param nclusters: the number of clusters
    :return: void
    """
    fig = matplotlib.pyplot.figure(facecolor='w')
    size = 16
    for cluster_idx in xrange(nclusters):
        fig.add_subplot(1, nclusters, cluster_idx + 1)
        mu_k = mu[cluster_idx, :]
        # Reshape the 256-dimensional parameter vector into a 16x16 matrix (top row first)
        param = numpy.zeros(shape=(size, size))
        index = 0
        for i in xrange(size):
            for j in xrange(size):
                param[size - i - 1, j] = mu_k[index]
                index += 1
        x = numpy.arange(0, size, 1)
        y = numpy.arange(0, size, 1)
        matplotlib.pyplot.contourf(x, y, param)
    fig.savefig(filename)

def run(nclusters, iter):
    """
    There are two types of data: we check the full data for grading,
    and you can check your result with the small data.
    The data is read from standard input.
    :param nclusters: the number of clusters
    :param iter: the number of EM iterations
    :return: void
    """
    # Read the data: the first line gives the number of data lines,
    # then each line is one space-separated binary vector
    num_lines = int(raw_input())
    data = []
    for line_idx in range(0, num_lines):
        line = raw_input().strip().split(" ")
        line = [float(x) for x in line]
        data.append(line)
    data = numpy.array(data)
    ndata, ndim = data.shape

    # Initialize Pi and Mu
    pi = numpy.random.uniform(size=nclusters)
    pi /= numpy.sum(pi)
    # Overwrite the random initialization with a fixed value
    pi = numpy.array([0.51558837, 0.48441163])
    mu = numpy.full(shape=(nclusters, ndim), fill_value=(1.0 / ndim))
    print("initial Pi")
    print(pi)

    # Run EM algorithm
    pi, mu = em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters)
    print("Pi after running EM algorithm")
    print(pi)
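    # The GIVEN reconstruct_images function is not called anywhere in this file; it could be
    # invoked here to visualize the learned Mu as 16x16 images, for example:
    # reconstruct_images('mu_clusters.png', mu, nclusters)  # 'mu_clusters.png' is a hypothetical file name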

def main():
    """
    Main function
    :return: void
    """
    run(2, 20)


if __name__ == "__main__":
    main()
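
# Usage sketch (the script and data file names and the 4-dimensional toy vectors below are
# hypothetical; the real assignment data consists of 256-dimensional binary vectors,
# i.e. 16x16 binary images):
#
#     python em_bernoulli.py < data.txt
#
# where data.txt starts with the number of data lines, followed by the vectors themselves:
#
#     3
#     1 0 0 1
#     0 1 1 0
#     1 1 0 0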