__author__ = 'NoSyu'
"""
You are not permitted to modify the constants or functions marked as "GIVEN".
Indeed you are free to implement extra functions as you need.
You must handle data to run EM algorithm
iter means the # of iterations
Pi & Mu are parameters you must find
nData is the # of data
nDim is dimension of data
nClusters is the # of clusters
Clearly comment how to implement your code and meaning of each line only in em4bernoulli function
"""
import numpy
import matplotlib.pyplot
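
# The model fitted below is a mixture of multivariate Bernoulli distributions:
#   p(x_n) = sum_k pi_k * prod_i mu_{k,i}^{x_{n,i}} * (1 - mu_{k,i})^(1 - x_{n,i})
# where x_n is a binary vector of length nDim and k runs over the nClusters components.
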
def em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters):
"""
implement EM algorithm for Bernoulli
:param data:
:param iter:
:param pi:
:param mu:
:param ndata:
:param ndim:
:param nclusters:
:return: parameter pi and mu
"""
    for iter_i in xrange(iter):
        # E-step: compute the responsibility z[n][k] of cluster k for data point n
        z = numpy.zeros((ndata, nclusters))
        for n in range(0, ndata):
            denominator = 0.0
            for m in range(0, nclusters):
                denominator += pi[m] * bernoulli_distribution(data[n], mu[m], ndim)
            for k in range(0, nclusters):
                z[n][k] = (pi[k] * bernoulli_distribution(data[n], mu[k], ndim)) / denominator

        # M-step: re-estimate the parameters from the responsibilities
        # N[m] is the effective number of data points assigned to cluster m
        N = numpy.zeros(nclusters)
        for m in range(0, nclusters):
            for n in range(0, ndata):
                N[m] += z[n][m]

        # x[m] is the responsibility-weighted mean of the data for cluster m
        x = numpy.zeros((nclusters, ndim))
        for m in range(0, nclusters):
            for n in range(0, ndata):
                x[m] += z[n][m] * data[n]
            x[m] = x[m] / N[m]
        mu = x
        pi = N / ndata

        # Compute and print the expected complete-data log likelihood
        log_likelihood = 0.0
        for n in range(0, ndata):
            for k in range(0, nclusters):
                log_likelihood += z[n][k] * (numpy.log(pi[k]) + numpy.log(bernoulli_distribution(data[n], mu[k], ndim)))
        print 'iter: %d\tloglikelihood: %f' % (iter_i, log_likelihood)

    # return parameters pi and mu
    return pi, mu


def bernoulli_distribution(x_n, Mu_k, nDim):
    """
    Calculate the Bernoulli distribution.
    prob is the Bernoulli probability of the binary vector x_n under the parameter Mu_k.
    Do not use any library.
    :param x_n: binary data vector of length nDim
    :param Mu_k: Bernoulli parameter vector of length nDim
    :param nDim: the dimension of the data
    :return: probability of x_n under the multivariate Bernoulli with parameter Mu_k
    """
    # x_n  -> 1 x 256
    # Mu_k -> 1 x 256
    # Compute the Bernoulli pdf:
    #   p(x_n | Mu_k) = prod_i Mu_k[i]^x_n[i] * (1 - Mu_k[i])^(1 - x_n[i])
    prob = numpy.prod((Mu_k ** x_n) * ((1 - Mu_k) ** (1 - x_n)))
    # Add a very small value to the returned probability to avoid underflow downstream
    return prob + 1.0e-200
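

# Optional helper (a sketch only, not used by the code in this file; the assignment allows
# extra functions): working in log space avoids the underflow that the +1.0e-200 term
# guards against, since the product over 256 dimensions becomes a sum of logarithms.
def log_bernoulli_distribution(x_n, Mu_k, nDim):
    """Log-probability of the binary vector x_n under the Bernoulli parameter Mu_k (sketch)."""
    eps = 1.0e-10  # hypothetical small clamp so that log() stays finite when an entry of Mu_k is 0 or 1
    return numpy.sum(x_n * numpy.log(Mu_k + eps) + (1.0 - x_n) * numpy.log(1.0 - Mu_k + eps))
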
"""
GIVEN
"""

def reconstruct_images(filename, mu, nclusters):
    """
    Reconstruct images.
    This shows the distribution of mu_(ki).
    Draw a 16x16 matrix for each cluster.
    :param filename: name of the image file to save
    :param mu: Bernoulli parameters of shape (nclusters, 256)
    :param nclusters: the number of clusters
    :return: void
    """
    fig = matplotlib.pyplot.figure(facecolor='w')
    size = 16
    for cluster_idx in xrange(nclusters):
        fig.add_subplot(1, nclusters, cluster_idx + 1)
        mu_k = mu[cluster_idx, :]
        # Reshape the 256-dimensional parameter vector into a 16x16 matrix (top row first)
        param = numpy.zeros(shape=(size, size))
        index = 0
        for i in xrange(size):
            for j in xrange(size):
                param[size - i - 1, j] = mu_k[index]
                index += 1
        x = numpy.arange(0, size, 1)
        y = numpy.arange(0, size, 1)
        matplotlib.pyplot.contourf(x, y, param)
    fig.savefig(filename)

def run(nclusters, iter):
    """
    There are two types of data: we check the full data for grading,
    and you can check your result with the small data.
    The data is read from standard input.
    :param nclusters: the number of clusters
    :param iter: the number of EM iterations
    :return: void
    """
    # Read the data: the first line gives the number of data lines,
    # then each line is one space-separated binary vector
    num_lines = int(raw_input())
    data = []
    for line_idx in range(0, num_lines):
        line = raw_input().strip().split(" ")
        line = [float(x) for x in line]
        data.append(line)
    data = numpy.array(data)
    ndata, ndim = data.shape

    # Initialize Pi and Mu
    pi = numpy.random.uniform(size=nclusters)
    pi /= numpy.sum(pi)
    # Overwrite the random initialization with a fixed value
    pi = numpy.array([0.51558837, 0.48441163])
    mu = numpy.full(shape=(nclusters, ndim), fill_value=(1.0 / ndim))
    print("initial Pi")
    print(pi)

    # Run EM algorithm
    pi, mu = em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters)
    print("Pi after running EM algorithm")
    print(pi)
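    # The GIVEN reconstruct_images function is not called anywhere in this file; it could be
    # invoked here to visualize the learned Mu as 16x16 images, for example:
    # reconstruct_images('mu_clusters.png', mu, nclusters)  # 'mu_clusters.png' is a hypothetical file name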

def main():
    """
    Main function
    :return: void
    """
    run(2, 20)


if __name__ == "__main__":
    main()
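
# Usage sketch (the script and data file names and the 4-dimensional toy vectors below are
# hypothetical; the real assignment data consists of 256-dimensional binary vectors,
# i.e. 16x16 binary images):
#
#     python em_bernoulli.py < data.txt
#
# where data.txt starts with the number of data lines, followed by the vectors themselves:
#
#     3
#     1 0 0 1
#     0 1 1 0
#     1 1 0 0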