Created
May 1, 2015 01:53
-
-
Save jonghwanhyeon/d1798e130e79ab6bbeb1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'NoSyu' | |
""" | |
You are not permitted to modify the constants or functions marked as "GIVEN". | |
Indeed you are free to implement extra functions as you need. | |
You must handle data to run EM algorithm | |
iter means the # of iterations | |
Pi & Mu are parameters you must find | |
nData is the # of data | |
nDim is dimension of data | |
nClusters is the # of clusters | |
Clearly comment how to implement your code and meaning of each line only in em4bernoulli function | |
""" | |
import numpy | |
def em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters): | |
""" | |
implement EM algorithm for Bernoulli | |
:param data: | |
:param iter: | |
:param pi: | |
:param mu: | |
:param ndata: | |
:param ndim: | |
:param nclusters: | |
:return: parameter pi and mu | |
""" | |
for iter_i in xrange(iter): | |
# E-step | |
z = numpy.zeros((ndata, nclusters)) | |
for n in range(0, ndata): | |
denominator = 0.0 | |
for m in range(0, nclusters): | |
denominator += pi[m] * bernoulli_distribution(data[n], mu[m], ndim) | |
for k in range(0, nclusters): | |
z[n][k] = (pi[k] * bernoulli_distribution(data[n], mu[k], ndim)) / denominator | |
# M-step | |
N = numpy.zeros(nclusters) | |
for m in range(0, nclusters): | |
for n in range(0, ndata): | |
N[m] += z[n][m] | |
x = numpy.zeros((nclusters, ndim)) | |
for m in range(0, nclusters): | |
for n in range(0, ndata): | |
x[m] += z[n][m] * data[n] | |
x[m] = x[m] / N[m] | |
mu = x | |
pi = N / ndata | |
# Print log likelihood | |
log_likelihood = 0.0 | |
for n in range(0, ndata): | |
for k in range(0, nclusters): | |
log_likelihood += z[n][k] * (numpy.log(pi[k]) + numpy.log(bernoulli_distribution(data[n], mu[k], ndim))) | |
""" | |
Compute the log likelihood | |
""" | |
print 'iter: %d\tloglikelihood: %f' % (iter_i, log_likelihood) | |
# return parameter pi and mu | |
return pi, mu | |
def bernoulli_distribution(x_n, Mu_k, nDim): | |
""" | |
Calculate Bernoulli distribution | |
prob is Bernoulli distribution of binary variable x.n with parameter Mu.k | |
Do not use any library. | |
:param x_n: | |
:param Mu_k: | |
:param nDim: | |
:return: | |
""" | |
# x_n -> 1 * 256 | |
# mu_k -> 1 * 256 | |
""" | |
Compute bernoulli pdf | |
""" | |
prob = numpy.prod((Mu_k ** x_n) * ((1 - Mu_k) ** (1 - x_n))) | |
# return value add very small value to avoid underflow | |
return prob + 1.0e-200 | |
""" | |
GIVEN | |
""" | |
def reconstruct_images(filename, mu, nclusters): | |
""" | |
Reconstruct images | |
This shows distribution of mu_(ki) | |
Draw 16x16 matrix | |
:param mu: | |
:param nclusters: | |
:return: | |
""" | |
fig = matplotlib.pyplot.figure(facecolor='w') | |
size = 16 | |
for cluster_idx in xrange(nclusters): | |
fig.add_subplot(1, nclusters, cluster_idx + 1) | |
mu_k = mu[cluster_idx, :] | |
param = numpy.zeros(shape=(size, size)) | |
index = 0 | |
for i in xrange(size): | |
for j in xrange(size): | |
param[size - i - 1, j] = mu_k[index] | |
index += 1 | |
x = numpy.arange(0, size, 1) | |
y = numpy.arange(0, size, 1) | |
matplotlib.pyplot.contourf(x, y, param) | |
fig.savefig(filename) | |
def run(nclusters, iter): | |
""" | |
There are two types of data | |
We check full data for grading | |
You can check your result with small data | |
:param filename: data file name | |
:param nclusters: the number of clusters | |
:param iter: # iteration | |
:return: void | |
""" | |
num_lines = int(raw_input()) | |
data = [] | |
for line_idx in range(0, num_lines): | |
line = raw_input().strip().split(" ") | |
line = [float(x) for x in line] | |
data.append(line) | |
data = numpy.array(data) | |
ndata, ndim = data.shape | |
# Initialize Pi and Mu | |
pi = numpy.random.uniform(size=nclusters) | |
pi /= numpy.sum(pi) | |
# fixed value | |
pi = numpy.array([0.51558837, 0.48441163]) | |
mu = numpy.full(shape=(nclusters, ndim), fill_value=(1.0/ndim)) | |
print("initial Pi") | |
print(pi) | |
# Run EM algorithm | |
pi, mu = em4bernoulli(data, iter, pi, mu, ndata, ndim, nclusters) | |
print("Pi after running EM algorithm") | |
print(pi) | |
def main(): | |
""" | |
Main function | |
:return: void | |
""" | |
run(2, 20) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment