""" | |
(C) Mathieu Blondel - 2010 | |
License: BSD 3 clause | |
Implementation of the collapsed Gibbs sampler for | |
Latent Dirichlet Allocation, as described in | |
Finding scientifc topics (Griffiths and Steyvers) | |
""" | |
import numpy as np | |
import scipy as sp | |
from scipy.special import gammaln | |

def sample_index(p):
    """
    Sample from the Multinomial distribution and return the sample index.
    """
    return np.random.multinomial(1,p).argmax()

def word_indices(vec):
    """
    Turn a document vector of size vocab_size to a sequence
    of word indices. The word indices are between 0 and
    vocab_size-1. The sequence length is equal to the document length.
    """
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K=None):
    """
    Logarithm of the multinomial beta function.
    """
    if K is None:
        # alpha is assumed to be a vector
        return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
    else:
        # alpha is assumed to be a scalar
        return K * gammaln(alpha) - gammaln(K*alpha)

class LdaSampler(object):

    def __init__(self, n_topics, alpha=0.1, beta=0.1):
        """
        n_topics: desired number of topics
        alpha: a scalar (FIXME: accept vector of size n_topics)
        beta: a scalar (FIXME: accept vector of size vocab_size)
        """
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta

    def _initialize(self, matrix):
        n_docs, vocab_size = matrix.shape

        # number of times document m and topic z co-occur
        self.nmz = np.zeros((n_docs, self.n_topics))
        # number of times topic z and word w co-occur
        self.nzw = np.zeros((self.n_topics, vocab_size))
        self.nm = np.zeros(n_docs)
        self.nz = np.zeros(self.n_topics)
        self.topics = {}

        for m in xrange(n_docs):
            # i is a number between 0 and doc_length-1
            # w is a number between 0 and vocab_size-1
            for i, w in enumerate(word_indices(matrix[m, :])):
                # choose an arbitrary topic as first topic for word i
                z = np.random.randint(self.n_topics)
                self.nmz[m,z] += 1
                self.nm[m] += 1
                self.nzw[z,w] += 1
                self.nz[z] += 1
                self.topics[(m,i)] = z

    def _conditional_distribution(self, m, w):
        """
        Conditional distribution (vector of size n_topics).
        """
        vocab_size = self.nzw.shape[1]
        left = (self.nzw[:,w] + self.beta) / \
               (self.nz + self.beta * vocab_size)
        right = (self.nmz[m,:] + self.alpha) / \
                (self.nm[m] + self.alpha * self.n_topics)
        p_z = left * right
        # normalize to obtain probabilities
        p_z /= np.sum(p_z)
        return p_z

    def loglikelihood(self):
        """
        Compute the log-likelihood that the model generated the data.
        """
        vocab_size = self.nzw.shape[1]
        n_docs = self.nmz.shape[0]
        lik = 0

        for z in xrange(self.n_topics):
            lik += log_multi_beta(self.nzw[z,:]+self.beta)
            lik -= log_multi_beta(self.beta, vocab_size)

        for m in xrange(n_docs):
            lik += log_multi_beta(self.nmz[m,:]+self.alpha)
            lik -= log_multi_beta(self.alpha, self.n_topics)

        return lik

    def phi(self):
        """
        Compute phi = p(w|z).
        """
        V = self.nzw.shape[1]
        num = self.nzw + self.beta
        num /= np.sum(num, axis=1)[:, np.newaxis]
        return num

    def run(self, matrix, maxiter=30):
        """
        Run the Gibbs sampler.
        """
        n_docs, vocab_size = matrix.shape
        self._initialize(matrix)

        for it in xrange(maxiter):
            for m in xrange(n_docs):
                for i, w in enumerate(word_indices(matrix[m, :])):
                    z = self.topics[(m,i)]
                    self.nmz[m,z] -= 1
                    self.nm[m] -= 1
                    self.nzw[z,w] -= 1
                    self.nz[z] -= 1

                    p_z = self._conditional_distribution(m, w)
                    z = sample_index(p_z)

                    self.nmz[m,z] += 1
                    self.nm[m] += 1
                    self.nzw[z,w] += 1
                    self.nz[z] += 1
                    self.topics[(m,i)] = z

            # FIXME: burn-in and lag!
            yield self.phi()
if __name__ == "__main__": | |
import os | |
import shutil | |
N_TOPICS = 10 | |
DOCUMENT_LENGTH = 100 | |
FOLDER = "topicimg" | |
def vertical_topic(width, topic_index, document_length): | |
""" | |
Generate a topic whose words form a vertical bar. | |
""" | |
m = np.zeros((width, width)) | |
m[:, topic_index] = int(document_length / width) | |
return m.flatten() | |
def horizontal_topic(width, topic_index, document_length): | |
""" | |
Generate a topic whose words form a horizontal bar. | |
""" | |
m = np.zeros((width, width)) | |
m[topic_index, :] = int(document_length / width) | |
return m.flatten() | |
def save_document_image(filename, doc, zoom=2): | |
""" | |
Save document as an image. | |
doc must be a square matrix | |
""" | |
height, width = doc.shape | |
zoom = np.ones((width*zoom, width*zoom)) | |
# imsave scales pixels between 0 and 255 automatically | |
sp.misc.imsave(filename, np.kron(doc, zoom)) | |

    def gen_word_distribution(n_topics, document_length):
        """
        Generate a word distribution for each of the n_topics.
        """
        width = n_topics / 2
        vocab_size = width ** 2
        m = np.zeros((n_topics, vocab_size))

        for k in range(width):
            m[k,:] = vertical_topic(width, k, document_length)

        for k in range(width):
            m[k+width,:] = horizontal_topic(width, k, document_length)

        m /= m.sum(axis=1)[:, np.newaxis] # turn counts into probabilities

        return m

    def gen_document(word_dist, n_topics, vocab_size, length=DOCUMENT_LENGTH, alpha=0.1):
        """
        Generate a document:
            1) Sample topic proportions from the Dirichlet distribution.
            2) Sample a topic index from the Multinomial with the topic
               proportions from 1).
            3) Sample a word from the Multinomial corresponding to the topic
               index from 2).
            4) Go to 2) if another word is needed.
        """
        theta = np.random.mtrand.dirichlet([alpha] * n_topics)
        v = np.zeros(vocab_size)
        for n in range(length):
            z = sample_index(theta)
            w = sample_index(word_dist[z,:])
            v[w] += 1
        return v

    def gen_documents(word_dist, n_topics, vocab_size, n=500):
        """
        Generate a document-term matrix.
        """
        m = np.zeros((n, vocab_size))
        for i in xrange(n):
            m[i, :] = gen_document(word_dist, n_topics, vocab_size)
        return m

    if os.path.exists(FOLDER):
        shutil.rmtree(FOLDER)
    os.mkdir(FOLDER)

    width = N_TOPICS / 2
    vocab_size = width ** 2
    word_dist = gen_word_distribution(N_TOPICS, DOCUMENT_LENGTH)
    matrix = gen_documents(word_dist, N_TOPICS, vocab_size)
    sampler = LdaSampler(N_TOPICS)

    for it, phi in enumerate(sampler.run(matrix)):
        print "Iteration", it
        print "Likelihood", sampler.loglikelihood()

        if it % 5 == 0:
            for z in range(N_TOPICS):
                save_document_image("topicimg/topic%d-%d.png" % (it,z),
                                    phi[z,:].reshape(width,-1))
Not sure why the import failed to include the misc submodule, but I was able to get the demonstration running with this tweak to save_document_image():
def save_document_image(filename, doc, zoom=2):
    """
    Save document as an image.

    doc must be a square matrix
    """
    height, width = doc.shape
    zoom = np.ones((width*zoom, width*zoom))
    # imsave scales pixels between 0 and 255 automatically
    try:
        sp.misc.imsave(filename, np.kron(doc, zoom))
    except AttributeError:
        import scipy.misc
        scipy.misc.imsave(filename, np.kron(doc, zoom))
Hi @Mathieu, here a constant doc_length is used for the training samples. According to the definition in Fig. 7 of Gregor Heinrich's "Parameter estimation for text analysis", N_m (the document length) should follow a Poisson distribution with mean λ. So the constant length (the loop in gen_document, line 210) would be better replaced with a Poisson-sampled length, e.g.

for n in xrange(np.random.poisson(length)):
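A minimal sketch of that change, assuming the existing length parameter is reused as the Poisson mean (the name gen_document_poisson is not from the gist):

def gen_document_poisson(word_dist, n_topics, vocab_size,
                         length=DOCUMENT_LENGTH, alpha=0.1):
    """
    Like gen_document, but draws the document length N_m from Poisson(length).
    """
    theta = np.random.mtrand.dirichlet([alpha] * n_topics)
    v = np.zeros(vocab_size)
    doc_length = np.random.poisson(length)  # N_m ~ Poisson(lambda)
    for n in xrange(doc_length):
        z = sample_index(theta)
        w = sample_index(word_dist[z, :])
        v[w] += 1
    return v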
Thanks for posting this. I'd like to use this code for a project. If that's ok, can you add a license?
Quick note for anyone struggling with the scipy.misc.imsave import: you need to have PIL installed for that import to work. Python dependency management is crazy!
Re @cdfox's comment, you're much better off doing searchsorted(cumsum(p), rand()) than np.random.multinomial(1,p).argmax(), which is efficient only when you're taking a large sample.
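For reference, a minimal sketch of that replacement (the name fast_sample_index is not from the gist):

def fast_sample_index(p):
    """
    Draw one index from the discrete distribution p by inverting its CDF,
    which avoids building a full multinomial sample for a single draw.
    """
    return np.searchsorted(np.cumsum(p), np.random.rand())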
I implemented a Gibbs sampler for standard LDA inference. My program updates alpha (vector) and beta (scalar) during the iterative sampling process using Minka's fixed-point iteration.
Visit here: https://gist.github.com/ChangUk/a741e0ccf5737954956e
I hope it is helpful for your project. Thanks.
@ChangUk, how do you check for convergence?
How is choosing Dirichlet prior values (alpha = 0.1) different from choosing a uniform prior (alpha = 1)?
There is a Cython implementation at https://github.com/fannix/lda-cython by @fannix.