A script that replicates all examples in my blog post on using the lda
Python package for Latent Dirichlet Allocation-- see
my lda post for more information.
$ python ex002_lda.py
Or,
$ chmod u+x ex002_lda.py
$ ./ex002_lda.py
A script that replicates all examples in my blog post on using the lda
Python package for Latent Dirichlet Allocation-- see
my lda post for more information.
$ python ex002_lda.py
Or,
$ chmod u+x ex002_lda.py
$ ./ex002_lda.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Christopher C. Strelioff <[email protected]>
#
# Distributed under terms of the MIT license.
"""
ex002.py -- An example of LDA in Python.
"""
from __future__ import division, print_function

import numpy as np
import lda
import lda.datasets

# --- Load the Reuters example data bundled with the lda package -------------

# Document-term matrix; indexed below as X[doc_id, word_id] to read a count.
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))

# The vocabulary; indexed below by word id (vocab[word_id]).
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

# Titles for each story; indexed below by document id (titles[doc_id]).
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))

# Spot-check a single (document, word) cell to show how the three objects
# line up: the count comes from X, the word from vocab, the title from titles.
doc_id = 0
word_id = 3117
print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- doc : {}".format(titles[doc_id]))
# --- Fit the model and inspect the topic-word matrix -------------------------

# 20 topics, 500 iterations; random_state is fixed, presumably so that repeated
# runs give the same output (confirm against the lda package documentation).
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

# Print the row sums for the first five topics (each row is expected to be a
# probability distribution over the vocabulary, i.e. to sum to ~1).
for n in range(5):
    sum_pr = sum(topic_word[n, :])
    print("topic: {} sum: {}".format(n, sum_pr))

# Number of top words to list per topic.
n = 5
# Convert the vocabulary to an array once, outside the loop, so that it can be
# fancy-indexed with the argsort result for every topic without rebuilding it.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so the reversed tail slice picks the n entries
    # with the highest values, largest first.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n + 1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
# --- Inspect the document-topic matrix ---------------------------------------

doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

# Print the row sums for the first five documents (each row is expected to be
# a probability distribution over topics, i.e. to sum to ~1).
for n in range(5):
    sum_pr = sum(doc_topic[n, :])
    print("document: {} sum: {}".format(n, sum_pr))

# For the first ten documents, report the topic with the largest value in the
# document's row, together with the first 50 characters of its title.
for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n,
                                            topic_most_pr,
                                            titles[n][:50]))
# --- Plot the word distribution of a few topics ------------------------------

import matplotlib.pyplot as plt

# Use the 'ggplot' style sheet when it is available. Older matplotlib releases
# either lack plt.style entirely (AttributeError) or do not ship this style
# (reported as ValueError/IOError/OSError depending on the version); in those
# cases fall back to the default look. Only these errors are caught, so that
# KeyboardInterrupt and unrelated failures are not silently swallowed.
try:
    plt.style.use('ggplot')
except (AttributeError, IOError, OSError, ValueError):
    pass

# One stem plot per selected topic, stacked vertically with a shared x axis.
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k, :], linefmt='b-',
               markerfmt='bo', basefmt='w-')
    # Fixed axis limits so that all five panels are directly comparable.
    ax[i].set_xlim(-50, 4350)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))

# Only the bottom panel gets an x label because the x axis is shared.
ax[4].set_xlabel("word")

plt.tight_layout()
plt.show()
# --- Plot the topic distribution of a few documents --------------------------

# One stem plot per selected document, stacked vertically with a shared x axis.
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
    ax[i].stem(doc_topic[k, :], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    # Fixed axis limits so that all five panels are directly comparable.
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))

# Only the bottom panel gets an x label because the x axis is shared.
ax[4].set_xlabel("Topic")

plt.tight_layout()
plt.show()
Hi rosizel,
Sorry for not responding earlier -- I didn't get an email notification of your comment.
From the trace you provided, it seems that either the C code was not compiled or numpy is not installed on your machine. Make sure that you can start Python and import numpy, obtaining something like this:
>>> import numpy as np
>>> print np.__version__
1.9.0
If that doesn't work, you need to install numpy and try installing lda again.
Best,
Chris
Hi,
I'm trying to use the above code, but I want to use my own data. Is that possible? If so, how can I do it? Please help.
hi,
I'm new to Python. I've installed lda following your blog,
but it shows this error:
Traceback (most recent call last):
File "E:/workspace/lda_c/lda/tests/test2.py", line 3, in
import lda
File "E:\workspace\lda_c\lda__init__.py", line 7, in
from lda.lda import LDA # noqa
File "E:\workspace\lda_c\lda\lda.py", line 10, in
import lda._lda
ImportError: No module named _lda
can you help me solve this?