A script that replicates all examples in my blog post on using the lda Python package for Latent Dirichlet Allocation -- see my lda post for more information. The script requires the lda, numpy, and matplotlib packages.

Run it with:

$ python ex002_lda.py

or, after making it executable:

$ chmod u+x ex002_lda.py
$ ./ex002_lda.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Christopher C. Strelioff <chris.strelioff@gmail.com>
#
# Distributed under terms of the MIT license.
"""
ex002_lda.py -- An example of LDA in Python.
"""
from __future__ import division, print_function

import numpy as np
import lda
import lda.datasets

# document-term matrix
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
# the vocab
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

# titles for each story
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))
doc_id = 0
word_id = 3117
print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- doc : {}".format(titles[doc_id]))
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

# each row of topic_word is a distribution over the vocabulary, so it sums to 1
for n in range(5):
    sum_pr = sum(topic_word[n, :])
    print("topic: {} sum: {}".format(n, sum_pr))
n = 5
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
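
# document-topic distributions: one row per document, one column per topic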
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

# each row of doc_topic is a distribution over topics, so it sums to 1
for n in range(5):
    sum_pr = sum(doc_topic[n, :])
    print("document: {} sum: {}".format(n, sum_pr))
for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n,
                                            topic_most_pr,
                                            titles[n][:50]))
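
# finally, visualize some of the distributions with matplotlib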
import matplotlib.pyplot as plt

# use matplotlib style sheet
try:
    plt.style.use('ggplot')
except Exception:
    # version of matplotlib might not be recent
    pass
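
# plot the topic-word distributions for five of the 20 topics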
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k, :], linefmt='b-',
               markerfmt='bo', basefmt='w-')
    ax[i].set_xlim(-50, 4350)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))
ax[4].set_xlabel("word")
plt.tight_layout()
plt.show()
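
# plot the document-topic distributions for five of the documents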
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
    ax[i].stem(doc_topic[k, :], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))
ax[4].set_xlabel("Topic")
plt.tight_layout()
plt.show()
Hi,
I'm trying to use the above code, but I want to use my own data. Is that possible, and if so, how? Please help.
Hi rosizel,
Sorry for not responding earlier -- I didn't get an email notification of your comment.
From the trace that you provide, it seems like the C code is not compiled, or you don't have numpy installed on your machine. Make sure that you can start Python and import numpy, obtaining something like this:
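$ python
>>> import numpy
>>> numpy.__version__
'1.9.2'

(The exact version number will vary on your machine; the point is that the import succeeds without errors.)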
If that doesn't work, you need to install numpy and try installing lda again.
Best,
Chris