A script that replicates all examples in my blog post on using the lda
Python package for Latent Dirichlet Allocation-- see
my lda post for more information.
$ python ex002_lda.py
Or,
$ chmod u+x ex002_lda.py
$ ./ex002_lda.py
A script that replicates all examples in my blog post on using the lda
Python package for Latent Dirichlet Allocation-- see
my lda post for more information.
$ python ex002_lda.py
Or,
$ chmod u+x ex002_lda.py
$ ./ex002_lda.py
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# vim:fenc=utf-8 | |
# | |
# Copyright © 2014 Christopher C. Strelioff <[email protected]> | |
# | |
# Distributed under terms of the MIT license. | |
""" | |
ex002.py -- An example of LDA in Python. | |
""" | |
from __future__ import division, print_function | |
import numpy as np | |
import lda | |
import lda.datasets | |
# document-term matrix | |
X = lda.datasets.load_reuters() | |
print("type(X): {}".format(type(X))) | |
print("shape: {}\n".format(X.shape)) | |
# the vocab | |
vocab = lda.datasets.load_reuters_vocab() | |
print("type(vocab): {}".format(type(vocab))) | |
print("len(vocab): {}\n".format(len(vocab))) | |
# titles for each story | |
titles = lda.datasets.load_reuters_titles() | |
print("type(titles): {}".format(type(titles))) | |
print("len(titles): {}\n".format(len(titles))) | |
doc_id = 0 | |
word_id = 3117 | |
print("doc id: {} word id: {}".format(doc_id, word_id)) | |
print("-- count: {}".format(X[doc_id, word_id])) | |
print("-- word : {}".format(vocab[word_id])) | |
print("-- doc : {}".format(titles[doc_id])) | |
model = lda.LDA(n_topics=20, n_iter=500, random_state=1) | |
model.fit(X) | |
topic_word = model.topic_word_ | |
print("type(topic_word): {}".format(type(topic_word))) | |
print("shape: {}".format(topic_word.shape)) | |
for n in range(5): | |
sum_pr = sum(topic_word[n,:]) | |
print("topic: {} sum: {}".format(n, sum_pr)) | |
n = 5 | |
for i, topic_dist in enumerate(topic_word): | |
topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1] | |
print('*Topic {}\n- {}'.format(i, ' '.join(topic_words))) | |
doc_topic = model.doc_topic_ | |
print("type(doc_topic): {}".format(type(doc_topic))) | |
print("shape: {}".format(doc_topic.shape)) | |
for n in range(5): | |
sum_pr = sum(doc_topic[n,:]) | |
print("document: {} sum: {}".format(n, sum_pr)) | |
for n in range(10): | |
topic_most_pr = doc_topic[n].argmax() | |
print("doc: {} topic: {}\n{}...".format(n, | |
topic_most_pr, | |
titles[n][:50])) | |
import matplotlib.pyplot as plt | |
# use matplotlib style sheet | |
try: | |
plt.style.use('ggplot') | |
except: | |
# version of matplotlib might not be recent | |
pass | |
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True) | |
for i, k in enumerate([0, 5, 9, 14, 19]): | |
ax[i].stem(topic_word[k,:], linefmt='b-', | |
markerfmt='bo', basefmt='w-') | |
ax[i].set_xlim(-50,4350) | |
ax[i].set_ylim(0, 0.08) | |
ax[i].set_ylabel("Prob") | |
ax[i].set_title("topic {}".format(k)) | |
ax[4].set_xlabel("word") | |
plt.tight_layout() | |
plt.show() | |
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True) | |
for i, k in enumerate([1, 3, 4, 8, 9]): | |
ax[i].stem(doc_topic[k,:], linefmt='r-', | |
markerfmt='ro', basefmt='w-') | |
ax[i].set_xlim(-1, 21) | |
ax[i].set_ylim(0, 1) | |
ax[i].set_ylabel("Prob") | |
ax[i].set_title("Document {}".format(k)) | |
ax[4].set_xlabel("Topic") | |
plt.tight_layout() | |
plt.show() |
Hi,
I'm trying to use above code. But i want to use my own data. Is it possible, if it is then how to do so. Please help.