Skip to content

Instantly share code, notes, and snippets.

@cstrelioff
Last active November 16, 2017 04:36
Show Gist options
  • Save cstrelioff/38b1d16a1253c962b7d7 to your computer and use it in GitHub Desktop.
Save cstrelioff/38b1d16a1253c962b7d7 to your computer and use it in GitHub Desktop.
Python + Latent Dirichlet Allocation -- example 2

A script that replicates all examples in my blog post on using the lda Python package for Latent Dirichlet Allocation-- see my lda post for more information.

Run all the examples

    $ python ex002_lda.py

Or,

    $ chmod u+x ex002_lda.py
    $ ./ex002_lda.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Christopher C. Strelioff <[email protected]>
#
# Distributed under terms of the MIT license.
"""
ex002.py -- An example of LDA in Python.
"""
from __future__ import division, print_function
import numpy as np
import lda
import lda.datasets
# document-term matrix
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
# the vocab
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))
# titles for each story
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))
doc_id = 0
word_id = 3117
print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- doc : {}".format(titles[doc_id]))
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))
for n in range(5):
sum_pr = sum(topic_word[n,:])
print("topic: {} sum: {}".format(n, sum_pr))
n = 5
for i, topic_dist in enumerate(topic_word):
topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))
for n in range(5):
sum_pr = sum(doc_topic[n,:])
print("document: {} sum: {}".format(n, sum_pr))
for n in range(10):
topic_most_pr = doc_topic[n].argmax()
print("doc: {} topic: {}\n{}...".format(n,
topic_most_pr,
titles[n][:50]))
import matplotlib.pyplot as plt
# use matplotlib style sheet
try:
plt.style.use('ggplot')
except:
# version of matplotlib might not be recent
pass
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
ax[i].stem(topic_word[k,:], linefmt='b-',
markerfmt='bo', basefmt='w-')
ax[i].set_xlim(-50,4350)
ax[i].set_ylim(0, 0.08)
ax[i].set_ylabel("Prob")
ax[i].set_title("topic {}".format(k))
ax[4].set_xlabel("word")
plt.tight_layout()
plt.show()
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
ax[i].stem(doc_topic[k,:], linefmt='r-',
markerfmt='ro', basefmt='w-')
ax[i].set_xlim(-1, 21)
ax[i].set_ylim(0, 1)
ax[i].set_ylabel("Prob")
ax[i].set_title("Document {}".format(k))
ax[4].set_xlabel("Topic")
plt.tight_layout()
plt.show()
@sangryal
Copy link

sangryal commented Apr 2, 2016

Hi,
I'm trying to use above code. But i want to use my own data. Is it possible, if it is then how to do so. Please help.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment