Skip to content

Instantly share code, notes, and snippets.

@aneesha
Created September 3, 2016 12:05
Show Gist options
  • Save aneesha/962db2e378128d99eb26ba1ae29c8e5c to your computer and use it in GitHub Desktop.
Save aneesha/962db2e378128d99eb26ba1ae29c8e5c to your computer and use it in GitHub Desktop.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
for topic_idx, topic in enumerate(H):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
for doc_index in top_doc_indices:
print documents[doc_index]
# Single line documents from http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000
documents = [
"Human machine interface for Lab ABC computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user-perceived response time to error measurement",
"The generation of random, binary, unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV: Widths of trees and quasi-ordering",
"Graph minors: A survey"
]
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()
no_topics = 2
# Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_
# Run LDA
lda_model = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_
no_top_words = 4
no_top_documents = 4
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment