aneesha · September 3, 2016 12:05
diff --git a/display_topics_with_docs_toyexample.py b/display_topics_with_docs_toyexample.py
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.decomposition import NMF, LatentDirichletAllocation
 import numpy as np

 def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the top words and the top documents of every topic.

    For each row of ``H`` this prints a ``Topic <index>:`` header, then one
    line with the ``no_top_words`` highest-weighted feature names joined by
    spaces, then one line per document for the ``no_top_documents``
    documents with the largest weight in the matching column of ``W``.

    Args:
        H: 2-D array iterated row by row; each row is one topic and holds
            one weight per entry of ``feature_names``.
        W: 2-D array indexed as ``W[:, topic_idx]``; each column holds one
            weight per entry of ``documents``.
        feature_names: Sequence of strings, indexed by position in a row
            of ``H``.
        documents: Sequence of printable documents, indexed by row of ``W``.
        no_top_words: Number of feature names to print per topic.
        no_top_documents: Number of documents to print per topic.

    Returns:
        None. All output goes to standard output.
    """
    for topic_idx, topic in enumerate(H):
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # unlike the print statement, which is a SyntaxError on Python 3.
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so walk backwards from the end to get the
        # largest weights first.
        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_word_indices]))
        # Rank documents by their weight for this topic, largest first.
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

 # Toy corpus of single-line documents, taken from
 # http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000
 documents = [
     "Human machine interface for Lab ABC computer applications",
     "A survey of user opinion of computer system response time",
     "The EPS user interface management system",
     "System and human system engineering testing of EPS",
     "Relation of user-perceived response time to error measurement",
     "The generation of random, binary, unordered trees",
     "The intersection graph of paths in trees",
     "Graph minors IV: Widths of trees and quasi-ordering",
     "Graph minors: A survey",
 ]

 # NOTE(review): the keyword and method names below (`alpha`, `n_topics`,
 # `get_feature_names`) are spelled as in the scikit-learn release this
 # script was written for; confirm them against the installed version.

 # tf-idf features for NMF, which is able to work with tf-idf weights.
 tfidf_vectorizer = TfidfVectorizer(
     max_df=0.95,
     min_df=2,
     stop_words='english',
 )
 tfidf = tfidf_vectorizer.fit_transform(documents)
 tfidf_feature_names = tfidf_vectorizer.get_feature_names()

 # Raw term counts for LDA: as a probabilistic graphical model it can only
 # use counts, not tf-idf weights.
 tf_vectorizer = CountVectorizer(
     max_df=0.95,
     min_df=2,
     stop_words='english',
 )
 tf = tf_vectorizer.fit_transform(documents)
 tf_feature_names = tf_vectorizer.get_feature_names()

 # Both models are fitted with the same number of topics.
 no_topics = 2

 # NMF: fit on the tf-idf matrix, then pull out W (document-topic weights
 # from transform) and H (the fitted components).
 nmf_model = NMF(
     n_components=no_topics,
     random_state=1,
     alpha=0.1,
     l1_ratio=0.5,
     init='nndsvd',
 ).fit(tfidf)
 nmf_W = nmf_model.transform(tfidf)
 nmf_H = nmf_model.components_

 # LDA: fit on the raw counts, then pull out W and H the same way.
 lda_model = LatentDirichletAllocation(
     n_topics=no_topics,
     max_iter=5,
     learning_method='online',
     learning_offset=50.0,
     random_state=0,
 ).fit(tf)
 lda_W = lda_model.transform(tf)
 lda_H = lda_model.components_

 # Report the NMF topics first, then the LDA topics.
 no_top_words = 4
 no_top_documents = 4
 display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
 display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	import numpy as np

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
		"""Print the top words and the top documents of every topic.

		For each row of ``H`` this prints a ``Topic <index>:`` header, then
		one line with the ``no_top_words`` highest-weighted feature names
		joined by spaces, then one line per document for the
		``no_top_documents`` documents with the largest weight in the
		matching column of ``W``.

		Args:
			H: 2-D array iterated row by row; each row is one topic and
				holds one weight per entry of ``feature_names``.
			W: 2-D array indexed as ``W[:, topic_idx]``; each column holds
				one weight per entry of ``documents``.
			feature_names: Sequence of strings, indexed by position in a
				row of ``H``.
			documents: Sequence of printable documents, indexed by row of
				``W``.
			no_top_words: Number of feature names to print per topic.
			no_top_documents: Number of documents to print per topic.

		Returns:
			None. All output goes to standard output.
		"""
		for topic_idx, topic in enumerate(H):
			# Single-argument print(...) behaves the same on Python 2 and 3,
			# unlike the print statement, which is a SyntaxError on Python 3.
			print("Topic %d:" % topic_idx)
			# argsort is ascending, so walk backwards from the end to get
			# the largest weights first.
			top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
			print(" ".join([feature_names[i] for i in top_word_indices]))
			# Rank documents by their weight for this topic, largest first.
			top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
			for doc_index in top_doc_indices:
				print(documents[doc_index])

	# Toy corpus of single-line documents, taken from
	# http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000
	documents = [
		"Human machine interface for Lab ABC computer applications",
		"A survey of user opinion of computer system response time",
		"The EPS user interface management system",
		"System and human system engineering testing of EPS",
		"Relation of user-perceived response time to error measurement",
		"The generation of random, binary, unordered trees",
		"The intersection graph of paths in trees",
		"Graph minors IV: Widths of trees and quasi-ordering",
		"Graph minors: A survey",
	]

	# NOTE(review): the keyword and method names below (`alpha`, `n_topics`,
	# `get_feature_names`) are spelled as in the scikit-learn release this
	# script was written for; confirm them against the installed version.

	# tf-idf features for NMF, which is able to work with tf-idf weights.
	tfidf_vectorizer = TfidfVectorizer(
		max_df=0.95,
		min_df=2,
		stop_words='english',
	)
	tfidf = tfidf_vectorizer.fit_transform(documents)
	tfidf_feature_names = tfidf_vectorizer.get_feature_names()

	# Raw term counts for LDA: as a probabilistic graphical model it can only
	# use counts, not tf-idf weights.
	tf_vectorizer = CountVectorizer(
		max_df=0.95,
		min_df=2,
		stop_words='english',
	)
	tf = tf_vectorizer.fit_transform(documents)
	tf_feature_names = tf_vectorizer.get_feature_names()

	# Both models are fitted with the same number of topics.
	no_topics = 2

	# NMF: fit on the tf-idf matrix, then pull out W (document-topic weights
	# from transform) and H (the fitted components).
	nmf_model = NMF(
		n_components=no_topics,
		random_state=1,
		alpha=0.1,
		l1_ratio=0.5,
		init='nndsvd',
	).fit(tfidf)
	nmf_W = nmf_model.transform(tfidf)
	nmf_H = nmf_model.components_

	# LDA: fit on the raw counts, then pull out W and H the same way.
	lda_model = LatentDirichletAllocation(
		n_topics=no_topics,
		max_iter=5,
		learning_method='online',
		learning_offset=50.0,
		random_state=0,
	).fit(tf)
	lda_W = lda_model.transform(tf)
	lda_H = lda_model.components_

	# Report the NMF topics first, then the LDA topics.
	no_top_words = 4
	no_top_documents = 4
	display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
	display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)