aneesha · May 26, 2020 04:53
diff --git a/display_topics_with_docs.py b/display_topics_with_docs.py
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.decomposition import NMF, LatentDirichletAllocation
 import numpy as np

 def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
     """Print the top words and the top documents for every topic.

     H: topic-by-term weights; each row is treated as one topic and its
        column indices are looked up in ``feature_names``.
     W: document-by-topic weights; column ``topic_idx`` is used to rank the
        entries of ``documents`` for that topic.
     feature_names: sequence mapping a term index to its string.
     documents: sequence of document texts, indexed like the rows of ``W``.
     no_top_words: how many highest-weighted terms to print per topic.
     no_top_documents: how many highest-weighted documents to print per topic.

     Output goes to stdout; nothing is returned.
     """
     for topic_idx, topic in enumerate(H):
         # Single-argument print(...) behaves the same on Python 2 and 3.
         print("Topic %d:" % (topic_idx))
         # argsort is ascending, so walk it backwards from the end to get
         # the no_top_words largest weights in descending order.
         top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
         print(" ".join([feature_names[i] for i in top_word_indices]))
         # Same idea for documents: reverse the ascending order, keep the head.
         top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
         for doc_index in top_doc_indices:
             print(documents[doc_index])

 # Fetch the 20 Newsgroups corpus, asking for headers, footers and quotes to be removed.
 dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
 documents = dataset.data

 # Vocabulary size cap passed to both vectorizers below.
 no_features = 1000

 # NMF is able to use tf-idf
 # NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
 # releases use get_feature_names_out() instead.
 tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
 tfidf = tfidf_vectorizer.fit_transform(documents)
 tfidf_feature_names = tfidf_vectorizer.get_feature_names()

 # LDA can only use raw term counts because it is a probabilistic graphical model
 tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
 tf = tf_vectorizer.fit_transform(documents)
 tf_feature_names = tf_vectorizer.get_feature_names()

 no_topics = 5

 # Run NMF
 # NOTE(review): NMF's `alpha` argument was removed in scikit-learn 1.2 in favour
 # of alpha_W / alpha_H. That is not a drop-in rename; confirm the regularization
 # scaling before migrating, since it can change the fitted topics.
 nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
 nmf_W = nmf_model.transform(tfidf)  # document-topic weights
 nmf_H = nmf_model.components_       # topic-term weights

 # Run LDA
 # `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in
 # 0.21; `n_components` works on every release from 0.19 onwards.
 lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
 lda_W = lda_model.transform(tf)
 lda_H = lda_model.components_

 # Print the top words and top documents of each topic, first for NMF, then LDA.
 no_top_words = 5
 no_top_documents = 2
 display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
 display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	import numpy as np

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
		"""Print the top words and the top documents for every topic.

		H: topic-by-term weights; each row is treated as one topic and its
		   column indices are looked up in ``feature_names``.
		W: document-by-topic weights; column ``topic_idx`` is used to rank the
		   entries of ``documents`` for that topic.
		feature_names: sequence mapping a term index to its string.
		documents: sequence of document texts, indexed like the rows of ``W``.
		no_top_words: how many highest-weighted terms to print per topic.
		no_top_documents: how many highest-weighted documents to print per topic.

		Output goes to stdout; nothing is returned.
		"""
		for topic_idx, topic in enumerate(H):
			# Single-argument print(...) behaves the same on Python 2 and 3.
			print("Topic %d:" % (topic_idx))
			# argsort is ascending, so walk it backwards from the end to get
			# the no_top_words largest weights in descending order.
			top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
			print(" ".join([feature_names[i] for i in top_word_indices]))
			# Same idea for documents: reverse the ascending order, keep the head.
			top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
			for doc_index in top_doc_indices:
				print(documents[doc_index])

	# Fetch the 20 Newsgroups corpus, asking for headers, footers and quotes to be removed.
	dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
	documents = dataset.data

	# Vocabulary size cap passed to both vectorizers below.
	no_features = 1000

	# NMF is able to use tf-idf
	# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
	# releases use get_feature_names_out() instead.
	tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
	tfidf = tfidf_vectorizer.fit_transform(documents)
	tfidf_feature_names = tfidf_vectorizer.get_feature_names()

	# LDA can only use raw term counts because it is a probabilistic graphical model
	tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
	tf = tf_vectorizer.fit_transform(documents)
	tf_feature_names = tf_vectorizer.get_feature_names()

	no_topics = 5

	# Run NMF
	# NOTE(review): NMF's `alpha` argument was removed in scikit-learn 1.2 in favour
	# of alpha_W / alpha_H. That is not a drop-in rename; confirm the regularization
	# scaling before migrating, since it can change the fitted topics.
	nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
	nmf_W = nmf_model.transform(tfidf)  # document-topic weights
	nmf_H = nmf_model.components_       # topic-term weights

	# Run LDA
	# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in
	# 0.21; `n_components` works on every release from 0.19 onwards.
	lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
	lda_W = lda_model.transform(tf)
	lda_H = lda_model.components_

	# Print the top words and top documents of each topic, first for NMF, then LDA.
	no_top_words = 5
	no_top_documents = 2
	display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
	display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)