# uncomment the next line if gensim is not installed
#!pip install gensim
import gensim
# need the interactive tools for Matplotlib in the notebook
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
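These imports set up a 2-D visualization of high-dimensional vectors. The sketch below is a minimal, hypothetical example: it projects a random placeholder matrix with t-SNE and plots it; in practice the rows would be vectors such as gensim embeddings or the document-topic weights produced by the models in the later snippets.

# Minimal t-SNE sketch; `vectors` is a random stand-in for real embeddings.
vectors = np.random.RandomState(0).rand(100, 50)
tsne = TSNE(n_components=2, perplexity=30, random_state=0)
coords = tsne.fit_transform(vectors)        # shape: (100, 2)
plt.figure()
plt.scatter(coords[:, 0], coords[:, 1], s=10)
plt.title("t-SNE projection (placeholder vectors)")
plt.show()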
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    # H: topic-term matrix, W: document-topic matrix
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    # for each topic, print the top words and the documents that load most heavily on it
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
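A usage sketch for this version of display_topics, assuming the fitted nmf and lda models and the tfidf/tf matrices from the snippets further down: W comes from the model's transform() and H from components_.

# Assumes nmf, lda, tfidf, tf, tfidf_feature_names, tf_feature_names
# and documents are defined as in the other snippets.
no_top_words = 10
no_top_documents = 3
nmf_W = nmf.transform(tfidf)      # document-topic weights
nmf_H = nmf.components_           # topic-term weights
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)

lda_W = lda.transform(tf)
lda_H = lda.components_
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)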
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    # print the no_top_words highest-weighted terms for each topic
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# Run NMF (in scikit-learn >= 1.2 the old `alpha` argument is split into alpha_W/alpha_H)
nmf = NMF(n_components=no_topics, random_state=1, alpha_W=0.1, l1_ratio=0.5, init='nndsvd').fit(tfidf)

# Run LDA (`n_topics` was renamed to `n_components` in newer scikit-learn)
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0).fit(tf)
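One way to inspect the fitted models beyond their top words, sketched under the assumption that tf is the count matrix from the vectorizer snippet: LDA's transform() returns a per-document topic distribution, so argmax gives each document's dominant topic.

# Per-document topic weights (rows roughly sum to 1 for LDA).
doc_topic = lda.transform(tf)               # shape: (n_documents, no_topics)
dominant_topic = doc_topic.argmax(axis=1)   # index of the strongest topic per document
print(doc_topic.shape)
print(dominant_topic[:10])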
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()
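A quick sanity check on the vectorizer output, assuming the variables above: both matrices should have one row per document and at most no_features columns.

print(tfidf.shape)                      # (n_documents, <= no_features)
print(tf.shape)
print(list(tfidf_feature_names[:10]))   # a few vocabulary terms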
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
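A small, optional check on what was loaded (by default the 20 Newsgroups training subset) before vectorizing:

print(len(documents))          # number of newsgroup posts loaded
print(documents[0][:200])      # start of one raw post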