aneesha’s gists

aneesha / NNDEIG.m

Created June 7, 2014 12:36

Initialization for symmetric NMF using eigendecomposition, based on the NNDSVD technique by Boutsidis & Gallopoulos (still experimental, as the math may not be correct)

	function [W] = NNDEIG(A,k,flag);
	%
	% This function implements the NNDSVD algorithm described in [1] for
	% initialization of Nonnegative Matrix Factorization Algorithms
	% for symmetric NMF so uses Eigendecomposition
	%
	% [W] = nndeig(A,k,flag);
	%
	% INPUT
	% ------------

aneesha / load20newsgroups.py

Created September 1, 2016 00:09

	from sklearn.datasets import fetch_20newsgroups

	# Fetch the 20 newsgroups corpus with shuffling enabled and a fixed seed
	# (random_state=1); the loader is asked to remove headers, footers and quotes.
	dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
	# Keep only the `data` attribute of the fetched dataset for the later steps.
	documents = dataset.data

aneesha / preprocess.py

Created September 1, 2016 00:13

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

	# Upper bound on vocabulary size, passed as max_features to the vectorizer.
	no_features = 1000

	# NMF is able to use tf-idf
	tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
	# `documents` is not defined in this snippet; presumably the list of raw
	# texts produced by the dataset-loading step -- verify against the caller.
	tfidf = tfidf_vectorizer.fit_transform(documents)
	# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
	# of get_feature_names_out() -- confirm the targeted scikit-learn version.
	tfidf_feature_names = tfidf_vectorizer.get_feature_names()

	# LDA can only use raw term counts because it is a probabilistic graphical model

aneesha / nmf_lda_scikitlearn.py

Created September 1, 2016 00:27

	from sklearn.decomposition import NMF, LatentDirichletAllocation

	# Number of topics/components requested from both models.
	no_topics = 20

	# Run NMF
	# NOTE(review): the `alpha` keyword of NMF was deprecated in scikit-learn 1.0
	# and later removed (replaced by alpha_W / alpha_H) -- confirm the targeted
	# scikit-learn version.
	nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

	# Run LDA
	# `tf` is not defined in this snippet; presumably the raw term-count matrix
	# from the preprocessing step -- verify against the caller.
	# NOTE(review): `n_topics` was renamed to `n_components` in scikit-learn 0.19
	# and later removed -- confirm the targeted scikit-learn version.
	lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

aneesha / displaytopics.py

Created September 1, 2016 00:33

	def display_topics(model, feature_names, no_top_words):
	    """Print the highest-weighted words of every topic in ``model``.

	    For each row of ``model.components_`` a ``Topic <index>:`` header line
	    is printed, followed by one line holding the top words separated by
	    single spaces, most heavily weighted first.

	    Args:
	        model: Object exposing ``components_``, an iterable of per-topic
	            weight vectors that support ``argsort()`` (e.g. NumPy arrays).
	        feature_names: Sequence mapping a feature index to its word.
	        no_top_words: Number of words to print for each topic.
	    """
	    for topic_idx, topic in enumerate(model.components_):
	        # Single-argument print(...) behaves the same on Python 2 and 3.
	        print("Topic %d:" % (topic_idx))
	        # argsort() is ascending, so the reversed tail slice yields the
	        # indices of the no_top_words largest weights, largest first.
	        top_indices = topic.argsort()[:-no_top_words - 1:-1]
	        print(" ".join([feature_names[i] for i in top_indices]))

	# Number of words printed per topic.
	no_top_words = 10
	# Print the topics of both models, each with its own feature-name list.
	# `nmf`, `lda`, `tfidf_feature_names` and `tf_feature_names` are not defined
	# in this snippet; presumably they come from the earlier fitting and
	# preprocessing steps -- verify against the caller.
	display_topics(nmf, tfidf_feature_names, no_top_words)
	display_topics(lda, tf_feature_names, no_top_words)

aneesha / topicmodelling_scikitlearn.py

Created September 1, 2016 00:34

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.decomposition import NMF, LatentDirichletAllocation

	def display_topics(model, feature_names, no_top_words):
	    """Print the highest-weighted words of every topic in ``model``.

	    For each row of ``model.components_`` a ``Topic <index>:`` header line
	    is printed, followed by one line holding the top words separated by
	    single spaces, most heavily weighted first.

	    Args:
	        model: Object exposing ``components_``, an iterable of per-topic
	            weight vectors that support ``argsort()`` (e.g. NumPy arrays).
	        feature_names: Sequence mapping a feature index to its word.
	        no_top_words: Number of words to print for each topic.
	    """
	    for topic_idx, topic in enumerate(model.components_):
	        # Single-argument print(...) behaves the same on Python 2 and 3.
	        print("Topic %d:" % (topic_idx))
	        # argsort() is ascending, so the reversed tail slice yields the
	        # indices of the no_top_words largest weights, largest first.
	        top_indices = topic.argsort()[:-no_top_words - 1:-1]
	        print(" ".join([feature_names[i] for i in top_indices]))

aneesha / displaytopics2.py

Last active July 23, 2017 13:53

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
	    """Print each topic's top words followed by its top documents.

	    For every row of ``H`` a ``Topic <index>:`` header line is printed,
	    then one line with the highest-weighted words separated by single
	    spaces, then one line per top document.

	    Args:
	        H: Iterable of per-topic weight vectors supporting ``argsort()``
	            (e.g. the rows of a NumPy array).
	        W: Array indexed as ``W[:, topic_idx]``; that column is used to
	            rank the documents for the topic.
	        feature_names: Sequence mapping a feature index to its word.
	        documents: Sequence mapping a document index to its text.
	        no_top_words: Number of words to print for each topic.
	        no_top_documents: Number of documents to print for each topic.
	    """
	    for topic_idx, topic in enumerate(H):
	        # Single-argument print(...) behaves the same on Python 2 and 3.
	        print("Topic %d:" % (topic_idx))
	        # argsort() is ascending, so the reversed tail slice yields the
	        # indices of the no_top_words largest weights, largest first.
	        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
	        print(" ".join([feature_names[i] for i in top_word_indices]))
	        # Rank documents by their weight in this topic's column, descending.
	        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
	        for doc_index in top_doc_indices:
	            print(documents[doc_index])

aneesha / display_topics_with_docs.py

Last active May 26, 2020 04:53

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	import numpy as np

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
	for topic_idx, topic in enumerate(H):
	print "Topic %d:" % (topic_idx)
	print " ".join([feature_names[i]
	for i in topic.argsort()[:-no_top_words - 1:-1]])

aneesha / display_topics_with_docs_toyexample.py

Created September 3, 2016 12:05

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	import numpy as np

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
	for topic_idx, topic in enumerate(H):
	print "Topic %d:" % (topic_idx)
	print " ".join([feature_names[i]
	for i in topic.argsort()[:-no_top_words - 1:-1]])
	top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]

aneesha / display_closestwords_tsnescatterplot.ipynb

Last active January 31, 2021 20:11

Use t-SNE to plot only similar words using Word2Vec

Sorry, something went wrong. Reload?

Sorry, we cannot display this file.

Sorry, this file is invalid so it cannot be displayed.