Sooraj smsubrahmannian

Topic	Relevant words	Token percentage
Topic12	{'sas', 'powerpoint', 'python', 'r', 'excel', 'matlab', 'spss', 'sql', 'word', 'stata'}	21%
Topic7	{'classification', 'svm', 'learn', 'k', 'scikit', 'pandas', 'regression', 'matplotlib', 'scipy', 'numpy'}	20.3%
Topic9	{'sqoop', 'kafka', 'cassandra', 'hdfs', 'hbase', 'hive', 'pig', 'impala', 'flume', 'oozie'}	17.5%
Topic10	{'jquery', 'xml', 'css', 'eclipse', 'html', 'c', 'ajax', 'django', 'javascript', 'php'}	14.4%

	from gensim import models,corpora
	import spacy

	# Load the spaCy pipeline registered under the name 'en'; it is used for tokenising/tagging text below.
	nlp = spacy.load('en')
	# Read the dataset from a Feather file into `data`.
	# NOTE(review): `pd` is not imported in the visible snippet — presumably `import pandas as pd` exists elsewhere; confirm.
	data = pd.read_feather('data/preprocessed_data')

	""" Step-1: clean up your text and generate list of words for each document.
	I recommend that you first go through an introductory tutorial on spaCy at this link.
	The content of the clean_up function is designed for one specific task.
	I have provided two examples in the GitHub repo. """

	import spacy

	nlp = spacy.load('en') # load the spaCy language model registered under the name 'en'
	data = pd.read_feather('data/preprocessed_data') # read the pandas DataFrame stored as a Feather file at this path (`pd` is not imported in the visible snippet — confirm it is imported elsewhere)

	def clean_up(text): # clean up your text and generate list of words for each document.
	removal=['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']
	text_out = []
	doc= nlp(text)
	for token in doc:

	from gensim import models,corpora
	import pyLDAvis.gensim

	# lda_final is the LDA model built with 12 topics; it, doc_term_matrix and
	# dictionary are not defined in the visible snippet and must already exist.
	# vis is the pyLDAvis object returned by the prepare() call below.

	vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary,sort_topics=False)

	def get_relevant_words(vis,lam=0.3,topn=10):
	a = vis.topic_info

	Topic,words with Relevance
	Topic9,"{'sqoop', 'kafka', 'cassandra', 'hdfs', 'hbase', 'hive', 'pig', 'impala', 'flume', 'oozie'}"
	Topic10,"{'jquery', 'xml', 'css', 'eclipse', 'html', 'c', 'ajax', 'django', 'javascript', 'php'}"
	Topic12,"{'sas', 'powerpoint', 'python', 'r', 'excel', 'matlab', 'spss', 'sql', 'word', 'stata'}"
	Topic7,"{'classification', 'svm', 'learn', 'k', 'scikit', 'pandas', 'regression', 'matplotlib', 'scipy', 'numpy'}"