anirudhr95 · January 6, 2017 11:01
diff --git a/topic_analysis.py b/topic_analysis.py
 import pandas as pd
 import xlrd
 from collections import defaultdict
 from nltk.corpus import stopwords 
 from nltk.stem.wordnet import WordNetLemmatizer
 import string
 import re


 # Open the workbook and fetch the two sheets this script refers to.
 excel_object = xlrd.open_workbook('sample.xlsx')
 chat_logs = excel_object.sheet_by_name('chatLogs')
 # NOTE(review): user_feedback is not read anywhere in the visible script.
 user_feedback = excel_object.sheet_by_name('UserFeedback')

 # conversation key -> {'bot': text, 'cust': text}; missing entries default to ''.
 key_wise_feedback = defaultdict(lambda: defaultdict(str))

 key = ''

 # Read each column once. Calling chat_logs.col(0) inside the loop rebuilt the
 # whole column for every row.
 key_cells = chat_logs.col(0)
 message_cells = chat_logs.col(4)

 # Row 0 is skipped (presumably a header row -- confirm against the sheet).
 for key_cell, message_cell in zip(key_cells[1:], message_cells[1:]):
     text = message_cell.value.strip()

     # A non-empty cell in column 0 starts a new conversation. Use the cell's
     # value directly: splitting str(cell) on "'" raised IndexError for numeric
     # cells and truncated keys that contain an apostrophe.
     if key_cell.value != '':
         key = str(key_cell.value)
         # The message on the key row itself is discarded, as before.
         # NOTE(review): confirm this row never carries a real message.
         text = ''

     # Messages prefixed with 'VA:  ' go to the 'bot' side, everything else to
     # 'cust'.
     speaker = 'bot' if text.startswith('VA:  ') else 'cust'

     # Keep a space between messages so the last word of one message does not
     # fuse with the first word of the next when the text is tokenised later.
     key_wise_feedback[key][speaker] += text + ' '


 # Shared resources for clean(): English stop words, the punctuation characters
 # to strip, and the WordNet lemmatizer.
 stop = set(stopwords.words('english'))
 exclude = set(string.punctuation)
 lemma = WordNetLemmatizer()


 def clean(doc):
     """Return *doc* lower-cased, stop-word free, punctuation free and lemmatized.

     The steps run in this order: lower-case and split on whitespace, drop
     words found in ``stop``, delete every character found in ``exclude``,
     then lemmatize each remaining token. Tokens are joined back with single
     spaces.
     """
     kept_words = [word for word in doc.lower().split() if word not in stop]
     without_punctuation = ''.join(
         char for char in " ".join(kept_words) if char not in exclude
     )
     lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
     return " ".join(lemmas)

 # Replace each conversation's raw text with a one-element list holding its
 # cleaned token list (a single "document" per side).
 for session in key_wise_feedback.values():
     session['bot'] = [clean(session['bot']).split()]
     session['cust'] = [clean(session['cust']).split()]

 # Debug peek: show the customer tokens of the first conversation only.
 for session_id, session in key_wise_feedback.items():
     print(session_id, session['cust'])
     break

 import gensim
 from gensim import corpora

 def return_corpus(text):
     """Build a ``corpora.Dictionary`` from *text* and return it.

     *text* is handed to ``corpora.Dictionary`` unchanged.
     """
     dictionary = corpora.Dictionary(text)
     return dictionary

 # for i in key_wise_feedback:
 # 	key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot'])
 # 	key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust'])

 # Collect every token of every conversation, per side, to build one shared
 # vocabulary for the bot and one for the customer.
 bot_tokens = []
 cust_tokens = []

 for session in key_wise_feedback.values():
     for words in session['bot']:
         # Drop the standalone 'va' token only. The previous
         # corpus_bot.replace(u'va', '') removed the substring from inside
         # every word (e.g. 'value' -> 'lue'), so those words never matched
         # the documents passed to doc2bow.
         bot_tokens.extend(word for word in words if word != u'va')
     for words in session['cust']:
         cust_tokens.extend(words)

 # Join with a space between conversations as well as between words; plain
 # string concatenation fused the last word of one conversation with the first
 # word of the next.
 corpus_bot = u' '.join(bot_tokens)
 corpus_cust = u' '.join(cust_tokens)

 # Each dictionary is built from a single document holding all tokens.
 dictionary_bot = corpora.Dictionary([bot_tokens])
 dictionary_cust = corpora.Dictionary([cust_tokens])

 # def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust):

 # 	# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
 # 	doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot]
 # 	doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust]

 # 	return doc_term_matrix_bot,doc_term_matrix_cust

 # for i in key_wise_feedback:
 # 	key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust'])

 # Convert each conversation's token lists into bag-of-words vectors using the
 # shared bot / customer dictionaries.
 for session in key_wise_feedback.values():
     bot_docs = session['bot']
     session['bot_matrix'] = [dictionary_bot.doc2bow(tokens) for tokens in bot_docs]
     cust_docs = session['cust']
     session['cust_matrix'] = [dictionary_cust.doc2bow(tokens) for tokens in cust_docs]

 # for i in key_wise_feedback:
 # 	print(key_wise_feedback[i]['bot_matrix'])
 # 	break

 # Short alias for the gensim LDA model class.
 Lda = gensim.models.ldamodel.LdaModel

 # Train one 2-topic model per conversation and per side (bot first, then
 # customer), 50 passes each, over that conversation's bag-of-words matrix.
 for session in key_wise_feedback.values():
     bot_model = Lda(
         session['bot_matrix'],
         num_topics=2,
         id2word=dictionary_bot,
         passes=50,
     )
     session['bot_lda'] = bot_model

     cust_model = Lda(
         session['cust_matrix'],
         num_topics=2,
         id2word=dictionary_cust,
         passes=50,
     )
     session['cust_lda'] = cust_model


 # Pulls the double-quoted substrings out of a formatted topic string.
 quoted_word = re.compile(r'"([^"]*)"')

 # Store the words of the first listed topic for every conversation.
 # Previously the `break` used for the debug print sat in this same loop, so
 # only the first conversation ever received 'bot_words' / 'cust_words'.
 for session in key_wise_feedback.values():
     bot_topics = session['bot_lda'].print_topics(num_topics=2, num_words=3)
     cust_topics = session['cust_lda'].print_topics(num_topics=2, num_words=3)
     # Element [0][1] is the formatted string of the first listed topic.
     session['bot_words'] = " ".join(quoted_word.findall(bot_topics[0][1]))
     session['cust_words'] = " ".join(quoted_word.findall(cust_topics[0][1]))

 # Debug peek: print the extracted words of the first conversation only.
 for session in key_wise_feedback.values():
     print(session['bot_words'])
     print(session['cust_words'])
     break
	import pandas as pd
	import xlrd
	from collections import defaultdict
	from nltk.corpus import stopwords
	from nltk.stem.wordnet import WordNetLemmatizer
	import string
	import re


	# Open the workbook and fetch the two sheets this script refers to.
	excel_object = xlrd.open_workbook('sample.xlsx')
	chat_logs = excel_object.sheet_by_name('chatLogs')
	# NOTE(review): user_feedback is not read anywhere in the visible script.
	user_feedback = excel_object.sheet_by_name('UserFeedback')

	# conversation key -> {'bot': text, 'cust': text}; missing entries default to ''.
	key_wise_feedback = defaultdict(lambda: defaultdict(str))

	key = ''

	# Read each column once. Calling chat_logs.col(0) inside the loop rebuilt the
	# whole column for every row.
	key_cells = chat_logs.col(0)
	message_cells = chat_logs.col(4)

	# Row 0 is skipped (presumably a header row -- confirm against the sheet).
	# The loop and branch bodies are nested again here; the flattened original
	# was not valid Python.
	for key_cell, message_cell in zip(key_cells[1:], message_cells[1:]):
		text = message_cell.value.strip()

		# A non-empty cell in column 0 starts a new conversation. Use the cell's
		# value directly: splitting str(cell) on "'" raised IndexError for numeric
		# cells and truncated keys that contain an apostrophe.
		if key_cell.value != '':
			key = str(key_cell.value)
			# The message on the key row itself is discarded, as before.
			# NOTE(review): confirm this row never carries a real message.
			text = ''

		# Messages prefixed with 'VA: ' go to the 'bot' side, everything else to
		# 'cust'.
		speaker = 'bot' if text.startswith('VA: ') else 'cust'

		# Keep a space between messages so the last word of one message does not
		# fuse with the first word of the next when the text is tokenised later.
		key_wise_feedback[key][speaker] += text + ' '


	# Shared resources for clean(): English stop words, the punctuation characters
	# to strip, and the WordNet lemmatizer.
	stop = set(stopwords.words('english'))
	exclude = set(string.punctuation)
	lemma = WordNetLemmatizer()


	def clean(doc):
		"""Return *doc* lower-cased, stop-word free, punctuation free and lemmatized.

		The steps run in this order: lower-case and split on whitespace, drop
		words found in ``stop``, delete every character found in ``exclude``,
		then lemmatize each remaining token. Tokens are joined back with single
		spaces.
		"""
		# The function body is indented again; the flattened original left it at
		# the same level as the ``def`` line, which is a syntax error.
		stop_free = " ".join([word for word in doc.lower().split() if word not in stop])
		punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
		normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
		return normalized

	# Replace each conversation's raw text with a one-element list holding its
	# cleaned token list (a single "document" per side). The loop bodies are
	# nested again; the flattened original was not valid Python.
	for i in key_wise_feedback:
		key_wise_feedback[i]['bot'] = [clean(key_wise_feedback[i]['bot']).split()]
		key_wise_feedback[i]['cust'] = [clean(key_wise_feedback[i]['cust']).split()]

	# Debug peek: show the customer tokens of the first conversation only.
	for i in key_wise_feedback:
		print(i, key_wise_feedback[i]['cust'])
		break

	import gensim
	from gensim import corpora

	def return_corpus(text):
		"""Build a ``corpora.Dictionary`` from *text* and return it.

		*text* is handed to ``corpora.Dictionary`` unchanged.
		"""
		# Body indented under the ``def``; the flattened original was a syntax error.
		return corpora.Dictionary(text)

	# for i in key_wise_feedback:
	# key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot'])
	# key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust'])

	# Collect every token of every conversation, per side, to build one shared
	# vocabulary for the bot and one for the customer. Loop bodies are nested
	# again; the flattened original was not valid Python.
	bot_tokens = []
	cust_tokens = []

	for session in key_wise_feedback.values():
		for words in session['bot']:
			# Drop the standalone 'va' token only. The previous
			# corpus_bot.replace(u'va', '') removed the substring from inside
			# every word (e.g. 'value' -> 'lue'), so those words never matched
			# the documents passed to doc2bow.
			bot_tokens.extend(word for word in words if word != u'va')
		for words in session['cust']:
			cust_tokens.extend(words)

	# Join with a space between conversations as well as between words; plain
	# string concatenation fused the last word of one conversation with the first
	# word of the next.
	corpus_bot = u' '.join(bot_tokens)
	corpus_cust = u' '.join(cust_tokens)

	# Each dictionary is built from a single document holding all tokens.
	dictionary_bot = corpora.Dictionary([bot_tokens])
	dictionary_cust = corpora.Dictionary([cust_tokens])

	# def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust):

	# # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
	# doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot]
	# doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust]

	# return doc_term_matrix_bot,doc_term_matrix_cust

	# for i in key_wise_feedback:
	# key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust'])

	# Convert each conversation's token lists into bag-of-words vectors using the
	# shared bot / customer dictionaries. The loop body is nested again; the
	# flattened original was not valid Python.
	for i in key_wise_feedback:
		key_wise_feedback[i]['bot_matrix'] = [dictionary_bot.doc2bow(doc) for doc in key_wise_feedback[i]['bot']]
		key_wise_feedback[i]['cust_matrix'] = [dictionary_cust.doc2bow(doc) for doc in key_wise_feedback[i]['cust']]

	# for i in key_wise_feedback:
	# print(key_wise_feedback[i]['bot_matrix'])
	# break

	# Short alias for the gensim LDA model class.
	Lda = gensim.models.ldamodel.LdaModel

	# Train one 2-topic model per conversation and per side (bot first, then
	# customer), 50 passes each, over that conversation's bag-of-words matrix.
	# The loop body is nested again; the flattened original was not valid Python.
	for i in key_wise_feedback:
		key_wise_feedback[i]['bot_lda'] = Lda(key_wise_feedback[i]['bot_matrix'], num_topics=2, id2word=dictionary_bot, passes=50)
		key_wise_feedback[i]['cust_lda'] = Lda(key_wise_feedback[i]['cust_matrix'], num_topics=2, id2word=dictionary_cust, passes=50)


	# Pulls the double-quoted substrings out of a formatted topic string.
	quoted_word = re.compile(r'"([^"]*)"')

	# Store the words of the first listed topic for every conversation.
	# Previously the `break` used for the debug print sat in this same loop, so
	# only the first conversation ever received 'bot_words' / 'cust_words'.
	# Loop bodies are nested again; the flattened original was not valid Python.
	for session in key_wise_feedback.values():
		bot_topics = session['bot_lda'].print_topics(num_topics=2, num_words=3)
		cust_topics = session['cust_lda'].print_topics(num_topics=2, num_words=3)
		# Element [0][1] is the formatted string of the first listed topic.
		session['bot_words'] = " ".join(quoted_word.findall(bot_topics[0][1]))
		session['cust_words'] = " ".join(quoted_word.findall(cust_topics[0][1]))

	# Debug peek: print the extracted words of the first conversation only.
	for session in key_wise_feedback.values():
		print(session['bot_words'])
		print(session['cust_words'])
		break