# Created January 6, 2017 11:01
# Topic analysis and extraction using gensim (uncommented)
import pandas as pd
import xlrd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
# Load the workbook and group every chat message by conversation key,
# keeping the virtual assistant's lines ('bot') apart from the
# customer's ('cust').
excel_object = xlrd.open_workbook('sample.xlsx')
key_wise_feedback = defaultdict(lambda: defaultdict(str))
key = ''
chat_logs = excel_object.sheet_by_name('chatLogs')
user_feedback = excel_object.sheet_by_name('UserFeedback')

# Fetch both columns once instead of rebuilding column 0 for every row.
conversation_cells = chat_logs.col(0)
message_cells = chat_logs.col(4)

for i, row in enumerate(message_cells):
    # The message text must survive until it is stored; it used to be reset
    # to '' right after being read, so every transcript stayed empty.
    text = row.value.strip()

    # The key is taken from between the first pair of single quotes in the
    # cell's string form.
    # NOTE(review): this depends on how xlrd prints a cell; reading the
    # cell's .value directly is probably what was meant -- confirm.
    key = str(str(conversation_cells[i]).split("'")[1])

    # Lines prefixed with 'VA: ' belong to the bot, everything else to the
    # customer.
    speaker = 'bot' if text.startswith('VA: ') else 'cust'

    # Separate consecutive messages with a space so the last word of one
    # message is not glued to the first word of the next.
    transcript = key_wise_feedback[key][speaker]
    if transcript:
        key_wise_feedback[key][speaker] = transcript + ' ' + text
    else:
        key_wise_feedback[key][speaker] = text
# Text-normalisation resources shared by clean(); built once at load time.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalise a raw document into a space-separated string of words.

    Steps, in order:
      1. lower-case the text and split it on whitespace, dropping every
         token that is in ``stop``;
      2. delete every character that is in ``exclude``;
      3. run each remaining whitespace-separated word through
         ``lemma.lemmatize``.

    Returns the processed words joined by single spaces.
    """
    kept_tokens = []
    for token in doc.lower().split():
        if token in stop:
            continue
        kept_tokens.append(token)
    without_stopwords = " ".join(kept_tokens)

    kept_chars = [char for char in without_stopwords if char not in exclude]
    without_punctuation = ''.join(kept_chars)

    lemmas = [lemma.lemmatize(word) for word in without_punctuation.split()]
    return " ".join(lemmas)
# Tokenise every conversation: each side's transcript is cleaned and split,
# and stored as a one-element list holding that transcript's token list
# (i.e. one document per speaker per conversation).
for feedback in key_wise_feedback.values():
    feedback['bot'] = [clean(feedback['bot']).split()]
    feedback['cust'] = [clean(feedback['cust']).split()]

# A body-less `for i in key_wise_feedback:` used to sit in front of these
# imports; it did no work and has been removed.
import gensim
from gensim import corpora
def return_corpus(text):
    """Build a gensim ``Dictionary`` from ``text`` and return it.

    ``text`` is passed to ``corpora.Dictionary`` unchanged. Despite the
    function's name, the returned object is the dictionary itself.
    """
    dictionary = corpora.Dictionary(text)
    return dictionary
# for i in key_wise_feedback:
# key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot'])
# key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust'])
# Pool the tokens of every conversation into one vocabulary per speaker.
bot_tokens = []
cust_tokens = []
for feedback in key_wise_feedback.values():
    for words in feedback['bot']:
        # Drop only the stand-alone token 'va' (what the 'VA: ' speaker
        # prefix becomes once lower-cased and stripped of punctuation).
        # A substring replace would also eat the 'va' inside real words
        # such as 'available'.
        bot_tokens.extend(word for word in words if word != u'va')
    for words in feedback['cust']:
        cust_tokens.extend(words)

# Join with a space between *all* tokens so the last word of one
# conversation is never fused with the first word of the next.
corpus_bot = u' '.join(bot_tokens)
corpus_cust = u' '.join(cust_tokens)

# Each dictionary is built from a single document holding every pooled token.
dictionary_bot = corpora.Dictionary([corpus_bot.split()])
dictionary_cust = corpora.Dictionary([corpus_cust.split()])
# def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust):
# # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
# doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot]
# doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust]
# return doc_term_matrix_bot,doc_term_matrix_cust
# for i in key_wise_feedback:
# key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust'])
# Turn each conversation's token lists into bag-of-words vectors, using the
# bot vocabulary for bot documents and the customer vocabulary for customer
# documents.
for feedback in key_wise_feedback.values():
    bot_vectors = []
    for tokens in feedback['bot']:
        bot_vectors.append(dictionary_bot.doc2bow(tokens))
    feedback['bot_matrix'] = bot_vectors

    cust_vectors = []
    for tokens in feedback['cust']:
        cust_vectors.append(dictionary_cust.doc2bow(tokens))
    feedback['cust_matrix'] = cust_vectors
# for i in key_wise_feedback:
# print(key_wise_feedback[i]['bot_matrix'])
# break
# Fit one two-topic LDA model per speaker for every conversation
# (50 passes each), bot first and then customer.
Lda = gensim.models.ldamodel.LdaModel
for feedback in key_wise_feedback.values():
    speaker_inputs = (
        ('bot_matrix', 'bot_lda', dictionary_bot),
        ('cust_matrix', 'cust_lda', dictionary_cust),
    )
    for matrix_key, model_key, vocabulary in speaker_inputs:
        feedback[model_key] = Lda(
            feedback[matrix_key],
            num_topics=2,
            id2word=vocabulary,
            passes=50
        )
# For every conversation and speaker, take the first entry returned by
# print_topics(num_topics=2, num_words=3), pull out every double-quoted
# word from its description string, and store them space-separated.
for feedback in key_wise_feedback.values():
    for model_key, words_key in (('bot_lda', 'bot_words'),
                                 ('cust_lda', 'cust_words')):
        topics = feedback[model_key].print_topics(num_topics=2, num_words=3)
        first_topic_text = topics[0][1]
        quoted_words = re.findall('"([^"]*)"', first_topic_text)
        feedback[words_key] = str(" ".join(quoted_words))
#print(re.findall('"([^"]*)"',key_wise_feedback[i]['bot_lda'].print_topics(num_topics = 2 , num_words = 3)[1]))
#print_topics(num_topics=3, num_words=3)
