Created
January 6, 2017 11:01
-
-
Save anirudhr95/caa437ae6be2c19360a3818fe1105200 to your computer and use it in GitHub Desktop.
Topic analysis and extraction using gensim (uncommented)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import xlrd | |
from collections import defaultdict | |
from nltk.corpus import stopwords | |
from nltk.stem.wordnet import WordNetLemmatizer | |
import string | |
import re | |
# Load the workbook that holds the chat transcripts and the user feedback.
# NOTE(review): xlrd 2.x only reads legacy .xls files; opening an .xlsx
# workbook needs xlrd < 2.0 (or a different reader) -- confirm the pinned
# version before running.
excel_object = xlrd.open_workbook('sample.xlsx')

# Accumulated text per conversation key and per speaker:
#   key_wise_feedback[key]['bot']  -> messages that start with 'VA: '
#   key_wise_feedback[key]['cust'] -> every other message
# Both levels are defaultdicts, so unseen keys/speakers start out as ''.
key_wise_feedback = defaultdict(lambda: defaultdict(str))
key = ''
chat_logs = excel_object.sheet_by_name('chatLogs')
user_feedback = excel_object.sheet_by_name('UserFeedback')

# Read both columns once instead of re-reading column 0 for every row.
key_cells = chat_logs.col(0)
text_cells = chat_logs.col(4)

for i, row in enumerate(text_cells):
    if i == 0:
        # Row 0 is skipped (presumably a header row -- confirm).
        continue

    text = row.value.strip()

    # Use the cell's value directly rather than parsing the Cell repr
    # ("text:'...'"): the repr has no quotes for non-text cells and switches
    # quoting style when the text itself contains an apostrophe.
    row_key = str(key_cells[i].value)
    if row_key != '':
        # A non-empty key cell starts a new conversation; the text on that
        # same row is deliberately discarded, as in the original script.
        key = row_key
        text = ''

    speaker = 'bot' if text.startswith('VA: ') else 'cust'
    previous = key_wise_feedback[key][speaker]
    if previous and text:
        # Separate consecutive messages so their boundary words do not fuse
        # into a single token ("thereVA:") during later tokenisation.
        key_wise_feedback[key][speaker] = previous + ' ' + text
    else:
        key_wise_feedback[key][speaker] = previous + text
# Shared text-normalisation resources, built once at module load.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalise a raw text string and return it as one space-joined string.

    Steps, in order:
      1. lower-case the text, split on whitespace and drop tokens in ``stop``;
      2. delete every character found in ``exclude`` (ASCII punctuation);
      3. re-split and lemmatize each remaining token with ``lemma``.

    Stop words are matched before punctuation is removed, so a token such as
    "the," is not recognised as a stop word and survives as "the".
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    kept_chars = [ch for ch in " ".join(kept_tokens) if ch not in exclude]
    lemmas = [lemma.lemmatize(token) for token in ''.join(kept_chars).split()]
    return " ".join(lemmas)
# Replace each conversation's raw text with a single-document token list of
# the form [[token, token, ...]], first for the bot side, then the customer.
for convo in key_wise_feedback.values():
    for speaker in ('bot', 'cust'):
        convo[speaker] = [clean(convo[speaker]).split()]

# Preview the customer tokens of the first conversation only.
for first_key in key_wise_feedback:
    print(first_key, key_wise_feedback[first_key]['cust'])
    break
import gensim | |
from gensim import corpora | |
def return_corpus(text):
    """Build and return a gensim ``corpora.Dictionary`` from *text*.

    *text* is handed to ``corpora.Dictionary`` unchanged. Despite the
    function's name, the result is the Dictionary object itself, not a
    bag-of-words corpus.
    """
    token_dictionary = corpora.Dictionary(text)
    return token_dictionary
# for i in key_wise_feedback: | |
# key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot']) | |
# key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust']) | |
# Pool the tokens of every conversation, one list per speaker, and build one
# gensim Dictionary per speaker from the pooled tokens.
#
# Working on token lists (rather than gluing strings together) fixes two
# problems of the earlier string-based version:
#   * conversations were concatenated without a separator, so the last word
#     of one conversation fused with the first word of the next and neither
#     real word reached the dictionary;
#   * u'va' was removed with a substring replace, which also cut "va" out of
#     ordinary words ("available" -> "ailable"); those words then went missing
#     from dictionary_bot and were silently dropped by doc2bow().
bot_tokens = []
cust_tokens = []
for i in key_wise_feedback:
    for words in key_wise_feedback[i]['bot']:
        # Drop only the stand-alone u'va' token (what clean() leaves of the
        # 'VA: ' message prefix), never parts of other words.
        bot_tokens.extend(word for word in words if word != u'va')
    for words in key_wise_feedback[i]['cust']:
        cust_tokens.extend(words)

# Kept as space-joined strings for any code that still reads these names.
corpus_bot = u' '.join(bot_tokens)
corpus_cust = u' '.join(cust_tokens)

# Each dictionary is built from a single pooled "document", as before.
dictionary_bot = corpora.Dictionary([bot_tokens])
dictionary_cust = corpora.Dictionary([cust_tokens])
# def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust): | |
# # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above. | |
# doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot] | |
# doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust] | |
# return doc_term_matrix_bot,doc_term_matrix_cust | |
# for i in key_wise_feedback: | |
# key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust']) | |
# Turn each conversation's token lists into bag-of-words vectors, using the
# bot dictionary for the bot side and the customer dictionary for the
# customer side.
for convo in key_wise_feedback.values():
    bot_docs = convo['bot']
    convo['bot_matrix'] = [dictionary_bot.doc2bow(tokens) for tokens in bot_docs]
    cust_docs = convo['cust']
    convo['cust_matrix'] = [dictionary_cust.doc2bow(tokens) for tokens in cust_docs]
# for i in key_wise_feedback: | |
# print(key_wise_feedback[i]['bot_matrix']) | |
# break | |
# Short alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train one 2-topic model per conversation and per speaker (bot first, then
# customer), each over that conversation's own bag-of-words matrix.
for convo in key_wise_feedback.values():
    for matrix_key, model_key, vocabulary in (
        ('bot_matrix', 'bot_lda', dictionary_bot),
        ('cust_matrix', 'cust_lda', dictionary_cust),
    ):
        convo[model_key] = Lda(
            convo[matrix_key],
            num_topics=2,
            id2word=vocabulary,
            passes=50,
        )
# Pull the quoted terms out of the first entry returned by print_topics()
# for each speaker's model and store them as one space-joined string.
# NOTE(review): the loop breaks after the first conversation, so only that
# conversation receives 'bot_words' / 'cust_words' -- confirm this is intended
# and not a debugging leftover.
for convo in key_wise_feedback.values():
    for model_key, words_key in (('bot_lda', 'bot_words'), ('cust_lda', 'cust_words')):
        topics = convo[model_key].print_topics(num_topics=2, num_words=3)
        # Element [0][1] is the first topic's description string; its terms
        # appear inside double quotes.
        quoted_terms = re.findall('"([^"]*)"', topics[0][1])
        convo[words_key] = str(" ".join(quoted_terms))
    print(convo['bot_words'])
    print(convo['cust_words'])
    break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment