Skip to content

Instantly share code, notes, and snippets.

@anirudhr95
Created January 6, 2017 11:01
Show Gist options
  • Save anirudhr95/caa437ae6be2c19360a3818fe1105200 to your computer and use it in GitHub Desktop.
Save anirudhr95/caa437ae6be2c19360a3818fe1105200 to your computer and use it in GitHub Desktop.
Topic analysis and extraction using gensim (uncommented)
import pandas as pd
import xlrd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
# Load the workbook holding the chat transcripts and the user feedback sheet.
# NOTE(review): xlrd 2.0+ only reads legacy .xls files, so opening
# 'sample.xlsx' presumably relies on an older xlrd release -- confirm.
excel_object = xlrd.open_workbook('sample.xlsx')

# conversation key -> {'bot': transcript text, 'cust': transcript text}
key_wise_feedback = defaultdict(lambda: defaultdict(str))
key = ''
chat_logs = excel_object.sheet_by_name('chatLogs')
user_feedback = excel_object.sheet_by_name('UserFeedback')

# Read each column once; calling ``col()`` per row rebuilds the whole column
# on every iteration.
key_cells = chat_logs.col(0)
text_cells = chat_logs.col(4)

# Row 0 is skipped (presumably a header row -- verify against the sheet).
for key_cell, text_cell in zip(key_cells[1:], text_cells[1:]):
    text = str(text_cell.value).strip()

    # Use the cell's value directly instead of parsing ``str(cell)``; the
    # repr-based parsing breaks on numeric cells and on values that contain
    # an apostrophe.
    cell_key = str(key_cell.value)
    if cell_key != '':
        # A non-empty key cell starts a new conversation. As in the original
        # script, the text on that row is discarded.
        key = cell_key
        text = ''

    # Lines prefixed with 'VA: ' are stored as bot text, the rest as customer.
    speaker = 'bot' if text.startswith('VA: ') else 'cust'
    transcript = key_wise_feedback[key][speaker]
    # Separate messages with a space so the last word of one message does not
    # fuse with the first word of the next; ``strip`` keeps empty pieces from
    # adding stray whitespace.
    key_wise_feedback[key][speaker] = (transcript + ' ' + text).strip()
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Return *doc* lower-cased, without stop words or punctuation, lemmatized.

    The text is split on whitespace. Each token is dropped if it is a stop
    word, has its punctuation characters removed, and is then lemmatized.
    The surviving tokens are joined with single spaces.

    Stop words are checked both before and after punctuation removal.
    Checking only before let tokens such as "the." or "this?" through as
    "the" / "this"; checking only after would miss contractions that the
    stop list stores with their apostrophe.
    """
    tokens = []
    for raw in doc.lower().split():
        if raw in stop:
            continue
        word = ''.join(ch for ch in raw if ch not in exclude)
        # Skip tokens that were pure punctuation or that turn out to be a
        # stop word once the punctuation is gone.
        if not word or word in stop:
            continue
        tokens.append(lemma.lemmatize(word))
    return " ".join(tokens)
# Replace each side's raw transcript with a one-element list holding its
# cleaned token list (a single "document" per speaker and conversation).
for conversation in key_wise_feedback.values():
    bot_tokens = clean(conversation['bot']).split()
    cust_tokens = clean(conversation['cust']).split()
    conversation['bot'] = [bot_tokens]
    conversation['cust'] = [cust_tokens]

# Debug peek: show the customer tokens of the first conversation only.
for conversation_key, conversation in key_wise_feedback.items():
    print(conversation_key, conversation['cust'])
    break
import gensim
from gensim import corpora
def return_corpus(text):
    """Build and return a gensim ``corpora.Dictionary`` from *text*.

    Despite the name, the result is a ``Dictionary``, not a corpus. *text* is
    handed to ``corpora.Dictionary`` unchanged (presumably an iterable of
    token lists -- confirm against the callers).
    """
    dictionary = corpora.Dictionary(text)
    return dictionary
# for i in key_wise_feedback:
# key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot'])
# key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust'])
# Gather every token of every conversation, per speaker. Tokens are collected
# in lists rather than by string concatenation so the last word of one
# conversation can never fuse with the first word of the next.
bot_tokens = []
cust_tokens = []
for conversation in key_wise_feedback.values():
    for words in conversation['bot']:
        # Drop the stand-alone 'va' token (what remains of the 'VA: ' speaker
        # prefix after cleaning). The previous substring replace also removed
        # "va" from inside real words, so those words no longer matched the
        # documents passed to doc2bow.
        bot_tokens.extend(word for word in words if word != u'va')
    for words in conversation['cust']:
        cust_tokens.extend(words)

# Kept as space-joined strings for anything that still reads these names.
corpus_bot = u' '.join(bot_tokens)
corpus_cust = u' '.join(cust_tokens)

# One vocabulary per speaker, each built from a single combined document.
dictionary_bot = corpora.Dictionary([bot_tokens])
dictionary_cust = corpora.Dictionary([cust_tokens])
# def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust):
# # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
# doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot]
# doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust]
# return doc_term_matrix_bot,doc_term_matrix_cust
# for i in key_wise_feedback:
# key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust'])
# Convert each conversation's token lists into bag-of-words vectors using the
# speaker-specific dictionaries built above.
for conversation in key_wise_feedback.values():
    bot_bows = []
    for doc in conversation['bot']:
        bot_bows.append(dictionary_bot.doc2bow(doc))
    conversation['bot_matrix'] = bot_bows

    cust_bows = []
    for doc in conversation['cust']:
        cust_bows.append(dictionary_cust.doc2bow(doc))
    conversation['cust_matrix'] = cust_bows
# for i in key_wise_feedback:
# print(key_wise_feedback[i]['bot_matrix'])
# break
Lda = gensim.models.ldamodel.LdaModel


def _top_topic_words(model):
    """Return the three top words of *model*'s first listed topic, space-joined.

    The words are pulled out of the formatted topic string returned by
    ``print_topics`` by matching everything between double quotes.
    """
    topic_text = model.print_topics(num_topics=2, num_words=3)[0][1]
    return str(" ".join(re.findall(r'"([^"]*)"', topic_text)))


# Train one two-topic LDA model per speaker for every conversation.
for conversation in key_wise_feedback.values():
    conversation['bot_lda'] = Lda(
        conversation['bot_matrix'],
        num_topics=2,
        id2word=dictionary_bot,
        passes=50,
    )
    conversation['cust_lda'] = Lda(
        conversation['cust_matrix'],
        num_topics=2,
        id2word=dictionary_cust,
        passes=50,
    )

# Extract the top words for every conversation. Previously the debug ``break``
# below sat in this same loop, so only the first conversation ever received
# 'bot_words' / 'cust_words'.
for conversation in key_wise_feedback.values():
    conversation['bot_words'] = _top_topic_words(conversation['bot_lda'])
    conversation['cust_words'] = _top_topic_words(conversation['cust_lda'])

# Debug peek: print the extracted words of the first conversation only.
for conversation in key_wise_feedback.values():
    print(conversation['bot_words'])
    print(conversation['cust_words'])
    break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment