Created October 11, 2018 17:45
Example of predicting a bucketized NPS score from feedback comments; it also serves as a way to bucketize customer sentiment.
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import time
# Wall-clock start; the elapsed runtime is reported at the end of the script.
start_time = time.time()
print('Starting run of analytic model...')

# Load the feedback dataset, resolved relative to this script's directory.
dataset_path = os.path.join(os.path.dirname(__file__), '../datasets/medallia.csv')
data = pd.read_csv(dataset_path)
def bucketize_score(x):
    """Map a numeric NPS score to a sentiment bucket label.

    Buckets are contiguous ranges so that every score falls in exactly one:
    scores up to 5 are 'Negative', scores above 5 and up to 7 are 'Neutral',
    and everything higher is 'Positive'.

    Args:
        x: The numeric score to bucketize.

    Returns:
        One of 'Negative', 'Neutral' or 'Positive'.
    """
    if x <= 5:
        return 'Negative'
    # Range check rather than equality with 6 and 7, so that fractional
    # scores such as 5.5 or 6.5 are not misfiled as 'Positive'.
    if x <= 7:
        return 'Neutral'
    # NOTE(review): a NaN score fails both comparisons and lands here too;
    # confirm whether missing scores should be filtered out upstream.
    return 'Positive'
# Replace each NPS score with its sentiment bucket label.
data['NPS_score'] = data['NPS_score'].apply(bucketize_score)

# Hold out 15% of the rows for evaluating the classifier.
train, test = train_test_split(data, test_size=0.15)

# Filled further down with (tokens, sentiment) pairs from the training rows.
comments = []

# English stopwords; the NLTK stopwords corpus must be downloaded first (see the readme).
stopwords_set = set(stopwords.words('english'))

# Matches runs of word characters, which strips punctuation and special characters.
tokenizer = RegexpTokenizer(r'\w+')
def tokenize_words(strng):
    """Split a comment into lowercase, filtered word tokens.

    Tokens shorter than three characters, tokens containing 'http', and
    tokens present in the module-level ``stopwords_set`` are dropped.
    Tokenization is delegated to the module-level ``tokenizer``.

    Args:
        strng: The raw comment text.

    Returns:
        A list of cleaned tokens; empty when ``strng`` is not a string.
    """
    # A non-string value (for example the NaN that pandas yields for an empty
    # CSV cell) has no text to tokenize; return no tokens instead of failing
    # on ``.replace``.
    if not isinstance(strng, str):
        return []

    # Pad sentence punctuation with a space so it separates from the
    # preceding word before tokenizing.
    strng = strng.replace('!', ' !').replace('?', ' ?').replace('.', ' .')
    words_filtered = [e.lower() for e in tokenizer.tokenize(strng) if len(e) >= 3]
    words_cleaned = [
        word for word in words_filtered
        if 'http' not in word
        and word not in stopwords_set
    ]
    return words_cleaned
# Build the training examples: one (tokens, sentiment) pair per training row.
comments.extend(
    (tokenize_words(text), label)
    for text, label in zip(train['NPS_comment'], train['NPS_score'])
)

# Split the held-out comments by their true sentiment bucket.
test_pos = test.loc[test['NPS_score'] == 'Positive', 'NPS_comment']
test_neu = test.loc[test['NPS_score'] == 'Neutral', 'NPS_comment']
test_neg = test.loc[test['NPS_score'] == 'Negative', 'NPS_comment']
# Extracting word features
def get_words_in_comments(comment):
    """Flatten labelled comments into one list of words.

    Args:
        comment: An iterable of ``(words, sentiment)`` pairs, where ``words``
            is an iterable of tokens. The sentiment is ignored.

    Returns:
        A list containing every token from every pair, in order, with
        duplicates preserved.
    """
    all_words = []
    for words, _sentiment in comment:
        all_words.extend(words)
    return all_words
def get_word_features(wordlist):
    """Return the distinct words in ``wordlist``.

    Only the set of distinct words is needed, so no frequency counts are
    computed. The result is a concrete list rather than a live view.

    Args:
        wordlist: An iterable of hashable tokens.

    Returns:
        A list of the unique tokens, in the order each was first seen.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(wordlist))
# Vocabulary used as classifier features: the words collected from all training comments.
w_features = get_word_features(get_words_in_comments(comments))
def extract_features(document):
    """Build the feature dict for one tokenized comment.

    For every word in the module-level ``w_features`` vocabulary, the result
    has a ``'contains(<word>)'`` key whose value is True when that word
    occurs in ``document`` and False otherwise.
    """
    present = set(document)
    return {'contains(%s)' % token: (token in present) for token in w_features}
# Train a Naive Bayes classifier on the featurized training comments.
training_set = nltk.classify.apply_features(extract_features, comments)
classifier = nltk.NaiveBayesClassifier.train(training_set)


def _count_correct(test_comments, expected_label):
    """Count the comments in *test_comments* that the classifier labels *expected_label*."""
    return sum(
        1
        for comment in test_comments
        if classifier.classify(extract_features(tokenize_words(comment))) == expected_label
    )


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    if not whole:
        return 0.0
    return (float(part) / float(whole)) * 100


# Correct predictions per true bucket.
neg_cnt = _count_correct(test_neg, 'Negative')
neu_cnt = _count_correct(test_neu, 'Neutral')
pos_cnt = _count_correct(test_pos, 'Positive')
all_cnt = neg_cnt + neu_cnt + pos_cnt

# The overall denominator must cover all three buckets, because all_cnt
# includes Neutral hits as well.
all_total = len(test_neg) + len(test_neu) + len(test_pos)

neg_pct = _percent(neg_cnt, len(test_neg))
neu_pct = _percent(neu_cnt, len(test_neu))
pos_pct = _percent(pos_cnt, len(test_pos))
all_pct = _percent(all_cnt, all_total)

print('Accuracy of model predicting bucketized NPS Score from comments:')
print('[Negative]: %s/%s (%s%%)' % (neg_cnt, len(test_neg), neg_pct))
print('[Neutral]: %s/%s (%s%%)' % (neu_cnt, len(test_neu), neu_pct))
print('[Positive]: %s/%s (%s%%)' % (pos_cnt, len(test_pos), pos_pct))
print('[Overall]: %s/%s (%s%%)' % (all_cnt, all_total, all_pct))
print('Ending run of analytic model.')
print('Runtime was %s seconds' % (time.time() - start_time))
