Created
October 11, 2018 17:45
-
-
Save quetzaluz/7f5b2f9f2cc089b321756da629e85220 to your computer and use it in GitHub Desktop.
Example of predicting an NPS score from feedback comments, and a means of bucketizing customer sentiment.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import RegexpTokenizer | |
import time | |
# Wall-clock start, used for the runtime report printed at the end of the run.
start_time = time.time()
print('Starting run of analytic model...')

# The dataset path is resolved relative to this script's directory rather
# than the current working directory.
data = pd.read_csv(
    os.path.join(os.path.dirname(__file__), '../datasets/medallia.csv')
)
def bucketize_score(x):
    """Map a numeric NPS score to a sentiment bucket label.

    Scores of 5 or lower are 'Negative', scores equal to 6 or 7 are
    'Neutral', and every other value is 'Positive'.
    """
    if x <= 5:
        return 'Negative'
    return 'Neutral' if x in (6, 7) else 'Positive'
# Discard rows with a missing score or comment before labelling. A NaN score
# fails both comparisons in bucketize_score and would be silently labelled
# 'Positive'; a NaN comment is a float and cannot be tokenized.
data = data.dropna(subset=['NPS_score', 'NPS_comment']).copy()

# Replace the numeric score with its Negative / Neutral / Positive bucket.
data['NPS_score'] = data['NPS_score'].apply(bucketize_score)

# Hold out 15% of the rows for evaluation. No random_state is passed, so the
# split (and therefore the reported accuracy) varies from run to run.
train, test = train_test_split(data, test_size=0.15)

# Filled later with (tokens, label) pairs built from the training rows.
comments = []
stopwords_set = set(stopwords.words('english'))  # You need to download this via nltk.download(), see readme
tokenizer = RegexpTokenizer(r'\w+')  # Strip punctuation and special characters from words
def tokenize_words(strng):
    """Turn a raw comment into a list of cleaned, lowercase word tokens.

    Tokens come from the module-level ``tokenizer``. A token is kept only if
    it is at least three characters long, does not contain 'http', and is
    not in ``stopwords_set``.

    Args:
        strng: The comment text. ``None`` or a float (pandas represents an
            empty CSV cell as float NaN) is treated as an empty comment.

    Returns:
        A list of lowercase tokens; empty when there is no usable text.
    """
    # Missing comments are not strings; return no tokens rather than
    # failing with AttributeError further down.
    if strng is None or isinstance(strng, float):
        return []

    # NOTE: the previous version padded '!', '?' and '.' with a leading space
    # before tokenizing. The tokenizer pattern r'\w+' never matches those
    # characters, so the padding had no effect on the tokens and was removed.
    lowered = (
        token.lower()
        for token in tokenizer.tokenize(strng)
        if len(token) >= 3
    )
    return [
        word
        for word in lowered
        if 'http' not in word and word not in stopwords_set
    ]
# Pair each training comment's tokens with its bucketized label.
for text, label in zip(train['NPS_comment'], train['NPS_score']):
    comments.append((tokenize_words(text), label))

# Held-out comment text, grouped by the true bucket of each row.
test_pos = test.loc[test['NPS_score'] == 'Positive', 'NPS_comment']
test_neu = test.loc[test['NPS_score'] == 'Neutral', 'NPS_comment']
test_neg = test.loc[test['NPS_score'] == 'Negative', 'NPS_comment']
# Extracting word features
def get_words_in_comments(comment):
    """Flatten (words, sentiment) pairs into one list of words.

    The sentiment half of each pair is ignored; words keep their original
    order and duplicates are retained.
    """
    return [word for words, _sentiment in comment for word in words]
def get_word_features(wordlist):
    """Return the keys of an nltk.FreqDist built from wordlist."""
    return nltk.FreqDist(wordlist).keys()
# Feature vocabulary: the words collected from the tokenized training comments.
w_features = get_word_features(
    get_words_in_comments(comments)
)
def extract_features(document):
    """Build the feature dict for one tokenized document.

    For every word in the module-level ``w_features`` the result holds a
    'contains(<word>)' key whose value says whether the word occurs in
    document.
    """
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in w_features}
# Train a Naive Bayes classifier on the (tokens, label) training pairs.
training_set = nltk.classify.apply_features(extract_features, comments)
classifier = nltk.NaiveBayesClassifier.train(training_set)


def _count_correct(bucket_comments, expected_label):
    """Count the comments in bucket_comments that the classifier labels expected_label."""
    hits = 0
    for comment in bucket_comments:
        predicted = classifier.classify(extract_features(tokenize_words(comment)))
        if predicted == expected_label:
            hits += 1
    return hits


def _percentage(hits, total):
    """Return hits / total as a percentage, or 0.0 when total is zero."""
    # A random split can leave a bucket empty; report 0.0 instead of
    # raising ZeroDivisionError.
    if not total:
        return 0.0
    return (float(hits) / float(total)) * 100


# Correct predictions per true bucket.
neg_cnt = _count_correct(test_neg, 'Negative')
neu_cnt = _count_correct(test_neu, 'Neutral')
pos_cnt = _count_correct(test_pos, 'Positive')
all_cnt = neg_cnt + neu_cnt + pos_cnt

# The overall denominator must cover all three buckets. Leaving out the
# Neutral comments while counting Neutral hits overstated the accuracy.
all_total = len(test_neg) + len(test_neu) + len(test_pos)

neg_pct = _percentage(neg_cnt, len(test_neg))
neu_pct = _percentage(neu_cnt, len(test_neu))
pos_pct = _percentage(pos_cnt, len(test_pos))
all_pct = _percentage(all_cnt, all_total)

print('Accuracy of model predicting bucketized NPS Score from comments:')
print('[Negative]: %s/%s (%s%%)' % (neg_cnt, len(test_neg), neg_pct))
print('[Neutral]: %s/%s (%s%%)' % (neu_cnt, len(test_neu), neu_pct))
print('[Positive]: %s/%s (%s%%)' % (pos_cnt, len(test_pos), pos_pct))
print('[Overall]: %s/%s (%s%%)' % (all_cnt, all_total, all_pct))
print('Ending run of analytic model.')
print('Runtime was %s seconds' % (time.time() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment