Created
May 2, 2016 06:19
-
-
Save shkesar/be9009d83b82548edd16cd5a7d300457 to your computer and use it in GitHub Desktop.
NLP in Python to analyze the sentiment of mail in a mailbox
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One-time setup: fetch the NLTK stop-word corpus used by the analysis script.
import sys

import nltk

# nltk.download() signals failure through its return value rather than by
# raising, so check it explicitly; otherwise a failed download goes unnoticed
# until the analysis script cannot find the corpus.
if not nltk.download('stopwords'):
    sys.exit("Could not download the NLTK 'stopwords' corpus")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the dependencies, fetch the NLTK data, then run the analysis.
# Abort at the first failing step so the analysis never runs against a
# half-installed environment.
set -e

echo "Installing dependencies"
brew install python
pip install ipython
pip install pandas
pip install nltk
# sent_analysis.py imports textblob, so it has to be installed as well.
pip install textblob

# printf interprets \n consistently; plain echo prints it literally in bash.
printf '\n\nConfiguring ...\n'
ipython --quiet config.py
ipython sent_analysis.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mailbox import mbox | |
import pandas as pd | |
def store_content(message, body=None):
    """Return the module-level ``df`` with one row added for *message*.

    Args:
        message: Email message object supporting header lookup by name
            (``message['subject']``), ``len()``, ``get_payload`` and the
            ``epilogue`` attribute.
        body: Already-decoded body to store. When falsy, the decoded
            payload of *message* itself is used instead.

    Returns:
        A new DataFrame holding the rows of ``df`` plus one row for
        *message*. ``df`` itself is not modified, so the caller must rebind
        it. When ``len(message)`` is zero nothing is added and ``df`` is
        returned unchanged.
    """
    if not body:
        body = message.get_payload(decode=True)

    if not len(message):
        # Nothing to record. Return the frame as-is (never None) so that a
        # caller doing ``df = store_content(...)`` keeps a usable DataFrame.
        return df

    contents = {
        "subject": message['subject'] or "",
        "body": body,
        "from": message['from'],
        "to": message['to'],
        "date": message['date'],
        "labels": message['X-Gmail-Labels'],
        "epilogue": message.epilogue,
    }
    # DataFrame.append was removed in pandas 2.0; concat gives the same
    # result on both old and new pandas versions.
    return pd.concat([df, pd.DataFrame([contents])], ignore_index=True)
# Create an empty DataFrame with the relevant columns
df = pd.DataFrame(
    columns=("subject", "body", "from", "to", "date", "labels", "epilogue"))

# Import your downloaded mbox file
box = mbox('~/Desktop/Sent.mbox/mbox')

# Messages that raised while being processed, kept for later inspection.
fails = []
for message in box:
    updated = None
    try:
        if message.get_content_type() == 'text/plain':
            updated = store_content(message)
        elif message.is_multipart():
            # Plaintext from multipart messages: take the first top-level
            # text/plain part as the body.
            for part in message.get_payload():
                if part.get_content_type() == 'text/plain':
                    updated = store_content(
                        message, part.get_payload(decode=True))
                    break
    except Exception:
        # Best effort: remember the message and carry on. Catching Exception
        # rather than everything lets Ctrl-C still stop a long import.
        fails.append(message)
        continue

    # Only rebind df when a frame actually came back, so one skipped message
    # can never replace the accumulated data with None.
    if updated is not None:
        df = updated
# Subject-line analysis: collect the non-stop-words used in subjects and score
# every subject's sentiment polarity with TextBlob.
from collections import Counter

from nltk.corpus import stopwords
from textblob import TextBlob


def _to_text(value):
    """Return *value* as text, dropping any bytes that are not valid ASCII."""
    if isinstance(value, bytes):
        # Python 2 str / Python 3 bytes: same result as the former
        # unicode(value, errors='ignore').
        return value.decode('ascii', 'ignore')
    return u'%s' % (value,)


subjects = [_to_text(subject) for subject in df['subject']]

# str.join is linear in the total length and yields '' for an empty frame,
# whereas summing a Series of strings is quadratic and yields the int 0.
subject_word_bag = ' '.join(subject.lower() for subject in subjects)

# A set gives constant-time membership tests in the filter below.
stops = set(stopwords.words('english')) | {'re:', 'fwd:', '-'}
subject_words = [word for word in subject_word_bag.split()
                 if word not in stops]

df['feels'] = df['subject'].apply(
    lambda s: TextBlob(_to_text(s)).sentiment.polarity)

# Let pandas add up the scores instead of shadowing the builtin ``sum``.
total_polarity = df['feels'].sum()

# df starts empty and gains exactly one row per stored message, so its length
# is the mail count; the former "- 1" under-reported it by one.
print("Total mails - " + str(len(df)))
print(total_polarity)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment