Last active
November 14, 2015 05:51
-
-
Save soodoku/22e4cff2eb6a05be3c0d to your computer and use it in GitHub Desktop.
Basic sentiment analysis with AFINN or custom word database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Basic Sentiment Analysis

Builds on:
https://finnaarupnielsen.wordpress.com/2011/06/20/simplest-sentiment-analysis-in-python-with-af/

Utilizes AFINN or a custom sentiment word database.
Example snippets at the end are from:
https://code.google.com/p/sentana/wiki/ExampleSentiments
'''
import io
import math
import os
import re
# AFINN-111 is as of June 2011 the most recent version of AFINN. Replace with newer.
# The path is resolved against the absolute directory of this module: when
# __file__ has no directory part, dirname() returns '' and plain string
# concatenation would silently produce the root path '/AFINN/AFINN-111.txt'.
filenameAFINN = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'AFINN', 'AFINN-111.txt')

# Word -> integer score, read from a tab-separated "word<TAB>score" file.
# Built with an explicit loop instead of a tuple-parameter lambda, which is
# a syntax error on Python 3, and inside a `with` block so the file handle
# is always closed.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
afinn = {}
with io.open(filenameAFINN, encoding='utf-8') as afinn_file:
    for ws in afinn_file:
        ws = ws.strip()
        if not ws:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        w, s = ws.split('\t')
        afinn[w] = int(s)
# Word splitter pattern: any run of non-word characters separates tokens.
pattern_split = re.compile(r"\W+")


def sentiment_AFINN(text, lexicon=None):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence.

    The text is lower-cased and split on runs of non-word characters. Every
    token is looked up in ``lexicon`` (tokens that are not present score 0)
    and the summed score is divided by sqrt(N), N being the token count.

    text    -- the string to score.
    lexicon -- optional mapping of word -> integer score. When omitted, the
               module-level ``afinn`` dictionary is used.

    Returns 0.0 when the text contains no tokens at all.
    """
    if lexicon is None:
        lexicon = afinn
    # re.split() yields empty strings when the text starts or ends with a
    # separator (for instance a trailing "!"). Drop them so that punctuation
    # does not inflate N and dilute the score.
    words = [word for word in pattern_split.split(text.lower()) if word]
    if not words:
        return 0.0
    # A real list is required here: on Python 3 map() returns an iterator,
    # which is always truthy and does not support len().
    sentiments = [lexicon.get(word, 0) for word in words]
    # How should you weight the individual word sentiments?
    # You could do N, sqrt(N) or 1 for example. Here I use sqrt(N)
    return float(sum(sentiments)) / math.sqrt(len(sentiments))
# Using custom WORDDB
# Resolved against the absolute module directory so that an empty dirname()
# cannot turn the location into the root path '/sentimentworddb.txt'.
filenameWORDDB = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'sentimentworddb.txt')

# Entry -> integer score, read from a tab-separated "entry<TAB>score" file.
# Entries ending in '*' are treated as prefix wildcards by the regex below.
# Built with an explicit loop (tuple-parameter lambdas are a syntax error on
# Python 3) inside a `with` block so the file handle is always closed.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
worddb = {}
with io.open(filenameWORDDB, encoding='utf-8') as worddb_file:
    for ws in worddb_file:
        ws = ws.strip()
        if not ws:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        w, s = ws.split('\t')
        # A stored score of 0 is replaced by -1, exactly as before.
        # NOTE(review): presumably the database encodes "negative" as 0 --
        # confirm against the file format.
        worddb[w] = int(s) or -1


def _worddb_alternation(entries):
    """Return a regex alternation of the escaped entries, longest first."""
    if not entries:
        # '(?!)' never matches. An empty alternation would instead match the
        # empty string everywhere and shadow the other branch of the regex.
        return r'(?!)'
    # Longest-first (ties broken alphabetically) makes the chosen entry
    # deterministic when one entry is a prefix of another.
    ordered = sorted(entries, key=lambda entry: (-len(entry), entry))
    return '|'.join(re.escape(entry) for entry in ordered)


# Strip only the trailing '*': sentiment lookups re-append exactly one '*'
# to the matched stem, so interior asterisks must be left in place.
_worddb_stems = [w[:-1] for w in worddb if w.endswith('*') and w[:-1]]
_worddb_exact = [w for w in worddb if not w.endswith('*')]

# Group 1: wildcard stem, group 2: rest of that token, group 3: exact word.
# Changes relative to the previous pattern (group numbering is unchanged):
#   * raw strings, so '\s' and friends are not invalid string escapes;
#   * '(?<!\w)' anchors both branches at a word start, so a stem such as
#     'hat' no longer fires inside 'what';
#   * an exact word may also be terminated by the end of the text.
re_str = r"(?<!\w)(%s)([^ ]*)" % _worddb_alternation(_worddb_stems)
re_str += r"|(?<!\w)(%s)(?:[\s.,;?!]|$)" % _worddb_alternation(_worddb_exact)
re_worddb = re.compile(re_str)
def sentiment_WORDDB(text):
    """
    Returns the summed sentiment score of all word-database matches in text.
    Positive values are positive valence, negative value are negative valence.

    The text is scanned with the module-level ``re_worddb`` pattern. A match
    on a wildcard stem (regex group 1) is scored through its ``stem*`` entry
    in ``worddb``; a match on an exact word (regex group 3) is scored through
    the word itself. The result is the plain integer sum, 0 if nothing
    matched.
    """
    total = 0
    for match in re_worddb.finditer(text):
        stem = match.group(1)
        word = match.group(3)
        if stem:
            # Wildcard entries are stored with their trailing '*'.
            total += worddb[stem + '*']
        elif word:
            total += worddb[word]
    return total
if __name__ == "__main__":
    # Demo: score the same sample snippets with both scorers and print the
    # raw results, one per line, under a header for each scorer.
    samples = (
        "ibm is not going at cloud alone. We have an ecosystem of partners helping us.",
        "I have an iPhone, but I am not really feeling very happy about the iPhone.",
        "I love Macintosh!",
        "I hate microsoft excel. I'm about to punch this computer!!!! ",
        "I really love my iPhone, but the reception here is very bad.",
        "I'm afraid, I cannot corruption",
    )
    suites = (
        ("========== AFINN Test cases ==========", sentiment_AFINN),
        ("========== WORDDB Test cases ==========", sentiment_WORDDB),
    )
    for header, scorer in suites:
        print(header)
        for sample in samples:
            print(scorer(sample))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment