"""
Author: Jason Liu
"""
from __future__ import division
from collections import defaultdict
from nltk.corpus import stopwords
from string import maketrans, punctuation
from itertools import imap


class Word(object):
    """High-performance word class for SentimentCollector to keep
    track of word counts.

    Attributes:
        occurrences (int): count of word occurrences
        positive (int): count of positive occurrences
        negative (int): count of negative occurrences
    """
    # __slots__ drops the per-instance __dict__ to keep memory use low
    __slots__ = ['occurrences', 'positive', 'negative']

    # class-level totals shared across all words
    total_occurrences = 0
    total_positive = 0
    total_negative = 0

    def __init__(self):
        # start from pseudo-counts so probabilities are smoothed and we
        # never divide by zero for words that were barely observed
        self.occurrences = 10
        self.positive = 5
        self.negative = 5

    def update(self, sentiment):
        """Update this word's sentiment frequencies along with the
        class-level counters.

        args:
            sentiment (str): either 'pos' or 'neg'.
        """
        self.occurrences += 1
        Word.total_occurrences += 1
        if sentiment == "pos":
            Word.total_positive += 1
            self.positive += 1
        elif sentiment == "neg":
            Word.total_negative += 1
            self.negative += 1

    @property
    def p_word(self):
        """Return the empirical probability that this word will occur.

        formula:
            P(word)
        """
        return self.occurrences / self.total_occurrences

    def sentiment_word(self, sentiment):
        """Return the empirical probability that this word is a
        certain sentiment.

        formula:
            P(sentiment | word)
        """
        if sentiment == 'pos':
            return self.positive / self.occurrences
        elif sentiment == 'neg':
            return self.negative / self.occurrences

    @classmethod
    def priori_sentiment(cls, sentiment):
        """Return the prior probability of a certain sentiment appearing.

        formula:
            P(sentiment)
        """
        if sentiment == 'pos':
            return cls.total_positive / cls.total_occurrences
        elif sentiment == 'neg':
            return cls.total_negative / cls.total_occurrences
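
# Example (illustrative): a fresh Word starts from the pseudo-counts above
# (occurrences=10, positive=5, negative=5), so sentiment_word('pos') is 0.5
# until training shifts it:
#   w = Word()
#   w.update('pos')
#   w.sentiment_word('pos')  # -> 6 / 11 ~= 0.545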


class SentimentCollector(object):
    """Naive term-sentiment learner that uses Bayes' theorem to calculate the
    sentiment of new words and predict the sentiment of new sentences.

    Attributes:
        words (defaultdict) : container for word objects
        sentiment_map (dict): maps int values to string values
    """

    def __init__(self):
        self.words = defaultdict(Word)
        self.sentiment_map = {1: 'pos', 0: 'neu', -1: 'neg'}
        # keep 'not' out of the stopword set so negation handling can see it
        self.stopwords = set(stopwords.words('english')) - {'not'}

    def train(self, dataset):
        """Accept labelled bodies of text with known sentiments and use
        them to update our collection of words.

        args:
            dataset (iterable of tuples): the first element is -1, 0, or 1,
                where -1 denotes negative, 0 neutral, and 1 positive, and
                the second element is the input string.

        notes:
            data = [(-1, "this sucks so much"),
                    ( 1, "i love you so much"),
                    ( 0, "cats and dogs")]
        """
        for sentiment, text in dataset:
            words, _ = self.tokenize_with_negation(text)
            for word in words:
                self.update(word, self.sentiment_map[int(sentiment)])

    def update(self, word, sentiment):
        """Update a single word with its observed sentiment.

        args:
            word (str): single word to update in dict.
            sentiment (str): either 'pos' or 'neg'.
        """
        self.words[word].update(sentiment)

    def p_word_sentiment(self, word, sentiment):
        """Apply Bayes' theorem to obtain the probability of a word
        occurring given a certain sentiment.

        args:
            word (str): observed word.
            sentiment (str): either 'pos' or 'neg'.

        formula:
                                  P(word) * P(sentiment | word)
            P(word | sentiment) = -----------------------------
                                          P(sentiment)
        """
        word_obj = self.words[word]
        p_sentiment_word = word_obj.sentiment_word(sentiment)
        p_sentiment = Word.priori_sentiment(sentiment)
        return word_obj.p_word * p_sentiment_word / p_sentiment
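
    # Worked example (illustrative numbers): if P(word) = 0.01,
    # P('pos' | word) = 2/3, and P('pos') = 0.6, then
    # p_word_sentiment(word, 'pos') = 0.01 * (2 / 3) / 0.6 ~= 0.011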

    @staticmethod
    def remove_punctuation(word):
        """Remove punctuation from a string.

        args:
            word (str): word
        """
        return word.translate(maketrans("", ""), punctuation)

    def tokenize_with_negation(self, text):
        """Tokenize terms while negating any term that has 'not'
        before it, e.g. 'not happy' => 'not_happy'.

        args:
            text (str): body of text to tokenize.
        """
        words = text.split(" ")
        words = [w.lower().strip() for w in words
                 if w.lower() not in self.stopwords]
        words = list(imap(self.remove_punctuation, words))
        negations = [i for i, w in enumerate(words) if w == 'not']
        negated = bool(negations)
        for index in negations:
            words[index] = ""  # essentially remove the negation word itself
            try:
                words[index + 1] = "not_" + words[index + 1]
            except IndexError:
                pass
        return words, negated
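
    # Example (illustrative, assuming NLTK's English stopwords are installed):
    #   self.tokenize_with_negation("I am not happy today")
    #   -> (['', 'not_happy', 'today'], True)
    # 'i' and 'am' are dropped as stopwords; 'not' negates the word after it.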

    def filter_words(self, words):
        """Keep only words that were actually seen during training.

        Each Word starts with 10 pseudo-occurrences, so requiring more than
        10 means the word was observed at least once while training.

        args:
            words (list): candidate words from a tokenized sentence.
        """
        clean_words = []
        for word in words:
            if word in self.words and self.words[word].occurrences > 10:
                clean_words.append(word)
        return clean_words

    def predict(self, text, sentiment):
        """Apply Bayes' theorem to obtain the probability of a sentence
        being positive or negative given a certain set of words.

        args:
            text (str): body of text that you want to classify.
            sentiment (str): either 'pos' or 'neg'.

        formula:
                                   P(sentiment) * P(words | sentiment)
            P(sentiment | words) = ------------------------------------
                                                P(words)
        """
        words, _ = self.tokenize_with_negation(text)
        p_sentiment = Word.priori_sentiment(sentiment)
        try:
            words = self.filter_words(words)
            p_words_sentiment = reduce(lambda x, y: x * y,
                                       (self.p_word_sentiment(w, sentiment)
                                        for w in words))
            p_words = reduce(lambda x, y: x * y,
                             (self.words[w].p_word for w in words))
            return p_sentiment * p_words_sentiment / p_words
        except TypeError:
            # reduce() raises TypeError on an empty sequence: no trained
            # words survived filtering, so fall back to an uninformative 0.5
            return 0.5
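

# Minimal usage sketch (illustrative): assumes Python 2 (imap, maketrans,
# reduce) and that the NLTK stopwords corpus has been downloaded, e.g. via
# nltk.download('stopwords'). The toy dataset is made up purely to show the
# train/predict calls.
if __name__ == "__main__":
    data = [(-1, "this sucks so much"),
            ( 1, "i love you so much"),
            ( 1, "what a lovely happy day"),
            (-1, "a terrible awful movie")]

    collector = SentimentCollector()
    collector.train(data)

    # scores are relative; compare 'pos' against 'neg' for the same sentence
    print(collector.predict("such a lovely day", "pos"))
    print(collector.predict("such a lovely day", "neg"))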