jxnl · August 29, 2015 14:03
diff --git a/naive_trainer.py b/naive_trainer.py
 """
 Author: Jason Liu
 """

 from __future__ import division
 from collections import defaultdict
 from nltk.corpus import stopwords
 from string import maketrans, punctuation
 from itertools import imap


 class Word(object):

    """High Performance word class for SentimentCollector to keep
    track of word counts

    Every word starts at 10 occurrences, 5 positive and 5 negative, so a
    word that was never updated scores 0.5 for either sentiment.  These
    starting counts are not added to the class-level totals.

    Attributes:
        occurrences (int): count of word occurrences
        positive (int): count of positive occurrences
        negative (int): count of negative occurrences
        total_occurrences (int): class-level count of every update
        total_positive (int): class-level count of 'pos' updates
        total_negative (int): class-level count of 'neg' updates

    """

    # removes __dict__ object to be memory efficient
    __slots__ = ['occurrences', 'positive', 'negative']

    # Shared by every instance; only update() changes them.
    total_occurrences = 0
    total_positive = 0
    total_negative = 0

    def __init__(self):
        self.occurrences = 10
        self.positive = 5
        self.negative = 5

    def update(self, sentiment):
        """Update the word with their sentiment frequencies
        along with modifying the class level counters

        args:
            sentiment (str): 'pos' or 'neg'. Any other value (e.g. 'neu')
                only counts towards the occurrence counters.

        """
        self.occurrences += 1
        Word.total_occurrences += 1
        if sentiment == "pos":
            Word.total_positive += 1
            self.positive += 1
        elif sentiment == "neg":
            Word.total_negative += 1
            self.negative += 1

    @property
    def p_word(self):
        """Return the empirical probability that this word will occur.

        Returns 0.0 while no update has been recorded at all, instead of
        dividing by zero.

        formula:
            P(word)

        """
        # NOTE(review): occurrences starts at 10 while the class total
        # starts at 0, so this ratio can exceed 1 on a small corpus.
        if not self.total_occurrences:
            return 0.0
        return self.occurrences / self.total_occurrences

    def sentiment_word(self, sentiment):
        """Return the empirical probability that this word is a
        certain sentiment.

        args:
            sentiment (str): either 'pos', 'neg'.

        returns:
            The ratio for 'pos' or 'neg'; None for any other sentiment.

        formula:
            P(sentiment; word)

        """
        if sentiment == 'pos':
            return self.positive / self.occurrences
        if sentiment == 'neg':
            return self.negative / self.occurrences
        return None

    @classmethod
    def priori_sentiment(cls, sentiment):
        """Return the a priori probability of a certain sentiment appearing

        args:
            sentiment (str): either 'pos', 'neg'.

        returns:
            The ratio for 'pos' or 'neg' (0.0 while no update has been
            recorded); None for any other sentiment.

        formula:
            P(sentiment)
        """
        if sentiment == 'pos':
            count = cls.total_positive
        elif sentiment == 'neg':
            count = cls.total_negative
        else:
            return None
        if not cls.total_occurrences:
            # nothing counted yet: avoid ZeroDivisionError
            return 0.0
        return count / cls.total_occurrences


 class SentimentCollector(object):

    """Naive Term Sentiment learner that uses Bayes' theorem to calculate the
    sentiment of new words and predicts the sentiment of new sentences

    Attributes:
        words (defaultdict) : container for word objects, keyed by token
        sentiment_map (dict): maps the int labels 1, 0, -1 to the string
            values 'pos', 'neu', 'neg'
        stopwords (set): NLTK's English stop words, dropped while
            tokenizing

    """

    def __init__(self):
        self.words = defaultdict(Word)
        self.sentiment_map = {1: 'pos', 0: 'neu', -1: 'neg'}
        self.stopwords = set(stopwords.words('english'))

    def train(self, dataset):
        """Accept a new body of text given a specific known sentiment to
        update our collection of words

        args:
            dataset (iterable of tuples): Where the first element is
                -1, 0 or 1 (or anything int() converts to one of them),
                denoting negative, neutral and positive, and the second
                element is the input string.

        raises:
            KeyError: if a label is not one of -1, 0, 1.

        notes:
               data = [(-1, "this sucks so much"),
                       ( 1, "i love you so much"),
                       ( 0, "cats and dogs")]
        """
        for sentiment, text in dataset:
            # resolve the label once per text instead of once per word
            label = self.sentiment_map[int(sentiment)]
            words, _ = self.tokenize_with_negation(text)
            for word in words:
                self.update(word, label)

    def update(self, word, sentiment):
        """Update the word with their sentiment frequencies

        args:
            word (str): single word to update in dict.
            sentiment (str): either 'pos', 'neg' or 'neu'.

        """
        self.words[word].update(sentiment)

    def p_word_sentiment(self, word, sentiment):
        """Apply Bayes' theorem to obtain the probability of a word occurring
        given a certain sentiment

        A word that is not in the collection is scored with a fresh Word
        but is not stored.  Returns 0.0 when the computation would divide
        by zero.

        args:
            word (string): observed word.
            sentiment (str): either 'pos', 'neg'.

        formula:
                                  P(word) * P(sentiment; word)
            P(word; sentiment) = ------------------------------
                                           P(sentiment)

        """
        # Looking the word up with self.words[word] would make the
        # defaultdict insert every unseen word as a side effect.
        entry = self.words[word] if word in self.words else Word()
        try:
            return (entry.p_word * entry.sentiment_word(sentiment)
                    / Word.priori_sentiment(sentiment))
        except ZeroDivisionError:
            # no counts at all, or none for this sentiment
            return 0.0

    @staticmethod
    def remove_puncutation(word):
        """Remove punctuation from a string

        args:
            word (str): word

        returns:
            The word without any character listed in string.punctuation.

        """
        # A plain filter, unlike the two-argument str.translate form,
        # works for byte strings and unicode strings alike.
        return "".join(ch for ch in word if ch not in punctuation)

    def tokenize_with_negation(self, text):
        """Tokenize terms while negating any term that has 'not'
        before it. e.g. 'not happy' => 'not_happy'

        Terms are split on whitespace, lower-cased and stripped of
        punctuation; stop words and empty terms are dropped.  A 'not'
        is removed and prefixes the next term that is kept.

        args:
            text (string): body of text to tokenize.

        returns:
            tuple: (words, negated) where words is the list of tokens and
                negated is True when at least one 'not' was found.
        """
        words = []
        negated = False
        negate_next = False
        for raw in text.split():
            lowered = raw.lower()
            word = self.remove_puncutation(lowered)
            if word == 'not':
                # Checked before the stop word filter so a stop word
                # list that contains 'not' cannot hide the negation.
                negated = True
                negate_next = True
                continue
            if not word or lowered in self.stopwords or word in self.stopwords:
                continue
            if negate_next:
                word = 'not_' + word
                negate_next = False
            words.append(word)
        return words, negated

    def filter_words(self, words):
        """Keep the words that are in the collection with more than
        10 occurrences.

        args:
            words (list): candidate words.

        returns:
            list: the words that pass, in their original order.
        """
        # Membership is tested first so the defaultdict never creates an
        # entry for an unknown word.
        return [word for word in words
                if word in self.words and self.words[word].occurrences > 10]

    def predict(self, text, sentiment):
        """Apply Bayes' theorem to obtain the probability of a sentence
        being positive or negative given a certain set of words

        args:
            text (string): body of text that you want to classify.
            sentiment (str): either 'pos', 'neg'.

        returns:
            float: 0.5 when no word of the text passes filter_words or
                the sentiment is neither 'pos' nor 'neg'; 0.0 when the
                prior of the sentiment is zero; otherwise the value of
                the formula below.

        formula:
                                  P(sentiment) * P(words; sentiment)
            P(sentiment; words) = -----------------------------------
                                               P(words)
        """
        tokens, _ = self.tokenize_with_negation(text)
        words = self.filter_words(tokens)
        if sentiment not in ('pos', 'neg') or not words:
            # nothing to decide on: filtering heavily biases the terms
            return 0.5
        p_sentiment = Word.priori_sentiment(sentiment)
        if not p_sentiment:
            return 0.0
        # Divide word by word rather than building the two products
        # separately, so a long text cannot underflow the denominator.
        ratio = 1.0
        for word in words:
            ratio *= (self.p_word_sentiment(word, sentiment)
                      / self.words[word].p_word)
        return p_sentiment * ratio
	"""
	Author: Jason Liu
	"""

	from __future__ import division
	from collections import defaultdict
	from nltk.corpus import stopwords
	from string import maketrans, punctuation
	from itertools import imap


	class Word(object):

	    """High Performance word class for SentimentCollector to keep
	    track of word counts

	    Every word starts at 10 occurrences, 5 positive and 5 negative, so a
	    word that was never updated scores 0.5 for either sentiment.  These
	    starting counts are not added to the class-level totals.

	    Attributes:
	        occurrences (int): count of word occurrences
	        positive (int): count of positive occurrences
	        negative (int): count of negative occurrences
	        total_occurrences (int): class-level count of every update
	        total_positive (int): class-level count of 'pos' updates
	        total_negative (int): class-level count of 'neg' updates

	    """

	    # removes __dict__ object to be memory efficient
	    __slots__ = ['occurrences', 'positive', 'negative']

	    # Shared by every instance; only update() changes them.
	    total_occurrences = 0
	    total_positive = 0
	    total_negative = 0

	    def __init__(self):
	        self.occurrences = 10
	        self.positive = 5
	        self.negative = 5

	    def update(self, sentiment):
	        """Update the word with their sentiment frequencies
	        along with modifying the class level counters

	        args:
	            sentiment (str): 'pos' or 'neg'. Any other value (e.g. 'neu')
	                only counts towards the occurrence counters.

	        """
	        self.occurrences += 1
	        Word.total_occurrences += 1
	        if sentiment == "pos":
	            Word.total_positive += 1
	            self.positive += 1
	        elif sentiment == "neg":
	            Word.total_negative += 1
	            self.negative += 1

	    @property
	    def p_word(self):
	        """Return the empirical probability that this word will occur.

	        Returns 0.0 while no update has been recorded at all, instead of
	        dividing by zero.

	        formula:
	            P(word)

	        """
	        # NOTE(review): occurrences starts at 10 while the class total
	        # starts at 0, so this ratio can exceed 1 on a small corpus.
	        if not self.total_occurrences:
	            return 0.0
	        return self.occurrences / self.total_occurrences

	    def sentiment_word(self, sentiment):
	        """Return the empirical probability that this word is a
	        certain sentiment.

	        args:
	            sentiment (str): either 'pos', 'neg'.

	        returns:
	            The ratio for 'pos' or 'neg'; None for any other sentiment.

	        formula:
	            P(sentiment; word)

	        """
	        if sentiment == 'pos':
	            return self.positive / self.occurrences
	        if sentiment == 'neg':
	            return self.negative / self.occurrences
	        return None

	    @classmethod
	    def priori_sentiment(cls, sentiment):
	        """Return the a priori probability of a certain sentiment appearing

	        args:
	            sentiment (str): either 'pos', 'neg'.

	        returns:
	            The ratio for 'pos' or 'neg' (0.0 while no update has been
	            recorded); None for any other sentiment.

	        formula:
	            P(sentiment)
	        """
	        if sentiment == 'pos':
	            count = cls.total_positive
	        elif sentiment == 'neg':
	            count = cls.total_negative
	        else:
	            return None
	        if not cls.total_occurrences:
	            # nothing counted yet: avoid ZeroDivisionError
	            return 0.0
	        return count / cls.total_occurrences


	class SentimentCollector(object):

	    """Naive Term Sentiment learner that uses Bayes' theorem to calculate the
	    sentiment of new words and predicts the sentiment of new sentences

	    Attributes:
	        words (defaultdict) : container for word objects, keyed by token
	        sentiment_map (dict): maps the int labels 1, 0, -1 to the string
	            values 'pos', 'neu', 'neg'
	        stopwords (set): NLTK's English stop words, dropped while
	            tokenizing

	    """

	    def __init__(self):
	        self.words = defaultdict(Word)
	        self.sentiment_map = {1: 'pos', 0: 'neu', -1: 'neg'}
	        self.stopwords = set(stopwords.words('english'))

	    def train(self, dataset):
	        """Accept a new body of text given a specific known sentiment to
	        update our collection of words

	        args:
	            dataset (iterable of tuples): Where the first element is
	                -1, 0 or 1 (or anything int() converts to one of them),
	                denoting negative, neutral and positive, and the second
	                element is the input string.

	        raises:
	            KeyError: if a label is not one of -1, 0, 1.

	        notes:
	               data = [(-1, "this sucks so much"),
	                       ( 1, "i love you so much"),
	                       ( 0, "cats and dogs")]
	        """
	        for sentiment, text in dataset:
	            # resolve the label once per text instead of once per word
	            label = self.sentiment_map[int(sentiment)]
	            words, _ = self.tokenize_with_negation(text)
	            for word in words:
	                self.update(word, label)

	    def update(self, word, sentiment):
	        """Update the word with their sentiment frequencies

	        args:
	            word (str): single word to update in dict.
	            sentiment (str): either 'pos', 'neg' or 'neu'.

	        """
	        self.words[word].update(sentiment)

	    def p_word_sentiment(self, word, sentiment):
	        """Apply Bayes' theorem to obtain the probability of a word occurring
	        given a certain sentiment

	        A word that is not in the collection is scored with a fresh Word
	        but is not stored.  Returns 0.0 when the computation would divide
	        by zero.

	        args:
	            word (string): observed word.
	            sentiment (str): either 'pos', 'neg'.

	        formula:
	                                  P(word) * P(sentiment; word)
	            P(word; sentiment) = ------------------------------
	                                           P(sentiment)

	        """
	        # Looking the word up with self.words[word] would make the
	        # defaultdict insert every unseen word as a side effect.
	        entry = self.words[word] if word in self.words else Word()
	        try:
	            return (entry.p_word * entry.sentiment_word(sentiment)
	                    / Word.priori_sentiment(sentiment))
	        except ZeroDivisionError:
	            # no counts at all, or none for this sentiment
	            return 0.0

	    @staticmethod
	    def remove_puncutation(word):
	        """Remove punctuation from a string

	        args:
	            word (str): word

	        returns:
	            The word without any character listed in string.punctuation.

	        """
	        # A plain filter, unlike the two-argument str.translate form,
	        # works for byte strings and unicode strings alike.
	        return "".join(ch for ch in word if ch not in punctuation)

	    def tokenize_with_negation(self, text):
	        """Tokenize terms while negating any term that has 'not'
	        before it. e.g. 'not happy' => 'not_happy'

	        Terms are split on whitespace, lower-cased and stripped of
	        punctuation; stop words and empty terms are dropped.  A 'not'
	        is removed and prefixes the next term that is kept.

	        args:
	            text (string): body of text to tokenize.

	        returns:
	            tuple: (words, negated) where words is the list of tokens and
	                negated is True when at least one 'not' was found.
	        """
	        words = []
	        negated = False
	        negate_next = False
	        for raw in text.split():
	            lowered = raw.lower()
	            word = self.remove_puncutation(lowered)
	            if word == 'not':
	                # Checked before the stop word filter so a stop word
	                # list that contains 'not' cannot hide the negation.
	                negated = True
	                negate_next = True
	                continue
	            if not word or lowered in self.stopwords or word in self.stopwords:
	                continue
	            if negate_next:
	                word = 'not_' + word
	                negate_next = False
	            words.append(word)
	        return words, negated

	    def filter_words(self, words):
	        """Keep the words that are in the collection with more than
	        10 occurrences.

	        args:
	            words (list): candidate words.

	        returns:
	            list: the words that pass, in their original order.
	        """
	        # Membership is tested first so the defaultdict never creates an
	        # entry for an unknown word.
	        return [word for word in words
	                if word in self.words and self.words[word].occurrences > 10]

	    def predict(self, text, sentiment):
	        """Apply Bayes' theorem to obtain the probability of a sentence
	        being positive or negative given a certain set of words

	        args:
	            text (string): body of text that you want to classify.
	            sentiment (str): either 'pos', 'neg'.

	        returns:
	            float: 0.5 when no word of the text passes filter_words or
	                the sentiment is neither 'pos' nor 'neg'; 0.0 when the
	                prior of the sentiment is zero; otherwise the value of
	                the formula below.

	        formula:
	                                  P(sentiment) * P(words; sentiment)
	            P(sentiment; words) = -----------------------------------
	                                               P(words)
	        """
	        tokens, _ = self.tokenize_with_negation(text)
	        words = self.filter_words(tokens)
	        if sentiment not in ('pos', 'neg') or not words:
	            # nothing to decide on: filtering heavily biases the terms
	            return 0.5
	        p_sentiment = Word.priori_sentiment(sentiment)
	        if not p_sentiment:
	            return 0.0
	        # Divide word by word rather than building the two products
	        # separately, so a long text cannot underflow the denominator.
	        ratio = 1.0
	        for word in words:
	            ratio *= (self.p_word_sentiment(word, sentiment)
	                      / self.words[word].p_word)
	        return p_sentiment * ratio