Last active
August 29, 2015 14:03
-
-
Save jxnl/a129929ab6139507a41f to your computer and use it in GitHub Desktop.
!!!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Author: Jason Liu | |
| """ | |
| from __future__ import division | |
| from collections import defaultdict | |
| from nltk.corpus import stopwords | |
| from string import maketrans, punctuation | |
| from itertools import imap | |
class Word(object):
    """Per-word sentiment counter used by SentimentCollector.

    Every instance starts from fixed pseudo-counts (10 occurrences, split
    5 positive / 5 negative), so a word that has never been updated yields
    0.5 from ``sentiment_word`` for both 'pos' and 'neg'.

    The ``total_*`` counters live on the class and are therefore shared by
    every instance in the process. They count ``update`` calls only and
    never include the per-instance pseudo-counts.

    Attributes:
        occurrences (int): count of word occurrences
        positive (int): count of positive occurrences
        negative (int): count of negative occurrences
    """

    # removes the per-instance __dict__ to be memory efficient
    __slots__ = ['occurrences', 'positive', 'negative']

    # class-wide tallies over every update() call on any instance
    total_occurrences = 0
    total_positive = 0
    total_negative = 0

    def __init__(self):
        # pseudo-counts: start balanced between positive and negative
        self.occurrences = 10
        self.positive = 5
        self.negative = 5

    def update(self, sentiment):
        """Record one observation of this word with the given sentiment.

        The instance counters and the class-level totals are both bumped.
        Any label other than 'pos' or 'neg' (e.g. 'neu') still counts as an
        occurrence but changes neither the positive nor negative tallies.

        args:
            sentiment (str): 'pos', 'neg', or any other label.
        """
        self.occurrences += 1
        Word.total_occurrences += 1
        if sentiment == "pos":
            Word.total_positive += 1
            self.positive += 1
        elif sentiment == "neg":
            Word.total_negative += 1
            self.negative += 1

    @property
    def p_word(self):
        """Return this word's occurrences relative to the class-wide total.

        formula:
            P(word)

        notes:
            ``occurrences`` includes the 10 pseudo-counts while the
            class-wide total does not, so the ratio can exceed 1.
            Returns 0.0 while nothing has been observed yet instead of
            raising ZeroDivisionError.
        """
        if not self.total_occurrences:
            return 0.0
        return self.occurrences / self.total_occurrences

    def sentiment_word(self, sentiment):
        """Return the empirical probability that this word carries a
        certain sentiment.

        formula:
            P(sentiment; word)

        args:
            sentiment (str): either 'pos' or 'neg'.

        returns:
            float for 'pos' / 'neg'; None for any other label.
        """
        if sentiment == 'pos':
            return self.positive / self.occurrences
        if sentiment == 'neg':
            return self.negative / self.occurrences
        return None

    @classmethod
    def priori_sentiment(cls, sentiment):
        """Return the prior of a certain sentiment over all observations.

        formula:
            P(sentiment)

        args:
            sentiment (str): either 'pos' or 'neg'.

        returns:
            float for 'pos' / 'neg' (0.0 while nothing has been observed
            yet); None for any other label.
        """
        if sentiment == 'pos':
            count = Word.total_positive
        elif sentiment == 'neg':
            count = Word.total_negative
        else:
            return None
        # Guard the untrained state, where the shared total is still zero.
        if not Word.total_occurrences:
            return 0.0
        return count / Word.total_occurrences
class SentimentCollector(object):
    """Naive term sentiment learner that uses Bayes' theorem to score the
    sentiment of words and of new sentences.

    Attributes:
        words (defaultdict) : container mapping token -> Word
        sentiment_map (dict): maps int labels to string labels
        stopwords (set): tokens dropped during tokenization
    """

    def __init__(self):
        self.words = defaultdict(Word)
        self.sentiment_map = {1: 'pos', 0: 'neu', -1: 'neg'}
        self.stopwords = set(stopwords.words('english'))

    def train(self, dataset):
        """Accept labelled bodies of text and update the word collection.

        args:
            dataset (iterable of tuples): the first element is -1, 0 or 1
                (negative, neutral, positive) and the second element is
                the input string.

        raises:
            KeyError: if a label is not a key of ``sentiment_map``.

        notes:
            data = [(-1, "this sucks so much"),
                    ( 1, "i love you so much"),
                    ( 0, "cats and dogs")]
        """
        for sentiment, text in dataset:
            label = self.sentiment_map[int(sentiment)]
            tokens, _ = self.tokenize_with_negation(text)
            for token in tokens:
                self.update(token, label)

    def update(self, word, sentiment):
        """Update the word with its sentiment frequencies.

        args:
            word (str): single word to update in dict.
            sentiment (str): either 'pos', 'neg' (other labels only count
                as an occurrence).
        """
        self.words[word].update(sentiment)

    def p_word_sentiment(self, word, sentiment):
        """Apply Bayes' theorem to obtain the probability of a word
        occurring given a certain sentiment.

        args:
            word (string): observed word.
            sentiment (str): either 'pos', 'neg'.

        formula:
                                 P(word) * P(sentiment; word)
            P(word; sentiment) = ------------------------------
                                        P(sentiment)

        raises:
            ZeroDivisionError: if the prior of ``sentiment`` is zero.
            TypeError: if ``sentiment`` is not 'pos' or 'neg'.
        """
        entry = self.words.get(word)
        if entry is None:
            # Unseen word: score it with the default counts, but do not
            # store it -- a lookup must not grow the model.
            entry = Word()
        p_sentiment_word = entry.sentiment_word(sentiment)
        p_sentiment = Word.priori_sentiment(sentiment)
        return entry.p_word * p_sentiment_word / p_sentiment

    @staticmethod
    def remove_puncutation(word):
        """Remove punctuation from a string.

        args:
            word (str): word

        returns:
            The word with every character of ``string.punctuation``
            dropped.
        """
        # A plain filter works for both byte and unicode strings, unlike
        # the two-argument str.translate form.
        return "".join(ch for ch in word if ch not in punctuation)

    def tokenize_with_negation(self, text):
        """Tokenize terms while negating any term that has 'not'
        before it. e.g. 'not happy' => 'not_happy'

        Tokens are lower-cased and stripped of punctuation, and stop-words
        are dropped. 'not' itself is never emitted; it marks the next
        surviving token with a "not_" prefix. A trailing 'not' with nothing
        after it is simply discarded.

        args:
            text (string): body of text to tokenize.

        returns:
            (list of str, bool): the tokens, and whether any 'not' was seen.
        """
        tokens = []
        negated = False
        negate_next = False
        for raw in text.split():
            lowered = raw.lower()
            cleaned = self.remove_puncutation(lowered)
            if not cleaned:
                continue
            # Check for the negation marker before the stop-word filter,
            # otherwise a stop-word list containing "not" would hide it.
            if cleaned == "not":
                negated = True
                negate_next = True
                continue
            # Test both spellings so stop-words that contain punctuation
            # (contractions) are still recognised.
            if lowered in self.stopwords or cleaned in self.stopwords:
                continue
            if negate_next:
                cleaned = "not_" + cleaned
                negate_next = False
            tokens.append(cleaned)
        return tokens, negated

    def filter_words(self, words):
        """Keep only the words the model has actually been trained on.

        A Word starts at 10 occurrences, so "more than 10" means it has
        been updated at least once.

        args:
            words (list): candidate tokens.

        returns:
            list of the tokens that pass, in their original order.
        """
        known = self.words
        return [w for w in words if w in known and known[w].occurrences > 10]

    def predict(self, text, sentiment):
        """Apply Bayes' theorem to score a sentence as positive or
        negative given its set of words.

        args:
            text (string): body of text that you want to classify.
            sentiment (str): either 'pos', 'neg'.

        formula:
                                  P(sentiment) * P(words; sentiment)
            P(sentiment; words) = -----------------------------------
                                              P(words)

        returns:
            float: the score, or 0.5 when it cannot be computed (no trained
            words in the text, an unsupported sentiment label, or a zero
            prior). The score is not normalised across sentiments.
        """
        words, _ = self.tokenize_with_negation(text)
        words = self.filter_words(words)
        if not words:
            return 0.5  # filtering left nothing informative to score
        try:
            score = Word.priori_sentiment(sentiment)
            for word in words:
                # Divide word by word rather than dividing two long
                # products, which could both underflow to zero.
                score *= (self.p_word_sentiment(word, sentiment)
                          / self.words[word].p_word)
        except (TypeError, ZeroDivisionError):
            # TypeError: unsupported label (helpers return None).
            # ZeroDivisionError: the sentiment was never observed.
            return 0.5
        return score
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment