Created
June 5, 2016 19:32
-
-
Save jonasft/b57edc86d582cd0f7a13d9e3d01e1f8f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
from sklearn.base import TransformerMixin, BaseEstimator | |
from sklearn.preprocessing import normalize | |
from data import lexicons | |
import numpy as np | |
from data import resources | |
from transformers.tfidf_transformer import TfidfTransformer | |
class LexiconTransformer(TransformerMixin, BaseEstimator):
    """Turn raw tweets into four lexicon-based sentiment score columns.

    Every tweet is split on single spaces and each token is looked up in
    three manually built lexicons (NRC emotion, Bing Liu, MPQA).  The
    per-lexicon score matrices are summed into one ``(n_tweets, 4)`` array
    with the following columns:

    0. sum of the positive values of non-negated tokens
    1. sum of the non-positive values of non-negated tokens
    2. positive values of negated tokens, subtracted (column is <= 0)
    3. non-positive values of negated tokens, subtracted (column is >= 0)

    A token counts as negated when it ends in ``_NEG`` or ``_NEGFIRST``.

    Parameters
    ----------
    norm : bool, default False
        When true, each lexicon's score matrix is passed through
        ``sklearn.preprocessing.normalize`` before the matrices are summed.
    negate : bool, default True
        When true, the tweets are first run through
        ``TfidfTransformer().process_negation_in_dataset``; presumably that
        helper is what appends the negation markers -- confirm against the
        helper's implementation.
    """

    # Compiled once for the whole class instead of being re-resolved for
    # every token.  Group 1 is the token with the negation marker removed.
    _NEGATION_RE = re.compile(r'(.*)_NEG(?:FIRST)?$')

    def __init__(self, norm=False, negate=True):
        # Estimator parameters are stored under their own names so that
        # BaseEstimator.get_params / set_params / clone can manage them.
        self.norm = norm
        self.negate = negate
        # Lexicon loaders: zero-argument callables, invoked in ``transform``.
        self.nrc = lexicons._nrc_emotion
        self.bingliu = lexicons._bing_liu
        self.mpqa = lexicons._mpqa

    @property
    def normalize(self):
        """Backward-compatible alias of ``norm``.

        Earlier code kept an independent ``normalize`` attribute and read
        it when scoring, so ``set_params(norm=...)`` silently had no
        effect.  Aliasing keeps the two names permanently in sync.
        """
        return self.norm

    @normalize.setter
    def normalize(self, value):
        self.norm = value

    def transform(self, raw_tweets, y=None):
        """Score ``raw_tweets`` against every lexicon and sum the results.

        Parameters
        ----------
        raw_tweets : sized iterable of str
            The tweets to score; ``len()`` must be supported.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(raw_tweets), 4)``; see the class
            docstring for the meaning of the columns.
        """
        matrix = np.zeros((len(raw_tweets), 4))
        if self.negate:
            content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
        else:
            content = raw_tweets
        for load_lexicon in (self.nrc, self.bingliu, self.mpqa):
            matrix += self._manual_lexicon_scorer(content, load_lexicon())
        return matrix

    def fit(self, raw_tweets, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
        """Score every tweet against a single lexicon.

        Parameters
        ----------
        raw_tweets : sized iterable of str
            Tweets whose tokens are separated by single spaces.
        lexicon_dict : mapping
            Maps a token to its numeric sentiment value.  Tokens for which
            the lookup raises ``KeyError`` are skipped.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(raw_tweets), 4)``, passed through
            ``normalize`` when ``self.norm`` is true.
        """
        scores = np.zeros((len(raw_tweets), 4))
        match_negation = self._NEGATION_RE.match
        for row, tweet in enumerate(raw_tweets):
            for token in tweet.split(" "):
                negated = match_negation(token)
                if negated:
                    # Look the word up without its negation marker.
                    token = negated.group(1)
                try:
                    value = lexicon_dict[token]
                except KeyError:
                    # Token is not covered by this lexicon.
                    continue
                if negated:
                    # Negation flips the contribution: the value is
                    # subtracted and lands in the "negated" columns 2 / 3.
                    scores[row][2 if value > 0 else 3] -= value
                else:
                    scores[row][0 if value > 0 else 1] += value
        return normalize(scores) if self.norm else scores
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment