jonasft · June 5, 2016 19:32
diff --git a/lexicon_transformer.py b/lexicon_transformer.py
 import csv
 import re
 from sklearn.base import TransformerMixin, BaseEstimator
 from sklearn.preprocessing import normalize
 from data import lexicons
 import numpy as np
 from data import resources
 from transformers.tfidf_transformer import TfidfTransformer

 class LexiconTransformer(TransformerMixin, BaseEstimator):
 	"""Score raw tweets against three manual sentiment lexicons.

 	Each tweet is mapped to a row of four accumulated scores:

 	* column 0 -- sum of the positive lexicon values of plain tokens
 	* column 1 -- sum of the non-positive lexicon values of plain tokens
 	* column 2 -- minus the positive values of negated tokens
 	* column 3 -- minus the non-positive values of negated tokens

 	A token counts as negated when it ends in ``_NEG`` or ``_NEGFIRST``; the
 	suffix is stripped before the lexicon lookup.

 	Parameters
 	----------
 	norm : bool, default False
 		When true, the score matrix of every individual lexicon is passed
 		through ``sklearn.preprocessing.normalize`` before the matrices are
 		summed.
 	negate : bool, default True
 		When true, the tweets are first run through
 		``TfidfTransformer().process_negation_in_dataset`` (presumably the
 		step that adds the ``_NEG`` suffixes -- confirm in that module).
 	"""

 	def __init__(self, norm=False, negate=True):
 		# ``norm`` is the single source of truth. ``normalize`` used to be a
 		# second copy of it, which went stale whenever scikit-learn changed
 		# the parameter through set_params(norm=...); it is now a property
 		# that aliases ``norm``.
 		self.norm = norm
 		# Lexicon loaders: called without arguments in transform(), each is
 		# expected to return a token -> score mapping.
 		self.nrc = lexicons._nrc_emotion
 		self.bingliu = lexicons._bing_liu
 		self.mpqa = lexicons._mpqa
 		self.negate = negate

 	@property
 	def normalize(self):
 		"""Backward-compatible alias of the ``norm`` parameter."""
 		return self.norm

 	@normalize.setter
 	def normalize(self, value):
 		self.norm = value

 	def transform(self, raw_tweets, y=None):
 		"""Return a ``(len(raw_tweets), 4)`` array of summed lexicon scores.

 		The scores of the NRC, Bing Liu and MPQA lexicons are computed
 		separately by ``_manual_lexicon_scorer`` and added together.
 		``y`` is accepted for API compatibility and ignored.
 		"""
 		matrix = np.zeros((len(raw_tweets), 4))
 		if self.negate:
 			content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
 		else:
 			content = raw_tweets
 		for load_lexicon in (self.nrc, self.bingliu, self.mpqa):
 			matrix += self._manual_lexicon_scorer(content, load_lexicon())
 		return matrix

 	def fit(self, raw_tweets, y=None):
 		"""Nothing is learned from the data; return ``self`` unchanged."""
 		return self

 	def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
 		"""Score every tweet against a single lexicon.

 		Parameters
 		----------
 		raw_tweets : sequence of str
 			Tweets, tokenised here by splitting on single spaces.
 		lexicon_dict : mapping
 			Token -> score lookup that raises ``KeyError`` for unknown
 			tokens (scores are assumed to be numeric).

 		Returns
 		-------
 		numpy.ndarray
 			Array of shape ``(len(raw_tweets), 4)`` laid out as described in
 			the class docstring, normalised when ``self.norm`` is true.
 		"""
 		scores = np.zeros((len(raw_tweets), 4))
 		# Compiled once instead of being re-resolved for every token.
 		negation_re = re.compile(r'(.*)_NEG(?:FIRST)?$')
 		for i, tweet in enumerate(raw_tweets):
 			for token in tweet.split(" "):
 				negated = negation_re.match(token) is not None
 				if negated:
 					token = negation_re.sub(r'\1', token)
 				try:
 					value = lexicon_dict[token]
 				except KeyError:
 					# Tokens missing from the lexicon contribute nothing.
 					continue
 				if negated:
 					# Negation flips the sign and uses separate columns.
 					scores[i][2 if value > 0 else 3] -= value
 				else:
 					scores[i][0 if value > 0 else 1] += value
 		return normalize(scores) if self.norm else scores
	import csv
	import re
	from sklearn.base import TransformerMixin, BaseEstimator
	from sklearn.preprocessing import normalize
	from data import lexicons
	import numpy as np
	from data import resources
	from transformers.tfidf_transformer import TfidfTransformer

	class LexiconTransformer(TransformerMixin, BaseEstimator):
		"""Score raw tweets against three manual sentiment lexicons.

		Each tweet is mapped to a row of four accumulated scores:

		* column 0 -- sum of the positive lexicon values of plain tokens
		* column 1 -- sum of the non-positive lexicon values of plain tokens
		* column 2 -- minus the positive values of negated tokens
		* column 3 -- minus the non-positive values of negated tokens

		A token counts as negated when it ends in ``_NEG`` or ``_NEGFIRST``;
		the suffix is stripped before the lexicon lookup.

		Parameters
		----------
		norm : bool, default False
			When true, the score matrix of every individual lexicon is passed
			through ``sklearn.preprocessing.normalize`` before the matrices
			are summed.
		negate : bool, default True
			When true, the tweets are first run through
			``TfidfTransformer().process_negation_in_dataset`` (presumably the
			step that adds the ``_NEG`` suffixes -- confirm in that module).
		"""

		def __init__(self, norm=False, negate=True):
			# ``norm`` is the single source of truth. ``normalize`` used to be
			# a second copy of it, which went stale whenever scikit-learn
			# changed the parameter through set_params(norm=...); it is now a
			# property that aliases ``norm``.
			self.norm = norm
			# Lexicon loaders: called without arguments in transform(), each
			# is expected to return a token -> score mapping.
			self.nrc = lexicons._nrc_emotion
			self.bingliu = lexicons._bing_liu
			self.mpqa = lexicons._mpqa
			self.negate = negate

		@property
		def normalize(self):
			"""Backward-compatible alias of the ``norm`` parameter."""
			return self.norm

		@normalize.setter
		def normalize(self, value):
			self.norm = value

		def transform(self, raw_tweets, y=None):
			"""Return a ``(len(raw_tweets), 4)`` array of summed lexicon scores.

			The scores of the NRC, Bing Liu and MPQA lexicons are computed
			separately by ``_manual_lexicon_scorer`` and added together.
			``y`` is accepted for API compatibility and ignored.
			"""
			matrix = np.zeros((len(raw_tweets), 4))
			if self.negate:
				content = TfidfTransformer().process_negation_in_dataset(raw_tweets)
			else:
				content = raw_tweets
			for load_lexicon in (self.nrc, self.bingliu, self.mpqa):
				matrix += self._manual_lexicon_scorer(content, load_lexicon())
			return matrix

		def fit(self, raw_tweets, y=None):
			"""Nothing is learned from the data; return ``self`` unchanged."""
			return self

		def _manual_lexicon_scorer(self, raw_tweets, lexicon_dict):
			"""Score every tweet against a single lexicon.

			Parameters
			----------
			raw_tweets : sequence of str
				Tweets, tokenised here by splitting on single spaces.
			lexicon_dict : mapping
				Token -> score lookup that raises ``KeyError`` for unknown
				tokens (scores are assumed to be numeric).

			Returns
			-------
			numpy.ndarray
				Array of shape ``(len(raw_tweets), 4)`` laid out as described
				in the class docstring, normalised when ``self.norm`` is true.
			"""
			scores = np.zeros((len(raw_tweets), 4))
			# Compiled once instead of being re-resolved for every token.
			negation_re = re.compile(r'(.*)_NEG(?:FIRST)?$')
			for i, tweet in enumerate(raw_tweets):
				for token in tweet.split(" "):
					negated = negation_re.match(token) is not None
					if negated:
						token = negation_re.sub(r'\1', token)
					try:
						value = lexicon_dict[token]
					except KeyError:
						# Tokens missing from the lexicon contribute nothing.
						continue
					if negated:
						# Negation flips the sign and uses separate columns.
						scores[i][2 if value > 0 else 3] -= value
					else:
						scores[i][0 if value > 0 else 1] += value
			return normalize(scores) if self.norm else scores