import re
import unicodedata

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin


# Clean tweets
class process_tweets(BaseEstimator, TransformerMixin):
    '''Extracts the tweet text and applies the cleaning transformation'''

    def __init__(self):
        pass

    def preprocess_text(self, text):
        # Normalize the Unicode string (NFKD), encode it as ASCII and decode it back as UTF-8
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # Lowercase
        text = text.lower()

        # Remove HTML tags
        text = re.sub(r'<[^>]*>', '', text)

        # Replace Twitter handles and URLs with placeholder tokens and drop hashtag symbols.
        # This has to run before punctuation is stripped, otherwise the '@', '#' and '://'
        # markers these patterns rely on are already gone.
        text = re.sub(r'@[\w_-]+', 'mention', text)
        text = re.sub(r'https?://[^ ]+', 'link', text)
        text = re.sub(r'#', '', text)

        # Expand contractions and normalize common abbreviations
        text = re.sub(r"i'm", " i am ", text)
        text = re.sub(r" im ", " i am ", text)
        text = re.sub(r"\: p", "", text)
        text = re.sub(r" ive ", " i have ", text)
        text = re.sub(r" he's ", " he is ", text)
        text = re.sub(r" she's ", " she is ", text)
        text = re.sub(r" that's ", " that is ", text)
        text = re.sub(r" what's ", " what is ", text)
        text = re.sub(r" where's ", " where is ", text)
        text = re.sub(r" haven't ", " have not ", text)
        text = re.sub(r" ur ", " you are ", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r" won't ", " will not ", text)
        text = re.sub(r" wouldn't ", " would not ", text)
        text = re.sub(r" can't ", " cannot ", text)
        text = re.sub(r" can not ", " cannot ", text)
        text = re.sub(r" don't ", " do not ", text)
        text = re.sub(r" didn't ", " did not ", text)
        text = re.sub(r" doesn't ", " does not ", text)
        text = re.sub(r" isn't ", " is not ", text)
        text = re.sub(r" it's ", " it is ", text)
        text = re.sub(r" who's ", " who is ", text)
        text = re.sub(r" there's ", " there is ", text)
        text = re.sub(r" weren't ", " were not ", text)
        text = re.sub(r" okay ", " ok ", text)
        text = re.sub(r" you're ", " you are ", text)
        text = re.sub(r" c'mon ", " come on ", text)
        text = re.sub(r"in'", "ing", text)
        text = re.sub(r"\'s", " s", text)

        # Remove punctuation and special characters, except ! and ?
        text = re.sub(r'[^a-zA-Z?!\s]', ' ', text)

        # Lemmatize each word
        lemmatizer = WordNetLemmatizer()
        sentence = []
        for word in text.split(' '):
            sentence.append(lemmatizer.lemmatize(word))

        # Rebuild the sentence
        text = ' '.join(sentence)

        # Remove stopwords
        stopWords = set(stopwords.words('english'))
        sentence = []
        for word in text.split(' '):
            if word not in stopWords:
                sentence.append(word)

        # Rebuild the sentence
        text = ' '.join(sentence)

        # If nothing is left after cleaning, flag it with a placeholder token
        if text == '':
            text = 'notext'

        return text

    def transform(self, df, y=None):
        return df['tweet'].apply(self.preprocess_text)

    def fit(self, df, y=None):
        return self
# Dummify categorical variables
class dummies_transformation(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        return self

    def transform(self, df, y=None):
        for column in self.columns:
            df = pd.concat([df, pd.get_dummies(df[column], prefix=column, drop_first=True)], axis=1)
            df = df.drop([column], axis=1)
        df = df.drop(['tweet'], axis=1)
        return df
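

# Usage sketch (not part of the original gist): a minimal example of how these two
# transformers could be combined in a scikit-learn FeatureUnion, with the cleaned
# tweet text vectorized by TF-IDF and the categorical columns dummified. The
# DataFrame layout assumed here (a 'tweet' text column, a categorical 'source'
# column, a 'label' target) is hypothetical and for illustration only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline

features = FeatureUnion([
    # Clean the tweets, then turn them into TF-IDF features
    ('text', Pipeline([
        ('clean', process_tweets()),
        ('tfidf', TfidfVectorizer()),
    ])),
    # One-hot encode the categorical columns (the column name is hypothetical)
    ('categorical', dummies_transformation(columns=['source'])),
])

model = Pipeline([
    ('features', features),
    ('clf', LogisticRegression()),
])

# model.fit(train_df[['tweet', 'source']], train_df['label'])
# predictions = model.predict(test_df[['tweet', 'source']])

# Note: pd.get_dummies learns nothing during fit, so the dummy columns produced on
# new data can differ from those seen at training time; sklearn's OneHotEncoder
# fitted on the training set would avoid that mismatch.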