Created
September 6, 2017 21:44
-
-
Save youngsoul/b748bee79c064dc7ac4620c4c8eb29c9 to your computer and use it in GitHub Desktop.
A number of data transformers for sentiment analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from sklearn.base import BaseEstimator, TransformerMixin | |
class RemoveEllipseTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that replaces runs of three to five dots with a single space.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # One greedy pattern replaces the former three sequential passes
    # (five dots, then four, then three).  Longer runs are consumed five dots
    # at a time from the left, and a leftover of one or two dots is kept,
    # exactly as the sequential passes did.
    _ELLIPSIS_PATTERN = re.compile(r"\.{3,5}")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Replace each ellipsis in the series with a space, modifying the
        series in place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        data_series.replace(
            RemoveEllipseTransformer._ELLIPSIS_PATTERN, " ", inplace=True
        )

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveEllipseTransformer._preprocess_data(cleaned)
        return cleaned
class RemoveNumbersTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that strips numbers from a series of text.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # A run of digits, optionally followed by a dot and more digits, together
    # with at most one whitespace character directly in front of it.
    _NUMBER_PATTERN = re.compile(r"\s?[0-9]+\.?[0-9]*")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Remove every number from the series, modifying the series in place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        data_series.replace(
            RemoveNumbersTransformer._NUMBER_PATTERN, "", inplace=True
        )

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveNumbersTransformer._preprocess_data(cleaned)
        return cleaned
class RemoveSpecialCharactersTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that deletes punctuation and other special characters.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # Every character that is stripped.  The multi-character entries "--" and
    # "---" of the earlier list are covered by the single "-".
    _SPECIAL_CHARACTERS = ",:\"=&;%$@^*(){}[]|/\\><-!?.'#"

    # A single character class removes all of them in one pass over the data
    # instead of one pass per character; re.escape keeps "]", "\\", "^" and
    # "-" literal inside the brackets.
    _SPECIAL_CHARACTER_PATTERN = re.compile(
        "[" + re.escape(_SPECIAL_CHARACTERS) + "]"
    )

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Remove every special character from the series, modifying the series
        in place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        data_series.replace(
            RemoveSpecialCharactersTransformer._SPECIAL_CHARACTER_PATTERN,
            "",
            inplace=True,
        )

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveSpecialCharactersTransformer._preprocess_data(cleaned)
        return cleaned
class RemoveHtmlEncodedTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that deletes HTML-encoded entities and the "w/o" / "w/"
    abbreviations from a series of text.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # The entities must be spelled in their encoded form: the decoded
    # characters would not match encoded text, and a bare double quote breaks
    # the string literal.  "w/o" is listed before "w/" so it is removed whole
    # instead of leaving a stray "o" behind.
    _REMOVAL_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in ("&lt;", "&gt;", "&quot;", "&amp;", "w/o", "w/")
    )

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Remove each pattern in turn from the series, modifying the series in
        place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        for pattern in RemoveHtmlEncodedTransformer._REMOVAL_PATTERNS:
            data_series.replace(pattern, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveHtmlEncodedTransformer._preprocess_data(cleaned)
        return cleaned
class RemoveUsernameTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that strips "@name" mentions from a series of text.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # "@" followed by one or more non-whitespace characters, plus at most one
    # trailing whitespace character.
    _USERNAME_PATTERN = re.compile(r"@[^\s]+[\s]?")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Remove every user name from the series, modifying the series in place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        data_series.replace(
            RemoveUsernameTransformer._USERNAME_PATTERN, "", inplace=True
        )

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveUsernameTransformer._preprocess_data(cleaned)
        return cleaned
class RemoveUrlsTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that strips http/https style URLs from a series of text.

    Inherits from BaseEstimator (listed last, after the mixin) so the
    transformer exposes get_params/set_params.
    """

    # "http", at most one arbitrary character (so "https" matches too),
    # "://", one or more non-whitespace characters, and at most one trailing
    # whitespace character.
    _URL_PATTERN = re.compile(r"http.?://[^\s]+[\s]?")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Remove every URL from the series, modifying the series in place.

        :param data_series: Series of text to clean; modified in place.
        :return: None
        """
        data_series.replace(RemoveUrlsTransformer._URL_PATTERN, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present for the fit/transform protocol.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data.
        :return: a cleaned copy of X; the caller's series is left untouched.
        """
        # The helper works in place, so run it on a copy to avoid mutating
        # the data the caller handed in.
        cleaned = X.copy()
        RemoveUrlsTransformer._preprocess_data(cleaned)
        return cleaned
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment