Skip to content

Instantly share code, notes, and snippets.

@youngsoul
Created September 6, 2017 21:44
Show Gist options
  • Save youngsoul/b748bee79c064dc7ac4620c4c8eb29c9 to your computer and use it in GitHub Desktop.
Save youngsoul/b748bee79c064dc7ac4620c4c8eb29c9 to your computer and use it in GitHub Desktop.
A number of data transformers for sentiment analysis.
import re
from sklearn.base import BaseEstimator, TransformerMixin
class RemoveEllipseTransformer(TransformerMixin, BaseEstimator):
    """Replace ellipses (runs of three to five dots) in a text column with a space.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # A single greedy pass is equivalent to replacing ".....", then "....",
    # then "..." one after another: the longest match is tried first at every
    # position, and the inserted space can never form a new run of dots.
    _ELLIPSIS = re.compile(r"\.{3,5}")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Replace every ellipsis in ``data_series`` with a single space, in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        data_series.replace(RemoveEllipseTransformer._ELLIPSIS, " ", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with ellipses replaced by a space.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
class RemoveNumbersTransformer(TransformerMixin, BaseEstimator):
    """Strip numbers (integers and simple decimals) from a text column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # Optional single leading whitespace character, one or more digits, an
    # optional dot, then any trailing digits. Digits embedded in words match too.
    _NUMBER = re.compile(r"\s?[0-9]+\.?[0-9]*")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Delete every number from ``data_series``, in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        data_series.replace(RemoveNumbersTransformer._NUMBER, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with numbers removed.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
class RemoveSpecialCharactersTransformer(TransformerMixin, BaseEstimator):
    """Strip punctuation and other special characters from a text column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # Every character that gets deleted. Multi-character tokens such as "--"
    # need no entry of their own because "-" is already removed individually.
    _SPECIAL_CHARACTERS = ",:\"=&;%$@^*(){}[]|/\\><-!?.'#"

    # One character class removes everything in a single pass; deleting the
    # characters one at a time in separate passes yields the same text.
    _PATTERN = re.compile("[" + re.escape(_SPECIAL_CHARACTERS) + "]")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Delete every special character from ``data_series``, in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        data_series.replace(RemoveSpecialCharactersTransformer._PATTERN, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with special characters removed.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
class RemoveHtmlEncodedTransformer(TransformerMixin, BaseEstimator):
    """Strip HTML entities and the tokens "w/o" and "w/" from a text column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # Applied one after another, in this order: "w/o" must be handled before
    # "w/", otherwise "w/o" would be reduced to a stray "o".
    _PATTERNS = tuple(
        re.compile(token)
        for token in ("&lt;", "&gt;", "&quot;", "&amp;", "w/o", "w/")
    )

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Delete HTML-encoded characters and "w/o" / "w/" from ``data_series``,
        in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        for pattern in RemoveHtmlEncodedTransformer._PATTERNS:
            data_series.replace(pattern, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with the HTML-encoded tokens removed.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
class RemoveUsernameTransformer(TransformerMixin, BaseEstimator):
    """Strip ``@username`` mentions from a text column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # "@" followed by one or more non-whitespace characters, plus at most one
    # trailing whitespace character so no double space is left behind.
    _MENTION = re.compile(r"@\S+\s?")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Delete every ``@username`` mention from ``data_series``, in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        data_series.replace(RemoveUsernameTransformer._MENTION, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with user names removed.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
class RemoveUrlsTransformer(TransformerMixin, BaseEstimator):
    """Strip ``http://`` and ``https://`` URLs from a text column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a cleaned copy without modifying the data it was given.
    ``BaseEstimator`` supplies ``get_params``/``set_params`` so the step can be
    cloned inside pipelines and model-selection utilities.
    """

    # "s?" makes only the "s" of "https" optional; a bare "." in that position
    # would accept any character after "http". The URL extends to the next
    # whitespace, and at most one trailing whitespace character is consumed.
    _URL = re.compile(r"https?://\S+\s?")

    @staticmethod
    def _preprocess_data(data_series):
        """
        inspired from:
        https://raw.githubusercontent.com/youngsoul/ml-twitter-sentiment-analysis/develop/cleanup.py

        Delete every URL from ``data_series``, in place.

        :param data_series: Series of text; modified in place.
        :return: None
        """
        data_series.replace(RemoveUrlsTransformer._URL, "", inplace=True)

    def fit(self, X, y=None):
        """
        Nothing to learn; present for pipeline compatibility.

        :param X: ignored.
        :param y: ignored.
        :return: this transformer.
        """
        return self

    def transform(self, X):
        """
        :param X: Series, aka column of data. Left unmodified.
        :return: a new Series with URLs removed.
        """
        # Work on a copy so the caller's data is never mutated.
        cleaned = X.copy()
        self._preprocess_data(cleaned)
        return cleaned
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment