Skip to content

Instantly share code, notes, and snippets.

@vallantin
Created October 19, 2018 15:00
Show Gist options
  • Save vallantin/7e9752b9a9f7ebd1f27c3776ca727c8e to your computer and use it in GitHub Desktop.
Save vallantin/7e9752b9a9f7ebd1f27c3776ca727c8e to your computer and use it in GitHub Desktop.
# Clean tweets
class process_tweets(BaseEstimator, TransformerMixin):
    """Clean the text held in the ``'tweet'`` column of a DataFrame.

    ``transform`` applies ``preprocess_text`` to every value of
    ``df['tweet']`` and returns the resulting Series. ``fit`` does nothing.
    """

    # Patterns are compiled once at class creation instead of on every tweet.
    _HTML_TAG = re.compile(r'<[^>]*>')
    _URL = re.compile(r'https?://\S+')
    _MENTION = re.compile(r'@[\w_-]+')
    # Everything except ASCII letters, '?', '!' and whitespace.
    _NON_TEXT = re.compile(r'[^a-zA-Z?!\s]')

    # Ordered (pattern, replacement) pairs; the order matters because the
    # generic suffix rules ('ll, 've, 're, 'd, 's) rewrite text that the
    # later rules would otherwise see. Whole-word rules use \b so they also
    # fire at the start/end of the text and next to punctuation.
    _CONTRACTIONS = tuple(
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"i'm", " i am "),
            (r"\bim\b", "i am"),
            (r"\: p", ""),
            (r"\bive\b", "i have"),
            (r"\bhe's\b", "he is"),
            (r"\bshe's\b", "she is"),
            (r"\bthat's\b", "that is"),
            (r"\bwhat's\b", "what is"),
            (r"\bwhere's\b", "where is"),
            (r"\bhaven't\b", "have not"),
            (r"\bur\b", "you are"),
            (r"\'ll", " will"),
            (r"\'ve", " have"),
            (r"\'re", " are"),
            (r"\'d", " would"),
            (r"\bwon't\b", "will not"),
            (r"\bwouldn't\b", "would not"),
            (r"\bcan't\b", "cannot"),
            (r"\bdon't\b", "do not"),
            (r"\bdidn't\b", "did not"),
            (r"\bdoesn't\b", "does not"),
            (r"\bisn't\b", "is not"),
            (r"\bit's\b", "it is"),
            (r"\bwho's\b", "who is"),
            (r"\bthere's\b", "there is"),
            (r"\bweren't\b", "were not"),
            (r"\bokay\b", "ok"),
            (r"\bc'mon\b", "come on"),
            (r"in'", "ing"),
            (r"\'s", " s"),
        )
    )

    def __init__(self):
        pass

    def _language_tools(self):
        """Return ``(lemmatizer, stop_words)``, building them on first use.

        Cached on the instance so the stop-word list is not reloaded for
        every single tweet.
        """
        tools = getattr(self, '_tools', None)
        if tools is None:
            tools = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
            self._tools = tools
        return tools

    def preprocess_text(self, text):
        """Normalise one tweet and return the cleaned string.

        Steps, in order: strip non-ASCII characters, lowercase, remove HTML
        tags, replace URLs with ``link`` and @handles with ``mention``, drop
        ``#``, expand contractions, replace every character other than
        letters, ``?``, ``!`` and whitespace with a space, lemmatize each
        token, and remove English stop words.

        Returns the tokens joined by single spaces, or ``'notext'`` when
        nothing is left (or when ``text`` is not a string, e.g. a missing
        value).
        """
        # Missing values cannot be normalised; report them like empty text.
        if not isinstance(text, str):
            return 'notext'

        # Return the normal form for the Unicode string, encode it as ascii
        # and decode back as utf-8
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # Lowercase
        text = text.lower()
        # Remove HTML
        text = self._HTML_TAG.sub('', text)

        # Replace URLs and twitter handles and drop hashtag symbols. This has
        # to happen before punctuation is stripped: once '@', ':', '/' and
        # '#' are gone these patterns can no longer match.
        text = self._URL.sub('link', text)
        text = self._MENTION.sub('mention', text)
        text = text.replace('#', '')

        # Expand contractions
        for pattern, replacement in self._CONTRACTIONS:
            text = pattern.sub(replacement, text)

        # Remove punctuation and special chars except ! and ?
        text = self._NON_TEXT.sub(' ', text)

        # Lemmatize, then remove stopwords. Splitting on any whitespace keeps
        # newline/tab separated words apart and discards empty tokens.
        lemmatizer, stop_words = self._language_tools()
        lemmas = (lemmatizer.lemmatize(word) for word in text.split())
        text = ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

        # If remaining text is empty, we have to find a way to spot this
        if not text:
            text = 'notext'
        return text

    def transform(self, df, y=None):
        """Return ``df['tweet']`` with ``preprocess_text`` applied to each value."""
        return df['tweet'].apply(self.preprocess_text)

    def fit(self, df, y=None):
        """Nothing to learn; return ``self``."""
        return self
# Dummify categorical variables
class dummies_transformation(BaseEstimator, TransformerMixin):
    """Replace the configured columns with dummy/indicator columns.

    ``columns`` is the iterable of column names to encode. ``transform``
    also drops the ``'tweet'`` column from the result.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, df, y=None):
        """Return a new frame with each configured column dummy-encoded.

        For every name in ``self.columns`` the indicator columns (prefixed
        with the column name, first level dropped) are appended and the
        source column is removed. The ``'tweet'`` column is dropped last.
        """
        encoded = df
        for name in self.columns:
            indicators = pd.get_dummies(encoded[name], prefix=name, drop_first=True)
            encoded = pd.concat([encoded, indicators], axis=1).drop([name], axis=1)
        return encoded.drop(['tweet'], axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment