Created
September 18, 2022 12:07
-
-
Save Steboss89/ea27a930ceb9388f98479a1b5e998513 to your computer and use it in GitHub Desktop.
Cleaning tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import nltk | |
nltk.download("stopwords") | |
from nltk.corpus import stopwords | |
import string | |
import re | |
# vectorizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
STOPWORDS = stopwords.words("english") | |
# HELPER FUNCTIONS | |
def remove_stopwords(text):
    r"""Remove English stopwords from a tweet.

    The tweet is split on whitespace and every token that appears in the
    module-level ``STOPWORDS`` collection is dropped. Tokens are compared
    in lower case, so capitalised stopwords such as "The" are removed as
    well; tokens that are kept retain their original casing.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, tweet without stopwords, remaining tokens joined by single spaces
    """
    # Build a set once per call so each membership test is O(1) instead of
    # a linear scan over the stopword list.
    stopword_set = set(STOPWORDS)
    # NOTE: assumes the entries of STOPWORDS are lower case (true for NLTK's
    # English list), hence the token is lower-cased before the lookup.
    kept_words = [word for word in text.split() if word.lower() not in stopword_set]
    return ' '.join(kept_words)
def remove_punctuation(text):
    r"""Strip punctuation characters from a tweet.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters (letters, digits, whitespace, non-ASCII symbols) are kept in
    their original order.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, tweet with punctuation characters removed
    """
    punctuation_chars = frozenset(string.punctuation)
    return ''.join(char for char in text if char not in punctuation_chars)
def remove_specific_chars(text):
    r"""Remove @-mentions, single quotes and newlines from a tweet.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, cleared tweet
    """
    # Drop "@" followed by non-whitespace characters, plus one following
    # space if present. The space is optional so that a mention at the very
    # end of the tweet (or right before a newline) is removed as well.
    outline = re.sub(r"@\S+ ?", "", text)
    # remove single quotes
    outline = outline.replace("'", "")
    # remove newline characters (adjacent text is joined without a separator)
    outline = outline.replace("\n", "")
    return outline
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment