Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created September 18, 2022 12:07
Show Gist options
  • Save Steboss89/ea27a930ceb9388f98479a1b5e998513 to your computer and use it in GitHub Desktop.
Save Steboss89/ea27a930ceb9388f98479a1b5e998513 to your computer and use it in GitHub Desktop.
Cleaning tweets
import pandas as pd
import nltk
# Runs at import time: requests the NLTK "stopwords" resource so that the
# stopwords.words("english") lookup below has data to read.
nltk.download("stopwords")
from nltk.corpus import stopwords
import string
import re
# vectorizer (not referenced by the helper functions visible in this section;
# presumably used by code elsewhere -- TODO confirm)
from sklearn.feature_extraction.text import CountVectorizer
# English stopword collection consulted by remove_stopwords for membership tests.
STOPWORDS = stopwords.words("english")
# HELPER FUNCTIONS
def remove_stopwords(text):
    r"""Drop every stopword from a tweet.

    The tweet is split on whitespace and each token is compared exactly as
    written (the comparison is case-sensitive) against the module-level
    ``STOPWORDS`` collection. Tokens found there are discarded.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, the surviving tokens joined by single spaces
    """
    kept_tokens = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
def remove_punctuation(text):
    r"""Strip punctuation characters from a tweet.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters (letters, digits, whitespace, non-ASCII symbols) are kept
    in their original order.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, the tweet without punctuation characters
    """
    # Mapping an ordinal to None tells str.translate to delete that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text.translate(drop_table)
    return cleaned
def remove_specific_chars(text):
    r"""Remove @-mentions, single quotes and newline characters from a tweet.

    Parameters
    ----------
    text: str, input tweet

    Return
    ------
    str, cleared tweet

    Notes
    -----
    A mention is ``@`` followed by one or more non-whitespace characters.
    One directly following space, if present, is removed together with the
    mention. Mentions at the very end of the tweet or right before a newline
    are removed as well.
    Newlines are deleted, not replaced by a space, so words separated only by
    a line break end up joined.
    """
    # Raw string avoids the invalid "\S" escape warning; the optional space
    # lets the pattern match a mention that is not followed by a space.
    outline = re.sub(r"@\S+ ?", "", text)
    # remove single quote
    outline = outline.replace("'", "")
    # remove new line
    outline = outline.replace("\n", "")
    return outline
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment