Skip to content

Instantly share code, notes, and snippets.

@AnasAlmasri
Created February 12, 2019 13:07
Show Gist options
  • Save AnasAlmasri/af0b92428b00708b4cc710370ff3c82e to your computer and use it in GitHub Desktop.
Save AnasAlmasri/af0b92428b00708b4cc710370ff3c82e to your computer and use it in GitHub Desktop.
tweet preprocessor class
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
class PreProcessTweets:
    """Turn raw tweet text into filtered token lists.

    Each tweet is lower-cased, URLs and @mentions are replaced with the
    placeholders ``URL`` and ``AT_USER``, the leading ``#`` is stripped from
    hashtags, the text is tokenized with NLTK's ``word_tokenize`` and every
    token found in the stop set is dropped.

    NOTE: constructing an instance calls ``stopwords.words('english')`` and
    processing calls ``word_tokenize``; both rely on NLTK data being
    available at runtime (see the NLTK documentation).
    """

    # Compiled once at class creation instead of on every tweet. Raw strings
    # avoid the invalid "\." / "\s" string escapes the patterns would
    # otherwise contain (a SyntaxWarning on recent Python versions).
    _URL_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _MENTION_PATTERN = re.compile(r'@[^\s]+')
    _HASHTAG_PATTERN = re.compile(r'#([^\s]+)')

    def __init__(self):
        """Build the stop set: English stopwords, punctuation, placeholders."""
        # The placeholders are upper-case on purpose: they are inserted
        # *after* lower-casing, so they cannot collide with real words, and
        # listing them here filters them out of the final token list
        # (assuming the tokenizer keeps each placeholder as a single token).
        self._stopwords = set(
            stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL']
        )

    def processTweets(self, list_of_tweets):
        """Process a collection of labelled tweets.

        Args:
            list_of_tweets: iterable of mappings, each providing a ``"text"``
                entry (the raw tweet string) and a ``"label"`` entry.

        Returns:
            A list of ``(tokens, label)`` tuples, in input order, where
            ``tokens`` is the list returned by ``_processTweet``.

        Raises:
            KeyError: if an item lacks ``"text"`` or ``"label"``.
        """
        return [
            (self._processTweet(tweet["text"]), tweet["label"])
            for tweet in list_of_tweets
        ]

    @staticmethod
    def _normalizeText(tweet):
        """Return ``tweet`` lower-cased with URLs, mentions and '#' rewritten."""
        tweet = tweet.lower()  # convert text to lower-case
        # Order matters: URLs are replaced first so that an '@' or '#'
        # inside a URL is not rewritten separately.
        tweet = PreProcessTweets._URL_PATTERN.sub('URL', tweet)
        tweet = PreProcessTweets._MENTION_PATTERN.sub('AT_USER', tweet)
        # keep the hashtag word, drop only the leading '#'
        tweet = PreProcessTweets._HASHTAG_PATTERN.sub(r'\1', tweet)
        return tweet

    def _processTweet(self, tweet):
        """Normalize, tokenize and stop-word-filter a single tweet string.

        Args:
            tweet: the raw tweet text.

        Returns:
            A list of tokens that are not in ``self._stopwords``.
        """
        normalized = self._normalizeText(tweet)
        # Split into word tokens. Note: repeated characters
        # ("helloooooooo") are NOT collapsed anywhere in this pipeline.
        tokens = word_tokenize(normalized)
        return [word for word in tokens if word not in self._stopwords]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment