Created
October 12, 2022 17:07
-
-
Save yavuzKomecoglu/851c98774c77b64f88f72fad533906d8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Symbols deleted outright from the tweet once the regex clean-up is done.
_CUSTOM_CHARS_TABLE = str.maketrans('', '', '₺=’|‘/…–&“”+%@#')


def preprocess_tweet(tweet):
    """Normalize a raw tweet into a cleaned, space-joined string of tokens.

    Steps, in order:
      1. Lower-case the text.
      2. Drop whitespace-delimited tokens that consist only of digits.
      3. Remove URLs (``www.`` / ``http(s)://``), ``@mentions`` and
         ``#hashtags`` entirely (they are deleted, not replaced by a
         placeholder).
      4. Remove the standalone retweet marker ``rt``.
      5. Replace runs of two or more dots with a space.
      6. Strip spaces and quote characters from both ends.
      7. Pass the text through ``remove_emoji``.
      8. Collapse whitespace runs and delete a fixed set of symbols.
      9. Tokenize with ``word_tokenize`` and run every token through
         ``preprocess_word``.

    ``remove_emoji``, ``preprocess_word`` and ``word_tokenize`` are not
    defined in this block; presumably ``word_tokenize`` is NLTK's tokenizer
    -- confirm against the file's imports.

    Args:
        tweet: The raw tweet text (``str``).

    Returns:
        The processed tokens joined by single spaces.
    """
    tweet = tweet.lower()

    # Lookarounds (instead of consuming the neighbouring whitespace) let
    # adjacent numbers such as "1 2 3" all match, and also catch a tweet that
    # is nothing but a number. Digits glued to other characters ("covid19",
    # "@123", "5.") are left alone.
    tweet = re.sub(r'(?<!\S)\d+(?!\S)', ' ', tweet)

    # URLs, user mentions and hashtags are removed completely.
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '', tweet)
    tweet = re.sub(r'@[\S]+', '', tweet)
    tweet = re.sub(r'#(\S+)', '', tweet)

    # Retweet marker; the text is already lower-cased at this point.
    tweet = re.sub(r'\brt\b', '', tweet)

    # Ellipsis-like runs of dots become a single space.
    tweet = re.sub(r'\.{2,}', ' ', tweet)

    # Strip spaces, double quotes and single quotes from both ends.
    tweet = tweet.strip(' "\'')

    tweet = remove_emoji(tweet)

    # Collapse whitespace runs, then delete the custom symbol set in one pass.
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = tweet.translate(_CUSTOM_CHARS_TABLE)

    return ' '.join(preprocess_word(word) for word in word_tokenize(tweet))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment