Last active
December 9, 2020 02:57
-
-
Save ravikiranj/2639031 to your computer and use it in GitHub Desktop.
preprocess tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import regex
import re
#start process_tweet
def processTweet(tweet):
    """Normalize a raw tweet string for downstream text processing.

    Steps, applied in order:
      1. lower-case the text;
      2. replace every ``www.*`` or ``http(s)://*`` token with ``URL``;
      3. replace every ``@username`` token with ``AT_USER``;
      4. collapse each run of whitespace into a single space;
      5. replace ``#word`` with ``word``;
      6. trim surrounding spaces and quote characters.

    :param tweet: the raw tweet text (one line of input).
    :return: the normalized tweet string.
    """
    #Convert to lower case (placeholders inserted below stay upper-case)
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim: a trailing newline has become a space by now, so spaces must be
    #stripped together with the quotes or a closing quote would be left behind
    tweet = tweet.strip(' \'"')
    return tweet
#end
#Read the tweets one by one and process it
#The context manager closes the file even if processing a line raises.
with open('data/sampleTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = processTweet(line)
        #print() with a single argument behaves the same on Python 2 and 3
        print(processedTweet)
#end loop
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment