Created
September 4, 2020 14:16
-
-
Save pranjalAI/1caf56e11716cd66a80e1174f648c793 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
def processTweet(chat):
    """Normalize a raw tweet/chat message into lowercase plain text.

    The cleaning steps, in order:

    1. lowercase the text;
    2. remove URLs (tokens starting with ``www.``, ``http://`` or
       ``https://``);
    3. remove ``@mention`` tokens;
    4. unwrap hashtags (``#tag`` becomes ``tag``);
    5. delete the characters ``. ! : ? - ' "``, backslash and slash;
    6. collapse every run of whitespace to one space and trim both ends.

    Args:
        chat: The message text to clean.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    chat = chat.lower()
    chat = re.sub(r'(www\.\S+)|(https?://\S+)', '', chat)
    chat = re.sub(r'@\S+', '', chat)
    chat = re.sub(r'#(\S+)', r'\1', chat)
    chat = re.sub(r"[.!:?'\"\\/-]", '', chat)
    # Whitespace is normalized last: the removals above can leave doubled,
    # leading or trailing spaces where a URL, mention or lone punctuation
    # mark used to be.
    return re.sub(r'\s+', ' ', chat).strip()
# A character followed by at least two more copies of itself, i.e. a run of
# three or more identical characters. DOTALL lets "." match newlines too.
_REPEATED_CHAR_RE = re.compile(r"(.)\1{2,}", re.DOTALL)


def replaceTwoOrMore(s):
    """Collapse runs of three or more identical characters to one.

    Despite the function name, a character that appears only twice in a row
    is left untouched: ``"cool"`` stays ``"cool"`` while ``"hellooo"``
    becomes ``"hello"``.

    Args:
        s: The string to squeeze.

    Returns:
        ``s`` with every run of 3+ identical characters replaced by a single
        occurrence of that character.
    """
    return _REPEATED_CHAR_RE.sub(r"\1", s)
# A keepable word: starts with an ASCII letter and continues with ASCII
# letters or digits only.
_FEATURE_WORD_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9]*")


def getFeatureVector(chat):
    """Reduce a message to its plain alphanumeric words.

    The message is cleaned with ``processTweet`` and split on whitespace.
    Each word is squeezed with ``replaceTwoOrMore``, stripped of leading and
    trailing ``' " ? , .`` characters, and kept only if it starts with an
    ASCII letter and contains nothing but ASCII letters and digits.

    Args:
        chat: The raw message text.

    Returns:
        The kept words, lowercased and joined by single spaces. Note that
        this is a string, not a list; it is empty when no word qualifies.
    """
    kept = []
    for word in processTweet(chat).split():
        word = replaceTwoOrMore(word).strip('\'"?,.')
        # Drop anything that is not a plain word (numbers, emoji, tokens
        # with leftover punctuation, ...).
        if _FEATURE_WORD_RE.fullmatch(word):
            kept.append(word.lower())
    return " ".join(kept)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment