Skip to content

Instantly share code, notes, and snippets.

@pranjalAI
Created September 4, 2020 14:16
Show Gist options
  • Save pranjalAI/1caf56e11716cd66a80e1174f648c793 to your computer and use it in GitHub Desktop.
Save pranjalAI/1caf56e11716cd66a80e1174f648c793 to your computer and use it in GitHub Desktop.
import re
def processTweet(chat):
    """Normalise a raw tweet/chat message into plain lowercase text.

    Steps, in order:
      1. lowercase the text;
      2. drop URLs (tokens starting with "www." or "http(s)://");
      3. drop @mentions;
      4. unwrap hashtags ("#topic" becomes "topic");
      5. delete the punctuation characters . ! : ? - ' " backslash and /;
      6. collapse every whitespace run to a single space and trim the ends.

    Args:
        chat: the message text (str).

    Returns:
        The cleaned text, with words separated by exactly one space and no
        leading or trailing whitespace.
    """
    chat = chat.lower()
    # Raw strings throughout: escapes such as "\s" inside a normal string
    # literal are invalid escape sequences and warn on modern Python.
    chat = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', chat)
    chat = re.sub(r'@[^\s]+', '', chat)
    chat = re.sub(r'#([^\s]+)', r'\1', chat)
    chat = re.sub(r'[\.!:\?\-\'\"\\/]', r'', chat)
    # Whitespace is normalised last because the removals above can leave
    # doubled or dangling spaces behind (e.g. "hi - there", "@user hello").
    chat = re.sub(r'[\s]+', ' ', chat)
    return chat.strip()
def replaceTwoOrMore(s):
    """Squeeze long runs of one repeated character down to a single copy.

    A run is only squeezed when the same character appears three or more
    times in a row (one occurrence followed by at least two repeats);
    doubled characters such as the "oo" in "cool" are left untouched.
    Newlines count as characters too, because the match uses DOTALL.

    Args:
        s: the text to squeeze (str).

    Returns:
        The text with every run of 3+ identical characters replaced by one.
    """
    return re.sub(r"(.)\1{2,}", r"\1", s, flags=re.DOTALL)
def getFeatureVector(chat):
    """Reduce a message to its plain alphanumeric words.

    The message is first cleaned with processTweet and split on whitespace.
    Each word then has long character runs squeezed by replaceTwoOrMore and
    surrounding quote/?/comma/period characters stripped.  Only words that
    start with an ASCII letter and contain nothing but ASCII letters and
    digits are kept; everything else is discarded.

    Args:
        chat: the raw message text (str).

    Returns:
        The surviving words, lowercased and joined by single spaces, as one
        string (an empty string when no word survives).
    """
    # A word must begin with a letter and be purely alphanumeric.
    word_shape = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")

    kept = []
    for token in processTweet(chat).split():
        candidate = replaceTwoOrMore(token).strip('\'"?,.')
        if word_shape.search(candidate) is not None:
            kept.append(candidate.lower())
    return " ".join(kept)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment