Skip to content

Instantly share code, notes, and snippets.

@bhuiyanmobasshir94
Last active June 3, 2020 18:50
Show Gist options
  • Save bhuiyanmobasshir94/5326a370b5761748d29f72d30da85715 to your computer and use it in GitHub Desktop.
Save bhuiyanmobasshir94/5326a370b5761748d29f72d30da85715 to your computer and use it in GitHub Desktop.
import string
import re
# Ordered (pattern, replacement) rewrites applied by clean_text().
# Order matters: contractions must be expanded before the bare apostrophe is
# removed, and the final rule collapses the padding added by earlier rules.
_CLEANUP_RULES = (
    # Replace every character outside the allowed set with a space.
    # NOTE: "+-=" inside the class is a *range* from "+" to "=", so it also
    # lets ":", ";" and "<" through (the ":" rule below relies on that).
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Expand common English contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop or space-pad the remaining punctuation and operators.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # Digits followed by "k" get "000" in place of the "k" (e.g. "5k" -> "5000").
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join abbreviations that the punctuation rules above split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is the NUL character in a regex, not the digit zero,
    # and NUL has already been replaced by the first rule, so this rule never
    # matches. It was probably meant to be "0s" -> "0"; left unchanged so the
    # output for existing callers does not shift -- confirm intent.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of two or more whitespace characters into one space.
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalize a piece of English text for downstream text processing.

    Steps, in order:

    1. Lower-case the text and split it on whitespace.
    2. Drop tokens that are English stop words or shorter than three
       characters (length is measured with any attached punctuation).
    3. Re-join the tokens with single spaces and apply ``_CLEANUP_RULES``:
       strip disallowed characters, expand contractions, pad or remove
       punctuation, fix a few abbreviations and collapse repeated whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. A single leading or trailing space produced by
        the padding rules is not stripped.

    Note:
        ``stopwords`` must be available in the enclosing module; it is used
        as ``stopwords.words("english")`` (presumably
        ``nltk.corpus.stopwords`` -- confirm against the file's imports).
    """
    # The previous version started with ``text.translate(string.punctuation)``.
    # In Python 3 that uses the punctuation string as a lookup table indexed
    # by code point: ordinary characters were left alone, but control
    # characters 0-31 were rewritten into punctuation (tab -> "*",
    # newline -> "+", carriage return -> "."), gluing words on separate lines
    # into one token. The call is dropped rather than turned into real
    # punctuation stripping because the rules below need the punctuation.

    ## Convert words to lower case and split them
    words = text.lower().split()

    ## Remove stop words
    stops = set(stopwords.words("english"))
    kept = [w for w in words if w not in stops and len(w) >= 3]
    text = " ".join(kept)

    # Clean the text
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)
    return text
# https://medium.com/@sumith.gannarapu/restaurant-recommendation-system-b52911d1ed0b
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment