Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Last active April 22, 2022 19:58
Show Gist options
  • Save Eligijus112/2e93b2f51e97742146d1f66587061f13 to your computer and use it in GitHub Desktop.
Save Eligijus112/2e93b2f51e97742146d1f66587061f13 to your computer and use it in GitHub Desktop.
Text preprocessing function
import re
def clean_text(
        string: str,
        punctuations: str = r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
        stop_words=('the', 'a', 'and', 'is', 'be', 'will')) -> str:
    """
    Clean a piece of raw text for downstream processing.

    The steps run in this order:

    1. HTML tags (``<...>``, including tags spanning several lines) are removed.
    2. URLs starting with ``http://``, ``https://`` or ``www.`` are removed.
    3. Every character listed in ``punctuations`` is deleted.
    4. The text is lower-cased.
    5. Tokens found in ``stop_words`` are dropped and the remaining tokens
       are joined with single spaces.

    Args:
        string: The text to clean.
        punctuations: Characters to delete from the text. ``None`` or an
            empty string disables punctuation removal.
        stop_words: Lower-case words to drop. ``None`` or an empty
            collection disables stop-word removal.

    Returns:
        The cleaned, lower-cased text with no leading/trailing whitespace
        and exactly one space between tokens.
    """
    # Tags go first: a URL inside an attribute (href="http://...") would
    # otherwise make the URL pattern swallow the rest of the tag and the
    # text that follows it. `[^>]` also matches newlines, so tags that span
    # several lines are removed as well.
    string = re.sub(r'<[^>]*>', '', string)

    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Removing the punctuations in a single pass instead of one
    # str.replace call per punctuation character found.
    strip_table = str.maketrans('', '', ''.join(punctuations or ''))
    string = string.translate(strip_table)

    # Converting the text to lower
    string = string.lower()

    # Removing stop words; a frozenset gives constant-time membership tests.
    excluded = frozenset(stop_words or ())

    # split() with no arguments already collapses whitespace runs and trims
    # the ends, so no extra whitespace clean-up is needed after the join.
    return ' '.join(word for word in string.split() if word not in excluded)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment