Common Text Processing Steps in NLP
# standard pre-processing steps for text processing
# (a usage sketch chaining these steps into a single preprocess() helper
# is at the bottom of the file)
# 1. lower case
# 2. remove punctuation
# 3. remove stop words
# 4. remove numbers
# 5. remove short words
# 6. lemmatize
# 7. stem
# 8. remove non-ascii characters
# 9. remove extra spaces
# 10. remove extra newlines
# 11. remove extra tabs
# 12. remove extra carriage returns
# 13. remove extra line feeds
# 14. remove extra form feeds
# 15. remove extra vertical tabs
# 16. remove extra non-breaking spaces
# 17. remove soft hyphens
# 18. remove en dashes
import re
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
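
# The tokenizer, stop word list, and WordNet data are not bundled with the
# nltk package itself; assuming a fresh environment, download them once
# (these calls are no-ops if the resources are already present).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)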
# 1. lower case
def lower_case(text):
    return text.lower()

# 2. remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# 3. remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if w not in stop_words]
    return ' '.join(filtered_text)
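# NLTK's stop word list is lower case, so apply lower_case() before this
# step, e.g. remove_stop_words("this is a test") -> "test"
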
# 4. remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# 5. remove short words
def remove_short_words(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if len(w) > 2]
    return ' '.join(filtered_text)

# 6. lemmatize
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(w) for w in word_tokens]
    return ' '.join(lemmatized_text)
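# lemmatize() defaults to treating every token as a noun (pos='n'), so verbs
# are left alone, e.g. lemmatize("cats running") -> "cat running"; pass POS
# tags to lemmatizer.lemmatize() for better coverage.
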
# 7. stem
def stem(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(w) for w in word_tokens]
    return ' '.join(stemmed_text)
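# Porter stemming is a crude suffix-chopping heuristic and can produce
# non-words, e.g. stem("studies running") -> "studi run"; use either
# lemmatize or stem, not both.
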
# 8. remove non-ascii characters
def remove_non_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
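# NFKD splits accented letters into base letter + combining mark, so the
# ascii encode keeps the base letter,
# e.g. remove_non_ascii("café naïve") -> "cafe naive"
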
# 9. remove extra spaces
def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

# 10. remove extra newlines
# (note: this and the helpers below delete the characters outright rather
# than collapsing them to a single space)
def remove_extra_newlines(text):
    return re.sub('\n+', '', text)

# 11. remove extra tabs
def remove_extra_tabs(text):
    return re.sub('\t+', '', text)

# 12. remove extra carriage returns
def remove_extra_carriage_returns(text):
    return re.sub('\r+', '', text)

# 13. remove extra line feeds
# (a line feed is the same character as a newline, \n, so this mirrors step 10)
def remove_extra_line_feeds(text):
    return re.sub('\n+', '', text)

# 14. remove extra form feeds
def remove_extra_form_feeds(text):
    return re.sub('\f+', '', text)

# 15. remove extra vertical tabs
def remove_extra_vertical_tabs(text):
    return re.sub('\v+', '', text)

# 16. remove extra non-breaking spaces
def remove_extra_non_breaking_spaces(text):
    return re.sub('\xa0+', '', text)

# 17. remove soft hyphens (U+00AD)
def remove_soft_hyphens(text):
    return re.sub('\xad+', '', text)

# 18. remove en dashes (U+2013)
def remove_en_dashes(text):
    return re.sub('\u2013+', '', text)
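
# A minimal sketch of chaining the steps above into a single helper; the
# ordering and the choice of lemmatize over stem are assumptions -- adjust
# per task. Whitespace and special-character stripping runs first so that
# remove_extra_spaces can clean up whatever is left.
def preprocess(text):
    for step in (remove_extra_newlines, remove_extra_tabs,
                 remove_extra_carriage_returns, remove_extra_form_feeds,
                 remove_extra_vertical_tabs, remove_extra_non_breaking_spaces,
                 remove_soft_hyphens, remove_en_dashes,
                 lower_case, remove_punctuation, remove_numbers,
                 remove_stop_words, remove_short_words, lemmatize,
                 remove_non_ascii, remove_extra_spaces):
        text = step(text)
    return text.strip()

# e.g. preprocess("The 2 Cats were running fast!\n\n") -> "cat running fast"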