Common Text Processing Steps in NLP
# standard pre-processing steps for text processing
# 1. lower case
# 2. remove punctuation
# 3. remove stop words
# 4. remove numbers
# 5. remove short words
# 6. lemmatize
# 7. stem
# 8. remove non-ascii characters
# 9. remove extra spaces
# 10. remove extra newlines
# 11. remove extra tabs
# 12. remove extra carriage returns
# 13. remove extra line feeds (the same character as a newline)
# 14. remove extra form feeds
# 15. remove extra vertical tabs
# 16. remove extra non-breaking spaces
# 17. remove extra soft hyphens
# 18. remove extra en dashes
import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
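
# The NLTK helpers below rely on these data packages; download once
# (cached after the first run). Newer NLTK releases may additionally
# need 'punkt_tab' for word_tokenize.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)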
# 1. lower case
def lower_case(text):
    return text.lower()
# 2. remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
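
# Illustrative: remove_punctuation("Hello, world!") -> "Hello world"
# Note: string.punctuation covers ASCII punctuation only; Unicode marks
# such as curly quotes pass through.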
# 3. remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if w not in stop_words]
    return ' '.join(filtered_text)
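
# Illustrative: remove_stop_words("this is a test") -> "test"
# NLTK's stop-word list is lower-cased, so apply lower_case() first;
# "This" would otherwise survive.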
# 4. remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)
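
# Illustrative: remove_numbers("area 51") -> "area " -- the leftover
# whitespace is cleaned up later by step 9.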
# 5. remove short words
def remove_short_words(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if len(w) > 2]
    return ' '.join(filtered_text)
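
# Illustrative: remove_short_words("an NLP toolkit") -> "NLP toolkit"
# Caution: this also drops meaningful short tokens such as "AI" or "go".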
# 6. lemmatize
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(w) for w in word_tokens]
    return ' '.join(lemmatized_text)
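
# Illustrative: lemmatize("dogs are running") -> "dog are running"
# WordNetLemmatizer defaults to pos='n', so verbs like "running" are not
# reduced unless a part-of-speech tag is passed.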
# 7. stem
def stem(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(w) for w in word_tokens]
    return ' '.join(stemmed_text)
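
# Illustrative: stem("caring cats") -> "care cat"
# Porter stems are not always dictionary words ("studies" -> "studi");
# in practice you pick either lemmatize (step 6) or stem, not both.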
# 8. remove non-ascii characters
def remove_non_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
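
# Illustrative: remove_non_ascii("café déjà vu") -> "cafe deja vu"
# NFKD splits accented characters into base letter + combining mark, and
# encoding to ASCII with errors='ignore' drops the marks.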
# 9. remove extra spaces
def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)
# 10. remove extra newlines
def remove_extra_newlines(text):
    return re.sub(r'\n+', '', text)
# 11. remove extra tabs
def remove_extra_tabs(text):
    return re.sub(r'\t+', '', text)
# 12. remove extra carriage returns
def remove_extra_carriage_returns(text):
    return re.sub(r'\r+', '', text)
# 13. remove extra line feeds (a line feed is the \n character, so this
# duplicates step 10; \f is a form feed, handled in step 14)
def remove_extra_line_feeds(text):
    return re.sub(r'\n+', '', text)
# 14. remove extra form feeds
def remove_extra_form_feeds(text):
    return re.sub(r'\f+', '', text)
# 15. remove extra vertical tabs
def remove_extra_vertical_tabs(text):
    return re.sub(r'\v+', '', text)
# 16. remove extra non-breaking spaces
def remove_extra_non_breaking_spaces(text):
    return re.sub('\xa0+', '', text)
# 17. remove extra soft hyphens (U+00AD)
def remove_extra_soft_hyphens(text):
    return re.sub('\xad+', '', text)
# 18. remove extra en dashes (U+2013)
def remove_extra_en_dashes(text):
    return re.sub('\u2013+', '', text)
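
# ---------------------------------------------------------------------
# Usage sketch (not from the original gist): one reasonable way to chain
# a subset of the steps. The order is an assumption -- lower-casing must
# precede stop-word removal (NLTK's list is lower-cased), and since
# lemmatize and stem are alternatives, only lemmatize is used here. The
# `preprocess` helper and the sample sentence are illustrative.
# ---------------------------------------------------------------------
def preprocess(text):
    steps = [
        lower_case,
        remove_non_ascii,
        remove_punctuation,
        remove_numbers,
        remove_stop_words,
        remove_short_words,
        lemmatize,
        remove_extra_spaces,
    ]
    for step in steps:
        text = step(text)
    return text.strip()

if __name__ == '__main__':
    # expected roughly: "quick brown fox jumped lazy dog"
    print(preprocess("The 2 QUICK brown foxes   jumped over 13 lazy dogs!"))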