Common Text Processing Steps in NLP
# standard pre-processing steps for text processing
# 1. lower case
# 2. remove punctuation
# 3. remove stop words
# 4. remove numbers
# 5. remove short words
# 6. lemmatize
# 7. stem
# 8. remove non-ascii characters
# 9. remove extra spaces
# 10. remove extra newlines
# 11. remove extra tabs
# 12. remove extra carriage returns
# 13. remove extra line feeds (the same character as a newline)
# 14. remove extra form feeds
# 15. remove extra vertical tabs
# 16. remove extra non-breaking spaces
# 17. remove extra soft hyphens
# 18. remove extra en dashes
import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
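
# The NLTK helpers below rely on these data packages; download once
# (cached after the first run). Newer NLTK releases may additionally
# need 'punkt_tab' for word_tokenize.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)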
# 1. lower case
def lower_case(text):
    return text.lower()
# 2. remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
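
# Illustrative: remove_punctuation("Hello, world!") -> "Hello world"
# Note: string.punctuation covers ASCII punctuation only; Unicode marks
# such as curly quotes pass through.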
# 3. remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if w not in stop_words]
    return ' '.join(filtered_text)
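
# Illustrative: remove_stop_words("this is a test") -> "test"
# NLTK's stop-word list is lower-cased, so apply lower_case() first;
# "This" would otherwise survive.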
# 4. remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)
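
# Illustrative: remove_numbers("area 51") -> "area " -- the leftover
# whitespace is cleaned up later by step 9.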
# 5. remove short words
def remove_short_words(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if len(w) > 2]
    return ' '.join(filtered_text)
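
# Illustrative: remove_short_words("an NLP toolkit") -> "NLP toolkit"
# Caution: this also drops meaningful short tokens such as "AI" or "go".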
# 6. lemmatize
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(w) for w in word_tokens]
    return ' '.join(lemmatized_text)
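
# Illustrative: lemmatize("dogs are running") -> "dog are running"
# WordNetLemmatizer defaults to pos='n', so verbs like "running" are not
# reduced unless a part-of-speech tag is passed.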
# 7. stem
def stem(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(w) for w in word_tokens]
    return ' '.join(stemmed_text)
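
# Illustrative: stem("caring cats") -> "care cat"
# Porter stems are not always dictionary words ("studies" -> "studi");
# in practice you pick either lemmatize (step 6) or stem, not both.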
# 8. remove non-ascii characters
def remove_non_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
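
# Illustrative: remove_non_ascii("café déjà vu") -> "cafe deja vu"
# NFKD splits accented characters into base letter + combining mark, and
# encoding to ASCII with errors='ignore' drops the marks.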
# 9. remove extra spaces
def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)
# 10. remove extra newlines
def remove_extra_newlines(text):
    return re.sub(r'\n+', '', text)
# 11. remove extra tabs
def remove_extra_tabs(text):
    return re.sub(r'\t+', '', text)
# 12. remove extra carriage returns
def remove_extra_carriage_returns(text):
    return re.sub(r'\r+', '', text)
# 13. remove extra line feeds (a line feed is the \n character, so this
# duplicates step 10; \f is a form feed, handled in step 14)
def remove_extra_line_feeds(text):
    return re.sub(r'\n+', '', text)
# 14. remove extra form feeds
def remove_extra_form_feeds(text):
    return re.sub(r'\f+', '', text)
# 15. remove extra vertical tabs
def remove_extra_vertical_tabs(text):
    return re.sub(r'\v+', '', text)
# 16. remove extra non-breaking spaces
def remove_extra_non_breaking_spaces(text):
    return re.sub('\xa0+', '', text)
# 17. remove extra soft hyphens (U+00AD)
def remove_extra_soft_hyphens(text):
    return re.sub('\xad+', '', text)
# 18. remove extra en dashes (U+2013)
def remove_extra_en_dashes(text):
    return re.sub('\u2013+', '', text)
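
# ---------------------------------------------------------------------
# Usage sketch (not from the original gist): one reasonable way to chain
# a subset of the steps. The order is an assumption -- lower-casing must
# precede stop-word removal (NLTK's list is lower-cased), and since
# lemmatize and stem are alternatives, only lemmatize is used here. The
# `preprocess` helper and the sample sentence are illustrative.
# ---------------------------------------------------------------------
def preprocess(text):
    steps = [
        lower_case,
        remove_non_ascii,
        remove_punctuation,
        remove_numbers,
        remove_stop_words,
        remove_short_words,
        lemmatize,
        remove_extra_spaces,
    ]
    for step in steps:
        text = step(text)
    return text.strip()

if __name__ == '__main__':
    # expected roughly: "quick brown fox jumped lazy dog"
    print(preprocess("The 2 QUICK brown foxes   jumped over 13 lazy dogs!"))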