eaglewarrior · March 5, 2021 15:32
diff --git a/clean_text.py b/clean_text.py
 import re
 import unicodedata
 from nltk.corpus import stopwords
 import nltk
 def removetitle(text):
    """Strip a leading "title:" prefix from every line of *text*.

    The match is greedy, so on each line everything up to and including the
    last colon is removed; lines without a colon are left untouched.
    """
    title_prefix = re.compile(r'.*:')
    return title_prefix.sub('', text)

 def removebrackets(text):
    """Replace every bracketed span in *text* with a single space.

    A span starts at "(" or "[" and ends at the nearest following ")" or "]"
    on the same line; the opening and closing bracket kinds are not required
    to match.
    """
    # Raw string: a non-raw literal would rely on invalid string escapes
    # (backslash + bracket), which newer Python versions warn about.
    return re.sub(r'[\(\[].*?[\)\]]', ' ', text)

 def remove_accented_chars(text):
    """Return *text* reduced to plain ASCII.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; anything that still cannot be encoded as ASCII is
    then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

 def remove_special_chars(text, remove_digits=False):
    """Delete every character of *text* that is not an ASCII letter or whitespace.

    Digits are kept as well unless *remove_digits* is true.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

 def remove_stopwords(text):
    """Return *text* with English stopwords removed.

    The text is tokenised with ``nltk.word_tokenize``; tokens found in NLTK's
    English stopword list are dropped and the rest are re-joined with single
    spaces. Matching is case-sensitive (no lower-casing is done here).
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stopword_set = set(stopwords.words('english'))
    tokens = (token.strip() for token in nltk.word_tokenize(text))
    return ' '.join(token for token in tokens if token not in stopword_set)

 def lemmatize(text):
    """Return *text* with each token replaced by its lemma, space-joined.

    Tokens whose lemma is the '-PRON-' placeholder keep their original text.
    NOTE(review): `nlp` is not defined in this snippet -- presumably a loaded
    spaCy pipeline; confirm where it comes from.
    """
    lemmas = []
    for word in nlp(text):
        if word.lemma_ == '-PRON-':
            lemmas.append(word.text)
        else:
            lemmas.append(word.lemma_)
    return ' '.join(lemmas)

 # Cleaning pipeline: title -> brackets -> accents -> lower-case ->
 # special characters -> stopwords. `text` must already be defined.
 text_title = removetitle(text)
 text_brackets = removebrackets(text_title)
 text_clean = remove_accented_chars(text_brackets).lower()
 text_clean = remove_special_chars(text_clean)
 text_clean = remove_stopwords(text_clean)
	import re
	import unicodedata
	from nltk.corpus import stopwords
	import nltk
	def removetitle(text):
	    """Strip a leading "title:" prefix from every line of *text*.

	    The match is greedy, so on each line everything up to and including
	    the last colon is removed; lines without a colon are left untouched.
	    """
	    return re.sub(r'.*:', '', text)

	def removebrackets(text):
	    """Replace every bracketed span in *text* with a single space.

	    A span starts at "(" or "[" and ends at the nearest following ")" or
	    "]" on the same line; the opening and closing bracket kinds are not
	    required to match.
	    """
	    # Raw string: a non-raw literal would rely on invalid string escapes
	    # (backslash + bracket), which newer Python versions warn about.
	    return re.sub(r'[\(\[].*?[\)\]]', ' ', text)

	def remove_accented_chars(text):
	    """Return *text* reduced to plain ASCII.

	    The text is NFKD-normalised so accented letters split into a base
	    letter plus combining marks; anything that still cannot be encoded as
	    ASCII is then dropped.
	    """
	    decomposed = unicodedata.normalize('NFKD', text)
	    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

	def remove_special_chars(text, remove_digits=False):
	    """Delete every character of *text* that is not an ASCII letter or whitespace.

	    Digits are kept as well unless *remove_digits* is true.
	    """
	    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
	    return re.sub(pattern, '', text)

	def remove_stopwords(text):
	    """Return *text* with English stopwords removed.

	    The text is tokenised with ``nltk.word_tokenize``; tokens found in
	    NLTK's English stopword list are dropped and the rest are re-joined
	    with single spaces. Matching is case-sensitive (no lower-casing is
	    done here).
	    """
	    # A set gives O(1) membership tests instead of scanning the list per token.
	    stopword_set = set(stopwords.words('english'))
	    tokens = (token.strip() for token in nltk.word_tokenize(text))
	    return ' '.join(token for token in tokens if token not in stopword_set)

	def lemmatize(text):
	    """Return *text* with each token replaced by its lemma, space-joined.

	    Tokens whose lemma is the '-PRON-' placeholder keep their original
	    text.
	    NOTE(review): `nlp` is not defined in this snippet -- presumably a
	    loaded spaCy pipeline; confirm where it comes from.
	    """
	    doc = nlp(text)
	    return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])

	# Cleaning pipeline: title -> brackets -> accents -> lower-case ->
	# special characters -> stopwords. `text` must already be defined.
	text_title = removetitle(text)
	text_brackets = removebrackets(text_title)
	text_clean = remove_accented_chars(text_brackets).lower()
	text_clean = remove_special_chars(text_clean)
	text_clean = remove_stopwords(text_clean)
No results found