Last active
November 5, 2020 17:31
-
-
Save Sanket758/8d194ff550e499a02f7dc9a71dafd647 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# !pip install contractions | |
import nltk | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
import contractions | |
from bs4 import BeautifulSoup | |
import unicodedata | |
import re | |
import nltk | |
import numpy as np | |
# Shared NLP resources for the helpers below: a Porter stemmer and the NLTK
# English stopword list. The negation-bearing words 'no', 'but' and 'not'
# are kept out of the stopword list because they carry sentiment signal.
ps = nltk.porter.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')
for negation_word in ('no', 'but', 'not'):
    stop_words.remove(negation_word)
def strip_html_tags(text):
    """Remove HTML markup from *text*, discarding <iframe>/<script> content.

    Runs of line breaks in the extracted text are collapsed to a single
    newline. Returns the plain-text string.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop script/iframe elements wholesale -- their inner text is noise.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # BUG FIX: the original pattern r'[\r|\n|\r\n]+' placed literal '|'
    # characters inside the character class, so pipes in the text were
    # silently replaced with newlines. Only CR/LF belong in the class.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII equivalents.

    Decomposes the string (NFKD) so diacritics become separate combining
    marks, then drops everything that cannot be encoded as ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
def expand_contractions(text):
    """Expand English contractions (e.g. "don't" -> "do not").

    Thin wrapper over the `contractions` package's fix() function.
    """
    expanded_text = contractions.fix(text)
    return expanded_text
def simple_stemming(text, stemmer=ps):
    """Stem each whitespace-separated token of *text* with *stemmer*.

    Defaults to the module-level Porter stemmer; tokens are re-joined
    with single spaces.
    """
    stemmed = (stemmer.stem(token) for token in text.split())
    return ' '.join(stemmed)
def remove_special_characters(text, remove_digits=False):
    """Strip characters that are not letters, whitespace or (optionally) digits.

    With remove_digits=True, digits are removed as well as punctuation.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Drop stopword tokens from *text* and return the remaining tokens joined by spaces.

    When *is_lower_case* is True tokens are compared to the stopword list
    as-is; otherwise each token is lowercased for the comparison only.
    Falls back to NLTK's English stopwords when *stopwords* is falsy.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = [tok.strip() for tok in nltk.word_tokenize(text)]
    if is_lower_case:
        kept = [tok for tok in tokens if tok not in stopwords]
    else:
        kept = [tok for tok in tokens if tok.lower() not in stopwords]
    return ' '.join(kept)
def pre_process_document(document):
    """Normalize one raw review document into clean, stemmed, stopword-free text.

    Pipeline: strip HTML -> lowercase -> flatten newlines/tabs -> remove
    accents -> expand contractions -> drop special characters and digits ->
    stem -> remove stopwords -> collapse whitespace. Returns the cleaned
    string.
    """
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # remove extra newlines/tabs (often present in really noisy text)
    # BUG FIX: str.maketrans with two string arguments requires them to be
    # of equal length; the original maketrans("\n\t\r", " ") raised
    # ValueError at runtime. Map each of the three characters to a space.
    document = document.translate(document.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions
    document = expand_contractions(document)
    # remove special characters and/or digits
    # insert spaces around special characters to isolate them first
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)
    # stemming text
    document = simple_stemming(document)
    # remove stopwords (text is already lowercased above)
    document = remove_stopwords(document, is_lower_case=True, stopwords=stop_words)
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    return document
# Lift the single-document cleaner to operate element-wise over arrays
# of reviews. NOTE: np.vectorize is a convenience wrapper (a Python-level
# loop), not a true vectorized speedup.
pre_process_corpus = np.vectorize(pre_process_document)

# Normalize the train and test review splits.
# NOTE(review): train_reviews / test_reviews are expected to be defined
# earlier in the notebook this gist came from.
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment