Last active
November 5, 2020 17:31
-
-
Save Sanket758/8d194ff550e499a02f7dc9a71dafd647 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# !pip install contractions | |
import nltk | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
import contractions | |
from bs4 import BeautifulSoup | |
import unicodedata | |
import re | |
import nltk | |
import numpy as np | |
# Shared NLP resources for the helpers below: a Porter stemmer and the NLTK
# English stopword list. The negation-bearing words 'no', 'but' and 'not'
# are kept out of the stopword list because they carry sentiment signal.
ps = nltk.porter.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')
for negation_word in ('no', 'but', 'not'):
    stop_words.remove(negation_word)
def strip_html_tags(text):
    """Remove HTML markup from *text*, discarding <iframe>/<script> content.

    Runs of line breaks in the extracted text are collapsed to a single
    newline. Returns the plain-text string.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop script/iframe elements wholesale -- their inner text is noise.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # BUG FIX: the original pattern r'[\r|\n|\r\n]+' placed literal '|'
    # characters inside the character class, so pipes in the text were
    # silently replaced with newlines. Only CR/LF belong in the class.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII equivalents.

    Decomposes the string (NFKD) so diacritics become separate combining
    marks, then drops everything that cannot be encoded as ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
def expand_contractions(text):
    """Expand English contractions (e.g. "don't" -> "do not").

    Thin wrapper over the `contractions` package's fix() function.
    """
    expanded_text = contractions.fix(text)
    return expanded_text
def simple_stemming(text, stemmer=ps):
    """Stem each whitespace-separated token of *text* with *stemmer*.

    Defaults to the module-level Porter stemmer; tokens are re-joined
    with single spaces.
    """
    stemmed = (stemmer.stem(token) for token in text.split())
    return ' '.join(stemmed)
def remove_special_characters(text, remove_digits=False):
    """Strip characters that are not letters, whitespace or (optionally) digits.

    With remove_digits=True, digits are removed as well as punctuation.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Drop stopword tokens from *text* and return the remaining tokens joined by spaces.

    When *is_lower_case* is True tokens are compared to the stopword list
    as-is; otherwise each token is lowercased for the comparison only.
    Falls back to NLTK's English stopwords when *stopwords* is falsy.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = [tok.strip() for tok in nltk.word_tokenize(text)]
    if is_lower_case:
        kept = [tok for tok in tokens if tok not in stopwords]
    else:
        kept = [tok for tok in tokens if tok.lower() not in stopwords]
    return ' '.join(kept)
def pre_process_document(document):
    """Normalize one raw review document into clean, stemmed, stopword-free text.

    Pipeline: strip HTML -> lowercase -> flatten newlines/tabs -> remove
    accents -> expand contractions -> drop special characters and digits ->
    stem -> remove stopwords -> collapse whitespace. Returns the cleaned
    string.
    """
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # remove extra newlines/tabs (often present in really noisy text)
    # BUG FIX: str.maketrans with two string arguments requires them to be
    # of equal length; the original maketrans("\n\t\r", " ") raised
    # ValueError at runtime. Map each of the three characters to a space.
    document = document.translate(document.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions
    document = expand_contractions(document)
    # remove special characters and/or digits
    # insert spaces around special characters to isolate them first
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)
    # stemming text
    document = simple_stemming(document)
    # remove stopwords (text is already lowercased above)
    document = remove_stopwords(document, is_lower_case=True, stopwords=stop_words)
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    return document
# Lift the single-document cleaner to operate element-wise over arrays
# of reviews. NOTE: np.vectorize is a convenience wrapper (a Python-level
# loop), not a true vectorized speedup.
pre_process_corpus = np.vectorize(pre_process_document)

# Normalize the train and test review splits.
# NOTE(review): train_reviews / test_reviews are expected to be defined
# earlier in the notebook this gist came from.
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment