# !pip install contractions
import re
import unicodedata

import contractions
import nltk
import numpy as np
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('punkt')

ps = nltk.porter.PorterStemmer()
# Keep 'no', 'but', and 'not' out of the stopword list: they can flip the
# meaning of a sentence, which matters for sentiment analysis.
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')
def strip_html_tags(text):
    """Remove HTML markup, dropping <iframe> and <script> blocks entirely."""
    soup = BeautifulSoup(text, "html.parser")
    for s in soup(['iframe', 'script']):
        s.extract()
    stripped_text = soup.get_text()
    # collapse runs of carriage returns / newlines into a single newline
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text
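# Example (illustrative):
#   strip_html_tags("<p>Great <b>movie</b>!<script>alert(1)</script></p>") -> "Great movie!"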
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII equivalents."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
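# Example (illustrative): remove_accented_chars("Sómě Áccěntěd těxt") -> "Some Accented text"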
def expand_contractions(text):
    """Expand contractions, e.g. "don't" -> "do not"."""
    return contractions.fix(text)
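# Example (illustrative; exact expansions come from the contractions package):
#   expand_contractions("this isn't over, don't stop") -> "this is not over, do not stop"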
def simple_stemming(text, stemmer=ps):
    """Reduce each whitespace-separated token to its Porter stem."""
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text
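# Example (illustrative; Porter stems are not always dictionary words):
#   simple_stemming("the runners were running") -> "the runner were run"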
def remove_special_characters(text, remove_digits=False):
    """Keep letters, whitespace, and (optionally) digits; drop everything else."""
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
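# Example (illustrative; note the leftover whitespace where characters were dropped):
#   remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)
#   -> "Well this was fun What do you think "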
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Drop stopword tokens, comparing case-insensitively unless the text
    has already been lower-cased."""
    if stopwords is None:  # don't clobber an explicitly passed empty list
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
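# Example (illustrative; using the trimmed stop_words list defined above):
#   remove_stopwords("this movie is not a good one", is_lower_case=True, stopwords=stop_words)
#   -> "movie not good one"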
def pre_process_document(document):
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # replace newlines/tabs (often present in really noisy text) with spaces;
    # str.maketrans requires both argument strings to have the same length
    document = document.translate(str.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions
    document = expand_contractions(document)
    # pad special characters with spaces to isolate them, then remove
    # special characters and digits
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)
    # stem the text
    document = simple_stemming(document)
    # remove stopwords (the document is already lower-cased at this point)
    document = remove_stopwords(document, is_lower_case=True, stopwords=stop_words)
    # collapse extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    return document
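# End-to-end example (illustrative; exact stems depend on your NLTK version):
#   pre_process_document("<p>The movie isn't good, but the acting is GREAT!</p>")
#   -> "movi not good but act great"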
# np.vectorize is a convenience wrapper (an element-wise Python loop), not a
# compiled vectorized operation
pre_process_corpus = np.vectorize(pre_process_document)

# Normalize the train and test sets; train_reviews and test_reviews are
# assumed to be defined elsewhere as sequences of raw review strings
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)