This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# add tf-idf columns
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=10: terms found in fewer than 10 reviews are left out of the vocabulary
tfidf = TfidfVectorizer(min_df=10)
tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and only fall back on releases that predate it
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# one "word_<term>" column per vocabulary term, aligned on the reviews' index
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(x) for x in feature_names],
    index=reviews_df.index,
)
reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# each cleaned review is split on single spaces and tagged with its position
documents = [
    TaggedDocument(doc, [i])
    for i, doc in enumerate(reviews_df["review_clean"].apply(lambda x: x.split(" ")))
]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector
# NOTE(review): the transformation step itself is not present in this excerpt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# add number of characters column
reviews_df["nb_chars"] = reviews_df["review"].apply(len)

# add number of words column (count of pieces after splitting on single spaces)
reviews_df["nb_words"] = reviews_df["review"].apply(lambda x: len(x.split(" ")))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# score every review, then expand the per-review score mappings into columns
# in a single DataFrame construction instead of building one Series per row
sentiments_df = pd.DataFrame(
    reviews_df["review"].apply(sid.polarity_scores).tolist(),
    index=reviews_df.index,
)
reviews_df = pd.concat([reviews_df, sentiments_df], axis=1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet


def get_wordnet_pos(pos_tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    The decision is made on the first letter of ``pos_tag``: ``J`` maps to
    adjective, ``V`` to verb, ``N`` to noun and ``R`` to adverb.  Any other
    tag falls back to noun, so callers always receive a usable constant
    rather than ``None``.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('R'):
        return wordnet.ADV
    # nouns and every unrecognised tag
    return wordnet.NOUN
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# remove 'No Negative' or 'No Positive' from text
reviews_df["review"] = reviews_df["review"].apply(
    lambda x: x.replace("No Negative", "").replace("No Positive", "")
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# keep a 10% random sample of the rows, drawn without replacement;
# the fixed random_state makes the draw repeatable
reviews_df = reviews_df.sample(frac=0.1, replace=False, random_state=42)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# read data
reviews_df = pd.read_csv("../input/Hotel_Reviews.csv")

# append the positive and negative text reviews
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]

# create the label: 1 when the reviewer score is below 5, otherwise 0
reviews_df["is_bad_review"] = (reviews_df["Reviewer_Score"] < 5).astype("int64")

# select only relevant columns; take an explicit copy so that columns assigned
# later are written to an independent frame, not to a slice of the full table
reviews_df = reviews_df[["review", "is_bad_review"]].copy()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# one list per scraped attribute
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)
    # NOTE(review): the statements that read fields from `soup` and append to
    # the lists above are not present in this excerpt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# gather the book URLs listed on every page in pages_urls
booksURLs = []
for page in pages_urls:
    booksURLs.extend(getBooksURLs(page))

print(str(len(booksURLs)) + " fetched URLs")
print("Some examples:")
# bare expression: shown as cell output when this runs as the last line of a notebook cell
booksURLs[:5]