jonathanoheix’s gists

jonathanoheix / nlp8.py

Created December 18, 2018 09:50

	# add TF-IDF columns: one "word_<term>" column per term kept by the vectorizer
	from sklearn.feature_extraction.text import TfidfVectorizer

	tfidf = TfidfVectorizer(min_df=10)
	tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()
	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
	# prefer get_feature_names_out() and fall back only on older releases.
	if hasattr(tfidf, "get_feature_names_out"):
	    feature_names = tfidf.get_feature_names_out()
	else:
	    feature_names = tfidf.get_feature_names()
	# prefix the term names and reuse the reviews' index so that the
	# column-wise concat below aligns row by row
	tfidf_df = pd.DataFrame(
	    tfidf_result,
	    columns=["word_" + str(term) for term in feature_names],
	    index=reviews_df.index,
	)
	reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)

jonathanoheix / nlp7.py

Created December 18, 2018 09:50

	# create doc2vec vector columns
	from gensim.test.utils import common_texts
	from gensim.models.doc2vec import Doc2Vec, TaggedDocument

	# split every cleaned review on single spaces, then tag each token list
	# with its positional number
	tokenized_reviews = reviews_df["review_clean"].apply(lambda text: text.split(" "))
	documents = [
	    TaggedDocument(tokens, [position])
	    for position, tokens in enumerate(tokenized_reviews)
	]

	# fit a Doc2Vec model on the tagged reviews
	model = Doc2Vec(
	    documents,
	    vector_size=5,
	    window=2,
	    min_count=1,
	    workers=4,
	)

	# transform each document into a vector

jonathanoheix / nlp6.py

Created December 18, 2018 09:50

	# character count of each review
	reviews_df["nb_chars"] = reviews_df["review"].apply(len)

	# word count of each review (pieces obtained by splitting on single spaces)
	reviews_df["nb_words"] = reviews_df["review"].apply(
	    lambda text: len(text.split(" "))
	)

jonathanoheix / nlp5.py

Created December 18, 2018 09:50

	# add sentiment analysis columns
	from nltk.sentiment.vader import SentimentIntensityAnalyzer

	sid = SentimentIntensityAnalyzer()
	# score every review; each result is stored in a temporary "sentiments" column
	reviews_df["sentiments"] = reviews_df["review"].apply(sid.polarity_scores)
	# expand each stored result into its own columns, then swap the temporary
	# column for the expanded ones
	sentiment_columns = reviews_df["sentiments"].apply(pd.Series)
	reviews_df = pd.concat(
	    [reviews_df.drop(["sentiments"], axis=1), sentiment_columns],
	    axis=1,
	)

jonathanoheix / nlp4.py

Created December 18, 2018 09:49

	# return the wordnet object value corresponding to the POS tag
	from nltk.corpus import wordnet

	# Maps a POS tag string to a wordnet constant using the tag's prefix:
	#   'J...' -> wordnet.ADJ, 'V...' -> wordnet.VERB, 'N...' -> wordnet.NOUN
	# NOTE(review): the body's indentation appears to have been lost in this
	# listing, and no branch for any other tag is visible here; the excerpt may
	# be truncated, so confirm against the full gist before relying on it.
	def get_wordnet_pos(pos_tag):
	if pos_tag.startswith('J'):
	return wordnet.ADJ
	elif pos_tag.startswith('V'):
	return wordnet.VERB
	elif pos_tag.startswith('N'):
	return wordnet.NOUN

jonathanoheix / nlp3.py

Created December 18, 2018 09:49

	# drop the literal "No Negative" and "No Positive" strings from each review
	reviews_df["review"] = reviews_df["review"].apply(
	    lambda text: text.replace("No Negative", "").replace("No Positive", "")
	)

jonathanoheix / nlp2.py

Created December 18, 2018 09:48

# keep a 10% random sample of the rows, drawn without replacement;
# the fixed random_state makes the draw repeatable
reviews_df = reviews_df.sample(
    frac=0.1,
    replace=False,
    random_state=42,
)

jonathanoheix / nlp1.py

Created December 18, 2018 09:48

	import pandas as pd

	# Load the CSV, derive the two working columns, and keep only those:
	#   review        - negative text followed directly by positive text
	#   is_bad_review - 1 when Reviewer_Score is below 5, otherwise 0
	reviews_df = pd.read_csv("../input/Hotel_Reviews.csv").assign(
	    review=lambda df: df["Negative_Review"] + df["Positive_Review"],
	    is_bad_review=lambda df: df["Reviewer_Score"].apply(
	        lambda score: int(score < 5)
	    ),
	)[["review", "is_bad_review"]]

jonathanoheix / scraping14.py

Created December 11, 2018 14:57

	# Empty lists created before the scraping loop below.
	# NOTE(review): presumably one value per book gets appended to each list;
	# the code that fills them is not visible in this excerpt.
	names = []
	prices = []
	nb_in_stock = []
	img_urls = []
	categories = []
	ratings = []

	# scrape data for every book URL: this may take some time
	for url in booksURLs:
	# NOTE(review): the loop body's indentation appears lost in this listing and
	# the body seems cut off after this line; confirm against the full gist.
	soup = getAndParseURL(url)

jonathanoheix / scraping13.py

Created December 11, 2018 14:56

	# gather the book URLs returned for every entry of pages_urls into one flat list
	booksURLs = []
	for page_url in pages_urls:
	    booksURLs += getBooksURLs(page_url)

	# report how many URLs were collected, then show the first five
	print("{} fetched URLs".format(len(booksURLs)))
	print("Some examples:")
	booksURLs[:5]