accessnash · July 5, 2020 07:55
diff --git a/yelp-review.py b/yelp-review.py
 # -*- coding: utf-8 -*-
 """
 Created on Sat Jul  4 10:08:00 2020

 @author: User
 """


 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np

 # Load the Yelp reviews. The rest of the script reads the 'text' and 'stars'
 # columns of this frame.
 yelp_df = pd.read_csv('yelp.csv')
 yelp_df.describe()  # bare expression: only displayed in an interactive console

 # Character count of every review.
 yelp_df['length'] = yelp_df['text'].apply(len)

 # Open a fresh figure first: Series.plot and seaborn's axes-level functions
 # draw on the current axes, so without this the plots overlay each other
 # when the file is executed as a plain script.
 plt.figure()
 yelp_df['length'].plot(bins = 100, kind = 'hist')
 yelp_df['length'].describe()

 # Longest review. Located with idxmax() rather than a hard-coded length
 # (previously 4997), which raised IndexError on any dataset whose maximum
 # length differs.
 yelp_df.loc[yelp_df['length'].idxmax(), 'text']

 plt.figure()
 sns.countplot(y = 'stars', data = yelp_df)

 # One histogram of review length per star rating (FacetGrid creates its own
 # figure).
 g = sns.FacetGrid(data = yelp_df, col = 'stars', col_wrap = 5)
 g.map(plt.hist, 'length', bins = 20, color = 'r')
 # Keep only the extreme ratings so the task becomes binary classification
 # (1 star vs. 5 stars). concat() keeps the original row labels.
 yelp_df_1 = yelp_df[yelp_df['stars'] == 1]
 yelp_df_5 = yelp_df[yelp_df['stars'] == 5]
 yelp_df_1_5 = pd.concat([yelp_df_1, yelp_df_5])
 yelp_df_1_5.info()

 # The label says "percentage", so scale the fraction to percent.
 print('1-star Review percentage = ', 100 * len(yelp_df_1) / len(yelp_df_1_5), '%')

 # Name the column through x=/data= (positional data vectors are not accepted
 # by current seaborn) and draw on a new figure instead of the last facet of
 # the FacetGrid created earlier.
 plt.figure()
 sns.countplot(x = 'stars', data = yelp_df_1_5, label = 'Count')

 import string
 string.punctuation

 import nltk
 from nltk.corpus import stopwords
 # Fetch the stop-word corpus (nltk skips the download when it is up to date).
 nltk.download('stopwords')
 # Bind the English stop words as a set: membership tests in message_cleaning
 # become O(1) instead of a linear scan per token. The corpus reader is reached
 # through nltk.corpus so this line can be re-run even though the name
 # `stopwords` is rebound from the reader to the word collection here.
 stopwords = set(nltk.corpus.stopwords.words('english'))

 def message_cleaning(message):
     """Tokenise a review: strip punctuation, then drop stop words.

     Every character found in ``string.punctuation`` is removed, the result
     is split on whitespace, and tokens whose lower-cased form appears in the
     module-level ``stopwords`` collection are discarded. Surviving tokens
     keep their original casing.

     Returns the list of remaining tokens, in order.
     """
     without_punctuation = ''.join(
         ch for ch in message if ch not in string.punctuation
     )
     tokens = []
     for token in without_punctuation.split():
         if token.lower() in stopwords:
             continue
         tokens.append(token)
     return tokens

 # Tokenise every review in the 1-/5-star subset.
 yelp_df_clean = yelp_df_1_5['text'].apply(message_cleaning)

 # Compare the first review before and after cleaning. Use positional .iloc:
 # the subset keeps the row labels of the full frame, so the label lookup
 # [0] raises KeyError unless the first CSV row happens to be a 1- or 5-star
 # review.
 print(yelp_df_1_5['text'].iloc[0]) # original review
 print(yelp_df_clean.iloc[0]) # cleaned up review

 from sklearn.feature_extraction.text import CountVectorizer
 # Bag-of-words counts, using message_cleaning as the analyzer so punctuation
 # and stop words never enter the vocabulary.
 vectorizer = CountVectorizer(analyzer = message_cleaning)
 yelp_countvectorizer = vectorizer.fit_transform(yelp_df_1_5['text'])

 # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
 # prefer get_feature_names_out() and fall back only on older releases.
 if hasattr(vectorizer, 'get_feature_names_out'):
     print(vectorizer.get_feature_names_out())
 else:
     print(vectorizer.get_feature_names())

 # NOTE(review): toarray() materialises the full dense document-term matrix,
 # which can be very large; kept because the original prints it.
 print(yelp_countvectorizer.toarray())
 yelp_countvectorizer.shape

 from sklearn.naive_bayes import MultinomialNB

 # Star ratings of the 1-/5-star subset are the classification target.
 label = yelp_df_1_5['stars'].values

 # Fit a multinomial Naive Bayes model on the whole subset; fit() returns the
 # estimator itself, so construction and fitting are chained.
 nb_classifier = MultinomialNB().fit(yelp_countvectorizer, label)

 # Smoke-test the fitted model on two hand-written reviews.
 testing_sample1 = ['amazing food! highly recommended']
 testing_sample2 = ['shitty food and horrible service']

 # Encode both samples with the already-fitted vectorizer.
 testing_sample_countvectorizer1, testing_sample_countvectorizer2 = [
     vectorizer.transform(sample)
     for sample in (testing_sample1, testing_sample2)
 ]

 # Predicted star rating for each sample.
 test_predict1, test_predict2 = [
     nb_classifier.predict(counts)
     for counts in (testing_sample_countvectorizer1,
                    testing_sample_countvectorizer2)
 ]

 # Hold-out evaluation: 80 % of the reviews for training, 20 % for testing.
 X = yelp_countvectorizer
 y = label
 from sklearn.model_selection import train_test_split
 # random_state makes the split, and every metric below, reproducible.
 # stratify keeps the 1-star / 5-star ratio identical in both partitions.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size = 0.2, random_state = 42, stratify = y)
 from sklearn.naive_bayes import MultinomialNB
 nb_classifier = MultinomialNB()
 nb_classifier.fit(X_train, y_train)
 from sklearn.metrics import classification_report, confusion_matrix

 # Confusion matrix on the training data. Each heatmap gets its own figure
 # (heatmap draws on the current axes) and integer annotations (fmt='d')
 # instead of scientific notation.
 y_predict_train = nb_classifier.predict(X_train)
 cm = confusion_matrix(y_train, y_predict_train)
 plt.figure()
 sns.heatmap(cm, annot = True, fmt = 'd')

 # Confusion matrix on the held-out test data.
 y_predict_test = nb_classifier.predict(X_test)
 cm2 = confusion_matrix(y_test, y_predict_test)
 plt.figure()
 sns.heatmap(cm2, annot = True, fmt = 'd')

 print(classification_report(y_test, y_predict_test))
	# -*- coding: utf-8 -*-
	"""
	Created on Sat Jul 4 10:08:00 2020

	@author: User
	"""


	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import numpy as np

	# Load the Yelp reviews. The rest of the script reads the 'text' and 'stars'
	# columns of this frame.
	yelp_df = pd.read_csv('yelp.csv')
	yelp_df.describe()  # bare expression: only displayed in an interactive console

	# Character count of every review.
	yelp_df['length'] = yelp_df['text'].apply(len)

	# Open a fresh figure first: Series.plot and seaborn's axes-level functions
	# draw on the current axes, so without this the plots overlay each other
	# when the file is executed as a plain script.
	plt.figure()
	yelp_df['length'].plot(bins = 100, kind = 'hist')
	yelp_df['length'].describe()

	# Longest review. Located with idxmax() rather than a hard-coded length
	# (previously 4997), which raised IndexError on any dataset whose maximum
	# length differs.
	yelp_df.loc[yelp_df['length'].idxmax(), 'text']

	plt.figure()
	sns.countplot(y = 'stars', data = yelp_df)

	# One histogram of review length per star rating (FacetGrid creates its own
	# figure).
	g = sns.FacetGrid(data = yelp_df, col = 'stars', col_wrap = 5)
	g.map(plt.hist, 'length', bins = 20, color = 'r')
	# Keep only the extreme ratings so the task becomes binary classification
	# (1 star vs. 5 stars). concat() keeps the original row labels.
	yelp_df_1 = yelp_df[yelp_df['stars'] == 1]
	yelp_df_5 = yelp_df[yelp_df['stars'] == 5]
	yelp_df_1_5 = pd.concat([yelp_df_1, yelp_df_5])
	yelp_df_1_5.info()

	# The label says "percentage", so scale the fraction to percent.
	print('1-star Review percentage = ', 100 * len(yelp_df_1) / len(yelp_df_1_5), '%')

	# Name the column through x=/data= (positional data vectors are not accepted
	# by current seaborn) and draw on a new figure instead of the last facet of
	# the FacetGrid created earlier.
	plt.figure()
	sns.countplot(x = 'stars', data = yelp_df_1_5, label = 'Count')

	import string
	string.punctuation

	import nltk
	from nltk.corpus import stopwords
	# Fetch the stop-word corpus (nltk skips the download when it is up to date).
	nltk.download('stopwords')
	# Bind the English stop words as a set: membership tests in message_cleaning
	# become O(1) instead of a linear scan per token. The corpus reader is reached
	# through nltk.corpus so this line can be re-run even though the name
	# `stopwords` is rebound from the reader to the word collection here.
	stopwords = set(nltk.corpus.stopwords.words('english'))

	def message_cleaning(message):
	    """Tokenise a review: strip punctuation, then drop stop words.

	    Every character found in ``string.punctuation`` is removed, the result
	    is split on whitespace, and tokens whose lower-cased form appears in
	    the module-level ``stopwords`` collection are discarded. Surviving
	    tokens keep their original casing.

	    Returns the list of remaining tokens, in order.
	    """
	    # The body was previously at the same indentation as the ``def`` line,
	    # which is a syntax error; it is re-indented here.
	    without_punctuation = ''.join(
	        ch for ch in message if ch not in string.punctuation
	    )
	    return [
	        token
	        for token in without_punctuation.split()
	        if token.lower() not in stopwords
	    ]

	# Tokenise every review in the 1-/5-star subset.
	yelp_df_clean = yelp_df_1_5['text'].apply(message_cleaning)

	# Compare the first review before and after cleaning. Use positional .iloc:
	# the subset keeps the row labels of the full frame, so the label lookup
	# [0] raises KeyError unless the first CSV row happens to be a 1- or 5-star
	# review.
	print(yelp_df_1_5['text'].iloc[0]) # original review
	print(yelp_df_clean.iloc[0]) # cleaned up review

	from sklearn.feature_extraction.text import CountVectorizer
	# Bag-of-words counts, using message_cleaning as the analyzer so punctuation
	# and stop words never enter the vocabulary.
	vectorizer = CountVectorizer(analyzer = message_cleaning)
	yelp_countvectorizer = vectorizer.fit_transform(yelp_df_1_5['text'])

	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
	# prefer get_feature_names_out() and fall back only on older releases.
	if hasattr(vectorizer, 'get_feature_names_out'):
	    print(vectorizer.get_feature_names_out())
	else:
	    print(vectorizer.get_feature_names())

	# NOTE(review): toarray() materialises the full dense document-term matrix,
	# which can be very large; kept because the original prints it.
	print(yelp_countvectorizer.toarray())
	yelp_countvectorizer.shape

	from sklearn.naive_bayes import MultinomialNB

	# Star ratings of the 1-/5-star subset are the classification target.
	label = yelp_df_1_5['stars'].values

	# Fit a multinomial Naive Bayes model on the whole subset; fit() returns the
	# estimator itself, so construction and fitting are chained.
	nb_classifier = MultinomialNB().fit(yelp_countvectorizer, label)

	# Smoke-test the fitted model on two hand-written reviews.
	testing_sample1 = ['amazing food! highly recommended']
	testing_sample2 = ['shitty food and horrible service']

	# Encode both samples with the already-fitted vectorizer.
	testing_sample_countvectorizer1, testing_sample_countvectorizer2 = [
	    vectorizer.transform(sample)
	    for sample in (testing_sample1, testing_sample2)
	]

	# Predicted star rating for each sample.
	test_predict1, test_predict2 = [
	    nb_classifier.predict(counts)
	    for counts in (testing_sample_countvectorizer1,
	                   testing_sample_countvectorizer2)
	]

	# Hold-out evaluation: 80 % of the reviews for training, 20 % for testing.
	X = yelp_countvectorizer
	y = label
	from sklearn.model_selection import train_test_split
	# random_state makes the split, and every metric below, reproducible.
	# stratify keeps the 1-star / 5-star ratio identical in both partitions.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size = 0.2, random_state = 42, stratify = y)
	from sklearn.naive_bayes import MultinomialNB
	nb_classifier = MultinomialNB()
	nb_classifier.fit(X_train, y_train)
	from sklearn.metrics import classification_report, confusion_matrix

	# Confusion matrix on the training data. Each heatmap gets its own figure
	# (heatmap draws on the current axes) and integer annotations (fmt='d')
	# instead of scientific notation.
	y_predict_train = nb_classifier.predict(X_train)
	cm = confusion_matrix(y_train, y_predict_train)
	plt.figure()
	sns.heatmap(cm, annot = True, fmt = 'd')

	# Confusion matrix on the held-out test data.
	y_predict_test = nb_classifier.predict(X_test)
	cm2 = confusion_matrix(y_test, y_predict_test)
	plt.figure()
	sns.heatmap(cm2, annot = True, fmt = 'd')

	print(classification_report(y_test, y_predict_test))
No results found