Created
July 5, 2020 07:55
-
-
Save accessnash/2a8e5396cac06b7302e85afad5df271e to your computer and use it in GitHub Desktop.
NLP methods to analyse Yelp reviews data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Jul 4 10:08:00 2020 | |
| @author: User | |
| """ | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# --- Load the reviews and explore review length and star ratings ---
# Reads 'yelp.csv' from the working directory; the code below uses its
# 'text' and 'stars' columns.
yelp_df = pd.read_csv('yelp.csv')
yelp_df.describe()  # bare expression: only shown when run interactively

# Number of characters in each review.
yelp_df['length'] = yelp_df['text'].apply(len)

# Each axes-level plot gets its own figure; otherwise, when the file is run
# top to bottom, every plot is drawn onto whatever axes happen to be current
# and the charts paint over one another.
plt.figure()
yelp_df['length'].plot(bins=100, kind='hist')
yelp_df['length'].describe()

# Text of the longest review. The length is computed rather than hard-coded
# (the original used the literal 4997 -- presumably the maximum for the
# author's copy of the data; confirm), so this also works on other datasets.
longest_length = yelp_df['length'].max()
yelp_df[yelp_df['length'] == longest_length]['text'].iloc[0]

# Number of reviews per star rating.
plt.figure()
sns.countplot(y='stars', data=yelp_df)

# One review-length histogram per star rating (FacetGrid builds its own figure).
g = sns.FacetGrid(data=yelp_df, col='stars', col_wrap=5)
g.map(plt.hist, 'length', bins=20, color='r')

# Keep only the 1-star and 5-star reviews for the two-class problem.
yelp_df_1 = yelp_df[yelp_df['stars'] == 1]
yelp_df_5 = yelp_df[yelp_df['stars'] == 5]
yelp_df_1_5 = pd.concat([yelp_df_1, yelp_df_5])
yelp_df_1_5.info()

# Scale the ratio by 100 so the printed number matches the "percentage" label.
print('1-star Review percentage = ', (len(yelp_df_1) / len(yelp_df_1_5)) * 100, '%')

# Pass the column by keyword: newer seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
plt.figure()
sns.countplot(x='stars', data=yelp_df_1_5, label='Count')
import string
string.punctuation  # bare expression: only shown when run interactively
import nltk
from nltk.corpus import stopwords
# Download the stop-word corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# Stored as a set so the per-word `in stopwords` test in message_cleaning is a
# hash lookup instead of a scan over the whole word list.
# NOTE: this rebinds `stopwords` from the nltk corpus reader to the collection
# of English stop words; the reader is no longer reachable under this name.
stopwords = set(stopwords.words('english'))
def message_cleaning(message):
    """Strip punctuation and stop words from a piece of text.

    Every character that appears in ``string.punctuation`` is dropped, the
    remaining text is split on whitespace, and each word whose lower-cased
    form is found in the module-level ``stopwords`` collection is discarded.

    Returns the surviving words as a list, in order and with their original
    casing.
    """
    without_punctuation = ''.join(
        ch for ch in message if ch not in string.punctuation
    )
    kept_words = []
    for token in without_punctuation.split():
        if token.lower() in stopwords:
            continue
        kept_words.append(token)
    return kept_words
# --- Turn the 1-star / 5-star reviews into a bag-of-words count matrix ---
# Cleaned token list for every review (used here only for the sanity print).
yelp_df_clean = yelp_df_1_5['text'].apply(message_cleaning)

# Positional lookup: yelp_df_1_5 keeps the row labels of yelp_df, so label 0
# exists only if the very first row of the CSV is a 1- or 5-star review.
# .iloc[0] always shows the first row of the filtered data instead.
print(yelp_df_1_5['text'].iloc[0])  # original review
print(yelp_df_clean.iloc[0])  # cleaned up review

from sklearn.feature_extraction.text import CountVectorizer
# message_cleaning is used as the analyzer, so it fully replaces the
# vectorizer's own preprocessing and tokenisation.
vectorizer = CountVectorizer(analyzer=message_cleaning)
yelp_countvectorizer = vectorizer.fit_transform(yelp_df_1_5['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out())
else:
    print(vectorizer.get_feature_names())
print(yelp_countvectorizer.toarray())
yelp_countvectorizer.shape  # bare expression: only shown when run interactively

# Star ratings used as the class labels for the classifier.
label = yelp_df_1_5['stars'].values
from sklearn.naive_bayes import MultinomialNB

# First pass: fit a multinomial Naive Bayes model on the full count matrix.
nb_classifier = MultinomialNB()
nb_classifier.fit(yelp_countvectorizer, label)

# testing prediction ability using some sample reviews
# Sample 1: vectorise with the already-fitted vocabulary, then predict.
testing_sample1 = ['amazing food! highly recommended']
testing_sample_countvectorizer1 = vectorizer.transform(testing_sample1)
test_predict1 = nb_classifier.predict(testing_sample_countvectorizer1)

# Sample 2: same steps for a second review.
testing_sample2 = ['shitty food and horrible service']
testing_sample_countvectorizer2 = vectorizer.transform(testing_sample2)
test_predict2 = nb_classifier.predict(testing_sample_countvectorizer2)
# --- Hold-out evaluation: refit on 80% of the reviews, score on the other 20% ---
X = yelp_countvectorizer
y = label

from sklearn.model_selection import train_test_split
# NOTE: no random_state is given, so the split (and every number below)
# changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.naive_bayes import MultinomialNB
# Rebinds nb_classifier: the model fitted on the full data is replaced by one
# fitted on the training portion only.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix on the training data. It gets its own figure so the test
# heatmap below is not drawn on top of it, and fmt='d' prints the integer
# counts in full instead of the default abbreviated '.2g' format.
y_predict_train = nb_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d')

# Confusion matrix and per-class metrics on the held-out data.
y_predict_test = nb_classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_predict_test)
plt.figure()
sns.heatmap(cm2, annot=True, fmt='d')
print(classification_report(y_test, y_predict_test))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment