Skip to content

Instantly share code, notes, and snippets.

@accessnash
Created July 5, 2020 07:55
Show Gist options
  • Select an option

  • Save accessnash/2a8e5396cac06b7302e85afad5df271e to your computer and use it in GitHub Desktop.

Select an option

Save accessnash/2a8e5396cac06b7302e85afad5df271e to your computer and use it in GitHub Desktop.
NLP methods to analyse Yelp reviews data
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 4 10:08:00 2020
@author: User
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import string

import nltk
from nltk.corpus import stopwords

# --- Load the reviews and explore their length -------------------------------
yelp_df = pd.read_csv('yelp.csv')
yelp_df.describe()

# Character count of every review text.
yelp_df['length'] = yelp_df['text'].apply(len)

# Each axes-level plot gets its own figure; otherwise, when run as a script,
# successive plots are drawn on top of each other on the current axes.
plt.figure()
yelp_df['length'].plot(bins=100, kind='hist')
yelp_df['length'].describe()

# Longest review in the data set. Looked up with idxmax() instead of the
# hard-coded length 4997, which only matched one particular yelp.csv and
# raised IndexError on any other data.
yelp_df.loc[yelp_df['length'].idxmax(), 'text']

plt.figure()
sns.countplot(y='stars', data=yelp_df)

# One review-length histogram per star rating (FacetGrid makes its own figure).
g = sns.FacetGrid(data=yelp_df, col='stars', col_wrap=5)
g.map(plt.hist, 'length', bins=20, color='r')

# --- Keep only the 1-star and 5-star reviews ---------------------------------
yelp_df_1 = yelp_df[yelp_df['stars'] == 1]
yelp_df_5 = yelp_df[yelp_df['stars'] == 5]
yelp_df_1_5 = pd.concat([yelp_df_1, yelp_df_5])
yelp_df_1_5.info()

# Share of 1-star reviews among the kept rows, formatted as an actual
# percentage (the raw fraction was previously printed under this label).
one_star_share = len(yelp_df_1) / len(yelp_df_1_5)
print('1-star Review percentage = ', f'{one_star_share:.2%}')

# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn releases.
plt.figure()
sns.countplot(x='stars', data=yelp_df_1_5, label='Count')

# --- Text-cleaning resources -------------------------------------------------
string.punctuation
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader to the
# plain list of English stop words; message_cleaning() reads it by this name.
stopwords = stopwords.words('english')
def message_cleaning(message, stop_words=None):
    """Strip punctuation from *message* and drop stop words.

    Args:
        message: Text to tokenise.
        stop_words: Optional iterable of lower-case words to discard.
            When omitted, the module-level ``stopwords`` collection is used.

    Returns:
        The remaining whitespace-separated tokens as a list, in their
        original order and with their original casing.
    """
    if stop_words is None:
        stop_words = stopwords
    # Hash-based membership: a plain list of stop words would otherwise be
    # scanned linearly once for every token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    # Remove every character listed in string.punctuation in a single pass.
    punctuation_free = message.translate(
        str.maketrans('', '', string.punctuation)
    )

    # Tokens are compared lower-cased but returned unchanged.
    return [
        word for word in punctuation_free.split()
        if word.lower() not in stop_words
    ]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# --- Clean the review texts --------------------------------------------------
yelp_df_clean = yelp_df_1_5['text'].apply(message_cleaning)
# Positional access: the filtered frame keeps the original row labels, so
# label 0 exists only if the first CSV row is a 1- or 5-star review.
print(yelp_df_1_5['text'].iloc[0])  # original review
print(yelp_df_clean.iloc[0])  # cleaned up review

# --- Bag-of-words features ---------------------------------------------------
# message_cleaning doubles as the analyzer, so the vectorizer does the same
# punctuation / stop-word removal when it tokenises each document.
vectorizer = CountVectorizer(analyzer=message_cleaning)
yelp_countvectorizer = vectorizer.fit_transform(yelp_df_1_5['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out())
else:
    print(vectorizer.get_feature_names())
# NOTE: toarray() materialises the whole sparse matrix as a dense array.
print(yelp_countvectorizer.toarray())
yelp_countvectorizer.shape

label = yelp_df_1_5['stars'].values

# --- Fit on all rows and try two hand-written reviews ------------------------
nb_classifier = MultinomialNB()
nb_classifier.fit(yelp_countvectorizer, label)
# testing prediction ability using some sample reviews
testing_sample1 = ['amazing food! highly recommended']
testing_sample2 = ['shitty food and horrible service']
testing_sample_countvectorizer1 = vectorizer.transform(testing_sample1)
testing_sample_countvectorizer2 = vectorizer.transform(testing_sample2)
test_predict1 = nb_classifier.predict(testing_sample_countvectorizer1)
test_predict2 = nb_classifier.predict(testing_sample_countvectorizer2)

# --- Hold-out evaluation -----------------------------------------------------
# NOTE(review): the vocabulary above was fitted on every row, including the
# rows that end up in the test split below.
X = yelp_countvectorizer
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Confusion matrix on the training split. Each heatmap gets its own figure so
# the second one is not painted over the first, and fmt='d' shows the counts
# as integers rather than in scientific notation.
y_predict_train = nb_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d')

# Confusion matrix and per-class metrics on the held-out split.
y_predict_test = nb_classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_predict_test)
plt.figure()
sns.heatmap(cm2, annot=True, fmt='d')
print(classification_report(y_test, y_predict_test))

# Display the figures when the file is executed as a plain script.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment