Abhayparashar31 · November 16, 2021 05:17
diff --git a/Sentiment Analyzer Model Building.py b/Sentiment Analyzer Model Building.py
 ############ Importing Required Libraries #############
 import pandas as pd
 import numpy as np 
 import nltk
 import matplotlib.pyplot as plt
 import re
 import nltk
 nltk.download('stopwords')
 from nltk.corpus import stopwords
 from nltk.stem.porter import PorterStemmer
 from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import GaussianNB,MultinomialNB
 from sklearn.metrics import confusion_matrix,accuracy_score

 # Report progress, then read the tab-separated review dataset directly from
 # its raw GitHub URL into a DataFrame.
 print("All Libraries Imported.........")
 print("Loading Data........")
 dataset = pd.read_csv(
     "https://raw.githubusercontent.com/Abhayparashar31/sentiment-analsis-on-hotel-review/main/Restaurant_Reviews.tsv",
     sep="\t",
     quoting=3,  # 3 is csv.QUOTE_NONE: quote characters in reviews are kept as plain text
 )

 print("Cleaning Data........")
 #### Cleaning The Data
 # Each review is stripped to lowercase alphabetic tokens, stop words are
 # dropped, the surviving tokens are stemmed, and the result is re-joined into
 # one space-separated string that is appended to ``corpus``.

 # The stemmer and the stop-word collection do not change between reviews, so
 # they are built once here instead of on every loop iteration.
 pe = PorterStemmer()
 all_stopword = stopwords.words('english')
 all_stopword.remove('not')  # keep "not" as a token instead of filtering it out
 stopword_set = set(all_stopword)  # built once; the loop only does membership tests

 corpus = []
 # Walk every row instead of a hard-coded range(0, 1000) so that corpus always
 # has exactly one entry per value in dataset["Review"].
 for raw_review in dataset["Review"]:
     review = re.sub('[^a-zA-Z]', " ", raw_review)  # every non-letter becomes a space
     review = review.lower().split()
     # Drop stop words, stem whatever is left.
     review = [pe.stem(word) for word in review if word not in stopword_set]
     review = " ".join(review)
     corpus.append(review)

 #### Vectorization
 # Turn the cleaned corpus into a bag-of-words count matrix with at most
 # 1500 columns, and take the "Liked" column as the target.
 cv = CountVectorizer(max_features=1500)
 X = cv.fit_transform(corpus).toarray()  # convert the fit_transform result to an array
 y = dataset["Liked"]

 print("Splitting Data....")
 ### Data Splitting
 # Hold out 20% of the rows for testing; random_state is pinned to 0.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

 #### Preparing Model
 print("Preparing Model.......")
 # Fit two Naive Bayes variants on the training split. Only the multinomial
 # model (cls) is scored on the test split here.
 classifier = GaussianNB()
 classifier.fit(X_train, y_train)
 cls = MultinomialNB()
 cls.fit(X_train, y_train)
 print(cls.score(X_test, y_test))



 ### Generating Predictions
 print("----------------------------------------------")
 print("------Predictions------------")
 y_pred = cls.predict(X_test)

 ### Generating Confusion Matrix
 # Print the confusion matrix followed by the accuracy as a percentage.
 print("\n\n------- Confusion Matrix --------------")
 cm = confusion_matrix(y_test, y_pred)
 score = accuracy_score(y_test, y_pred)
 accuracy_percent = score * 100
 print(cm, accuracy_percent)

 ### Saving Models For Future Use
 print("Dumnping Models and CV..........")
 # Context managers guarantee each file is flushed and closed as soon as the
 # dump finishes, rather than whenever the anonymous file object happens to
 # be garbage-collected.
 with open('cv.pkl', 'wb') as cv_file:  ## fitted CountVectorizer
     pickle.dump(cv, cv_file)
 with open("review.pkl", "wb") as model_file:  ## fitted MultinomialNB model
     pickle.dump(cls, model_file)


 print("Model Created............")

 ### Loading Model
 print("Loading Dumped Model............")
 # NOTE: unpickling can execute arbitrary code, so only load pickle files that
 # come from a trusted source.
 # The context manager closes the file handle as soon as loading finishes.
 with open("review.pkl", "rb") as model_file:
     loaded_model = pickle.load(model_file)
 loaded_model.predict(X_test)  # result is discarded; exercises the unpickled model

 ## Accuracy Score of The Loaded Model
 print("---------- Accuracy of the Loaded Model On Test Data------------")
 print(loaded_model.score(X_test, y_test))
	############ Importing Required Libraries #############
	import pandas as pd
	import numpy as np
	import nltk
	import matplotlib.pyplot as plt
	import re
	import nltk
	nltk.download('stopwords')
	from nltk.corpus import stopwords
	from nltk.stem.porter import PorterStemmer
	from sklearn.feature_extraction.text import CountVectorizer
	import pickle
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import GaussianNB,MultinomialNB
	from sklearn.metrics import confusion_matrix,accuracy_score

	# Report progress, then read the tab-separated review dataset directly from
	# its raw GitHub URL into a DataFrame.
	print("All Libraries Imported.........")
	print("Loading Data........")
	dataset = pd.read_csv(
		"https://raw.githubusercontent.com/Abhayparashar31/sentiment-analsis-on-hotel-review/main/Restaurant_Reviews.tsv",
		sep="\t",
		quoting=3,  # 3 is csv.QUOTE_NONE: quote characters in reviews are kept as plain text
	)

	print("Cleaning Data........")
	#### Cleaning The Data
	# Each review is stripped to lowercase alphabetic tokens, stop words are
	# dropped, the surviving tokens are stemmed, and the result is re-joined into
	# one space-separated string that is appended to ``corpus``.

	# The stemmer and the stop-word collection do not change between reviews, so
	# they are built once here instead of on every loop iteration.
	pe = PorterStemmer()
	all_stopword = stopwords.words('english')
	all_stopword.remove('not')  # keep "not" as a token instead of filtering it out
	stopword_set = set(all_stopword)  # built once; the loop only does membership tests

	corpus = []
	# Walk every row instead of a hard-coded range(0, 1000) so that corpus always
	# has exactly one entry per value in dataset["Review"].
	for raw_review in dataset["Review"]:
		review = re.sub('[^a-zA-Z]', " ", raw_review)  # every non-letter becomes a space
		review = review.lower().split()
		# Drop stop words, stem whatever is left.
		review = [pe.stem(word) for word in review if word not in stopword_set]
		review = " ".join(review)
		corpus.append(review)

	#### Vectorization
	# Turn the cleaned corpus into a bag-of-words count matrix with at most
	# 1500 columns, and take the "Liked" column as the target.
	cv = CountVectorizer(max_features=1500)
	X = cv.fit_transform(corpus).toarray()  # convert the fit_transform result to an array
	y = dataset["Liked"]

	print("Splitting Data....")
	### Data Splitting
	# Hold out 20% of the rows for testing; random_state is pinned to 0.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

	#### Preparing Model
	print("Preparing Model.......")
	# Fit two Naive Bayes variants on the training split. Only the multinomial
	# model (cls) is scored on the test split here.
	classifier = GaussianNB()
	classifier.fit(X_train, y_train)
	cls = MultinomialNB()
	cls.fit(X_train, y_train)
	print(cls.score(X_test, y_test))



	### Generating Predictions
	print("----------------------------------------------")
	print("------Predictions------------")
	y_pred = cls.predict(X_test)

	### Generating Confusion Matrix
	# Print the confusion matrix followed by the accuracy as a percentage.
	print("\n\n------- Confusion Matrix --------------")
	cm = confusion_matrix(y_test, y_pred)
	score = accuracy_score(y_test, y_pred)
	accuracy_percent = score * 100
	print(cm, accuracy_percent)

	### Saving Models For Future Use
	print("Dumnping Models and CV..........")
	# Context managers guarantee each file is flushed and closed as soon as the
	# dump finishes, rather than whenever the anonymous file object happens to
	# be garbage-collected.
	with open('cv.pkl', 'wb') as cv_file:  ## fitted CountVectorizer
		pickle.dump(cv, cv_file)
	with open("review.pkl", "wb") as model_file:  ## fitted MultinomialNB model
		pickle.dump(cls, model_file)


	print("Model Created............")

	### Loading Model
	print("Loading Dumped Model............")
	# NOTE: unpickling can execute arbitrary code, so only load pickle files that
	# come from a trusted source.
	# The context manager closes the file handle as soon as loading finishes.
	with open("review.pkl", "rb") as model_file:
		loaded_model = pickle.load(model_file)
	loaded_model.predict(X_test)  # result is discarded; exercises the unpickled model

	## Accuracy Score of The Loaded Model
	print("---------- Accuracy of the Loaded Model On Test Data------------")
	print(loaded_model.score(X_test, y_test))