Created
November 16, 2021 05:17
-
-
Save Abhayparashar31/3a194bbd5e87a8dd78251876a5f10e90 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ############ Importing Required Libraries ############# | |
| import pandas as pd | |
| import numpy as np | |
| import nltk | |
| import matplotlib.pyplot as plt | |
| import re | |
| import nltk | |
| nltk.download('stopwords') | |
| from nltk.corpus import stopwords | |
| from nltk.stem.porter import PorterStemmer | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| import pickle | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.naive_bayes import GaussianNB,MultinomialNB | |
| from sklearn.metrics import confusion_matrix,accuracy_score | |
# Progress banners: the imports above have finished and the dataset fetch is next.
print("All Libraries Imported.........")
print("Loading Data........")

######## Loading Hotel Review Dataset Using Raw Github Link
# The file is tab-separated. quoting=3 is the numeric value of csv.QUOTE_NONE,
# so quote characters inside a review are kept as literal text.
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/Abhayparashar31/sentiment-analsis-on-hotel-review/main/Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)
print("Cleaning Data........")

#### Cleaning The Data
# Build the stemmer, the stop-word collection and the regex once, instead of
# once per review (and, for the set, once per word).
pe = PorterStemmer()
all_stopword = stopwords.words('english')
# Take "not" out of the stop list so it survives filtering
# (presumably because negation matters for sentiment).
if 'not' in all_stopword:
    all_stopword.remove('not')
stopword_set = set(all_stopword)  # O(1) membership tests inside the loop
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over every row rather than a hard-coded 1000, so the corpus always
# has the same number of entries as dataset["Liked"].
for raw_review in dataset["Review"]:
    # Replace anything that is not an ASCII letter with a space, then
    # lowercase and tokenize on whitespace.
    words = non_letters.sub(" ", raw_review).lower().split()
    # Drop stop words, stem what remains, and rebuild a single string.
    review = " ".join(pe.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)
#### Vectorization
# Bag-of-words counts over the cleaned corpus; max_features caps the
# vocabulary (and therefore the number of columns) at 1500.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()  # sparse result converted to a dense array
y = dataset["Liked"]

print("Splitting Data....")
### Data Splitting
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# The bare `X_train.shape , y_test.shape` expression that used to follow was a
# notebook leftover with no effect in a script, so it has been removed.
#### Preparing Model
print("Preparing Model.......")

# NOTE(review): `classifier` is fitted here but nothing later in this script
# reads it; only `cls` is evaluated and saved. Confirm whether it is still wanted.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Multinomial Naive Bayes is the model used for every step that follows.
cls = MultinomialNB()
cls.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
test_accuracy = cls.score(X_test, y_test)
print(test_accuracy)
### Generating Predictions
print("----------------------------------------------")
print("------Predictions------------")
# Predicted labels for the held-out rows, reused by both metrics below.
y_pred = cls.predict(X_test)

### Generating Confusion Matrix
print("\n\n------- Confusion Matrix --------------")
cm = confusion_matrix(y_test, y_pred)
score = accuracy_score(y_test, y_pred)
# Print the matrix followed by the accuracy expressed as a percentage.
accuracy_percent = score * 100
print(cm, accuracy_percent)
### Saving Models For Future Use
print("Dumnping Models and CV..........")
# Use context managers so each file is flushed and closed as soon as it is
# written, instead of leaving the handle open until garbage collection.
with open('cv.pkl', 'wb') as cv_file:  ## Count vectorizer
    pickle.dump(cv, cv_file)
with open("review.pkl", "wb") as model_file:  ## Model
    pickle.dump(cls, model_file)
print("Model Created............")
### Loading Model
print("Loading Dumped Model............")
# Read the model back through a context manager so the handle is closed promptly.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that this script (or another trusted source) produced.
with open("review.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

## Accuracy Score of The Loaded Model
# score() runs the predictions itself, so the separate predict() call whose
# result was discarded has been dropped.
print("---------- Accuracy of the Loaded Model On Test Data------------")
print(loaded_model.score(X_test, y_test))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment