Created
September 6, 2017 21:44
-
-
Save youngsoul/ce013eea67aa8d4726cc9e57059f385c to your computer and use it in GitHub Desktop.
Initial WIP: sentiment analysis of a Kaggle tweet-emotion data set
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import time | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.svm import LinearSVC | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.model_selection import StratifiedKFold | |
import pandas as pd | |
from sklearn import metrics | |
from sklearn.linear_model import LogisticRegression | |
from stemming.porter2 import stem | |
from sklearn.pipeline import Pipeline | |
from .transformers import RemoveEllipseTransformer, RemoveHtmlEncodedTransformer, RemoveNumbersTransformer, RemoveSpecialCharactersTransformer, RemoveUsernameTransformer, RemoveUrlsTransformer | |
# Sentiment class labels; list position doubles as the numeric class id
# (must stay aligned with the sentiment->number map in get_data_frame).
sentiments = ['worry', 'neutral', 'sadness', 'happiness', 'love', 'surprise', 'hate', 'fun', 'relief', 'empty',
              'enthusiasm', 'boredom', 'anger']
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer that Porter-stems every token produced by the base analyzer."""

    def build_analyzer(self):
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the standard analyzer first, then stem each resulting token.
            return [stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
def get_data_frame():
    """Load the training CSV and add a numeric `sentiment_num` column.

    Returns the DataFrame with one extra column mapping each sentiment
    label to an integer class id (worry=0, neutral=1, ..., anger=12).
    """
    frame = pd.read_csv('./train_data.csv')
    # Label order fixes the class ids; enumerate builds the label -> id map.
    # (May be unnecessary with newer scikit-learn, which accepts string labels.)
    labels = ['worry', 'neutral', 'sadness', 'happiness', 'love', 'surprise', 'hate', 'fun',
              'relief', 'empty', 'enthusiasm', 'boredom', 'anger']
    frame['sentiment_num'] = frame.sentiment.map({label: idx for idx, label in enumerate(labels)})
    return frame
def _run_all_models(X, y, vectorizer, msg):
    """Cross-validate three classifiers with the given text vectorizer.

    Runs 10-fold stratified CV; each fold cleans the raw text, fits the
    vectorizer on the training fold only (avoiding test-set leakage), trains
    LinearSVC, MultinomialNB and LogisticRegression, and records accuracy.
    Prints the mean accuracy of each model prefixed by `msg`.

    X   : iterable of raw tweet strings
    y   : iterable of numeric sentiment labels (aligned with X)
    vectorizer : a (fresh) sklearn text vectorizer
    msg : label printed above the results
    """
    kf = StratifiedKFold(n_splits=10)
    model1_accuracy_scores = []
    model2_accuracy_scores = []
    model3_accuracy_scores = []
    # Stateless text-cleaning steps (transform-only custom transformers).
    dataprep_pipeline = Pipeline([
        ('urls', RemoveUrlsTransformer()),
        ('username', RemoveUsernameTransformer()),
        ('ellipse', RemoveEllipseTransformer()),
        ('html_encoded', RemoveHtmlEncodedTransformer()),
        ('special', RemoveSpecialCharactersTransformer()),
        ('nums', RemoveNumbersTransformer())
    ])
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train_prep = dataprep_pipeline.transform(X_train)
        # BUG FIX: the original vectorized the *raw* X_test, so train and test
        # text received different preprocessing; clean the test fold the same way.
        X_test_prep = dataprep_pipeline.transform(X_test)
        # Fit the vocabulary on the training fold only; transform (not fit) the
        # test fold so unseen-fold information never leaks into the features.
        train_corpus = vectorizer.fit_transform(X_train_prep)
        test_corpus = vectorizer.transform(X_test_prep)
        model1 = LinearSVC()
        model2 = MultinomialNB()
        model3 = LogisticRegression()
        model1.fit(train_corpus, y_train)
        model2.fit(train_corpus, y_train)
        model3.fit(train_corpus, y_train)
        result1 = model1.predict(test_corpus)
        result2 = model2.predict(test_corpus)
        result3 = model3.predict(test_corpus)
        model1_accuracy_scores.append(metrics.accuracy_score(y_test, result1))
        model2_accuracy_scores.append(metrics.accuracy_score(y_test, result2))
        model3_accuracy_scores.append(metrics.accuracy_score(y_test, result3))
    print(msg)
    print("Model1 LinearSVC: {}".format(np.mean(model1_accuracy_scores)))
    print("Model2 MultinomialNB: {}".format(np.mean(model2_accuracy_scores)))
    print("Model3 LogisticRegression: {}".format(np.mean(model3_accuracy_scores)))
def run_all_models(data_frame):
    """Benchmark all three classifiers under each of the three vectorizers.

    data_frame must provide a `content` column (raw text) and a
    `sentiment_num` column (numeric labels); see get_data_frame().
    """
    # Settings shared by every vectorizer variant.
    shared = dict(min_df=5, max_df=0.8, stop_words='english', ngram_range=(1, 4))
    vectorizer_runs = [
        (CountVectorizer(**shared), "Count Vectorizer"),
        (TfidfVectorizer(sublinear_tf=True, use_idf=True, **shared), "TF-IDF Vectorizer"),
        (StemmedTfidfVectorizer(sublinear_tf=True, use_idf=True, **shared), "Stem TF-IDF Vectorizer"),
    ]
    X = data_frame.content
    y = data_frame.sentiment_num
    for vectorizer, label in vectorizer_runs:
        _run_all_models(X, y, vectorizer, label)
if __name__ == '__main__':
    # Load data, then time the full benchmark run (data loading excluded).
    frame = get_data_frame()
    started = time.time()
    run_all_models(frame)
    finished = time.time()
    print("total time: ", finished - started)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment