@youngsoul
Created September 6, 2017 21:44
Initial WIP: sentiment analysis of a Kaggle data set
import numpy as np
import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from stemming.porter2 import stem
from sklearn.pipeline import Pipeline
# local module with the text-cleanup transformers; an absolute import is used so the
# script can be run directly (transformers.py is expected to sit next to this file)
from transformers import (RemoveEllipseTransformer, RemoveHtmlEncodedTransformer,
                          RemoveNumbersTransformer, RemoveSpecialCharactersTransformer,
                          RemoveUsernameTransformer, RemoveUrlsTransformer)
sentiments = ['worry', 'neutral', 'sadness', 'happiness', 'love', 'surprise', 'hate', 'fun', 'relief', 'empty',
              'enthusiasm', 'boredom', 'anger']
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer that Porter-stems every token produced by the standard analyzer."""

    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: ([stem(w) for w in analyzer(doc)])
def get_data_frame():
    # expects a 'sentiment' label column and a 'content' text column
    sa = pd.read_csv('./train_data.csv')

    # convert label to a numerical variable
    # not sure if we have to do this in later versions of scikit learn
    sa['sentiment_num'] = sa.sentiment.map(
        {'worry': 0, 'neutral': 1, 'sadness': 2, 'happiness': 3, 'love': 4, 'surprise': 5, 'hate': 6, 'fun': 7,
         'relief': 8, 'empty': 9, 'enthusiasm': 10, 'boredom': 11, 'anger': 12})
    return sa
def _run_all_models(X, y, vectorizer, msg):
    kf = StratifiedKFold(n_splits=10)

    model1_accuracy_scores = []
    model2_accuracy_scores = []
    model3_accuracy_scores = []

    # text-cleanup steps applied to the raw text before vectorization
    dataprep_pipeline = Pipeline([
        ('urls', RemoveUrlsTransformer()),
        ('username', RemoveUsernameTransformer()),
        ('ellipse', RemoveEllipseTransformer()),
        ('html_encoded', RemoveHtmlEncodedTransformer()),
        ('special', RemoveSpecialCharactersTransformer()),
        ('nums', RemoveNumbersTransformer())
    ])
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # print("X_train, y_train shapes: {0},{1}".format(X_train.shape, y_train.shape))
        # print("X_test, y_test shapes: {0},{1}".format(X_test.shape, y_test.shape))

        # apply the same text cleanup to both folds, then vectorize
        X_train_prep = dataprep_pipeline.transform(X_train)
        X_test_prep = dataprep_pipeline.transform(X_test)

        train_corpus = vectorizer.fit_transform(X_train_prep)
        test_corpus = vectorizer.transform(X_test_prep)

        model1 = LinearSVC()
        model2 = MultinomialNB()
        model3 = LogisticRegression()

        model1.fit(train_corpus, y_train)
        model2.fit(train_corpus, y_train)
        model3.fit(train_corpus, y_train)

        result1 = model1.predict(test_corpus)
        result2 = model2.predict(test_corpus)
        result3 = model3.predict(test_corpus)

        model1_accuracy_scores.append(metrics.accuracy_score(y_test, result1))
        model2_accuracy_scores.append(metrics.accuracy_score(y_test, result2))
        model3_accuracy_scores.append(metrics.accuracy_score(y_test, result3))
        # sample_input = 'I am bummed that I missed you last night'
        # p = model1.predict(vectorizer.transform([sample_input]))
        # print("Model1: {} / {}".format(sample_input, sentiments[int(p)]))
        # p = model2.predict(vectorizer.transform([sample_input]))
        # print("Model2: {} / {}".format(sample_input, sentiments[int(p)]))
        # p = model3.predict(vectorizer.transform([sample_input]))
        # print("Model3: {} / {}".format(sample_input, sentiments[int(p)]))
        #
        # sample_input = 'You totally rocked the show last night'
        # p = model1.predict(vectorizer.transform([sample_input]))
        # print("Model1: {} / {}".format(sample_input, sentiments[int(p)]))
        # p = model2.predict(vectorizer.transform([sample_input]))
        # print("Model2: {} / {}".format(sample_input, sentiments[int(p)]))
        # p = model3.predict(vectorizer.transform([sample_input]))
        # print("Model3: {} / {}".format(sample_input, sentiments[int(p)]))
    print(msg)
    print("Model1 LinearSVC: {}".format(np.mean(model1_accuracy_scores)))
    print("Model2 MultinomialNB: {}".format(np.mean(model2_accuracy_scores)))
    print("Model3 LogisticRegression: {}".format(np.mean(model3_accuracy_scores)))
def run_all_models(data_frame):
    count_vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words='english', ngram_range=(1, 4))
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True,
                                       stop_words='english', ngram_range=(1, 4))
    stem_vectorizer = StemmedTfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True,
                                             stop_words='english', ngram_range=(1, 4))

    X = data_frame.content
    y = data_frame.sentiment_num

    _run_all_models(X, y, count_vectorizer, "Count Vectorizer")
    _run_all_models(X, y, tfidf_vectorizer, "TF-IDF Vectorizer")
    _run_all_models(X, y, stem_vectorizer, "Stem TF-IDF Vectorizer")
if __name__ == '__main__':
    data_frame = get_data_frame()

    start_time = time.time()
    run_all_models(data_frame)
    end_time = time.time()

    print("total time: ", end_time - start_time)