Created
November 5, 2020 10:44
-
-
Save maxkleiner/80ad4ac9333b4effdaa443fed359c59a to your computer and use it in GitHub Desktop.
sentimenttree.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "sentimenttree.ipynb", | |
"provenance": [], | |
"authorship_tag": "ABX9TyMQp+o5eXHvsZcpll9yNH7z", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/maxkleiner/80ad4ac9333b4effdaa443fed359c59a/sentimenttree.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "pggupr0A9oOx" | |
}, | |
"source": [ | |
"# Sentiment Tree Classifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "04EukN1I80cD", | |
"outputId": "a0ff7b2b-bffe-488e-dfb6-744a279c2fc8", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"source": [ | |
"# Sentiment demo: a decision tree and a linear SVC are trained independently\n", | |
"# on the same bag-of-words vectors. The tree is exported to Graphviz so its\n", | |
"# decisions can be plotted; the SVC predictions are scored against hand-made\n", | |
"# test labels.\n", | |
"# Idea for later: use the tree as a high-recall preclassifier and the SVC as a\n", | |
"# second stage that sorts out false positives.\n", | |
"\n", | |
"from sklearn import tree\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn import svm\n", | |
"from sklearn.metrics import confusion_matrix\n", | |
"from sklearn.metrics import classification_report\n", | |
"\n", | |
"#https://ritza.co/showcase/repl.it/introduction-to-machine-learning-with-python-and-repl-it.html\n", | |
"\n", | |
"positive_texts = [\n", | |
"    \"we love you\",\n", | |
"    \"they love us\",\n", | |
"    \"you are good\",\n", | |
"    \"he is good\",\n", | |
"    \"they love max\"\n", | |
"]\n", | |
"\n", | |
"negative_texts = [\n", | |
"    \"we hate you\",\n", | |
"    \"they hate us\",\n", | |
"    \"you are bad\",\n", | |
"    \"he is bad\",\n", | |
"    \"we hate max\"\n", | |
"]\n", | |
"\n", | |
"test_texts = [\n", | |
"    \"they love bad max\",  # labeled 'pos' in test_labels even though it contains 'bad'\n", | |
"    \"they are good\",\n", | |
"    \"why do you hate mary\",\n", | |
"    \"they are almost always good\",\n", | |
"    \"we are very bad\"\n", | |
"]\n", | |
"\n", | |
"print('testset: ',test_texts)\n", | |
"\n", | |
"training_texts = negative_texts + positive_texts\n", | |
"training_labels = [\"neg\"] * len(negative_texts) + [\"pos\"] * len(positive_texts)\n", | |
"\n", | |
"# Map words to column indices (bag of words); the vocabulary is learned from\n", | |
"# the training texts only.\n", | |
"vectorizer = CountVectorizer()\n", | |
"vectorizer.fit(training_texts)\n", | |
"print('vocabulary: ',vectorizer.vocabulary_)\n", | |
"\n", | |
"# Unique whitespace-separated words of the test set, for comparison with the\n", | |
"# vocabulary printed above.\n", | |
"print('unique words:',set(\" \".join(test_texts).split()))\n", | |
"\n", | |
"training_vectors = vectorizer.transform(training_texts)\n", | |
"testing_vectors = vectorizer.transform(test_texts)\n", | |
"\n", | |
"# Decision tree: fit on the training vectors and predict the test vectors.\n", | |
"# NOTE(review): no random_state is set; scikit-learn documents that ties\n", | |
"# between equally good splits may be resolved differently from run to run.\n", | |
"classifier = tree.DecisionTreeClassifier()\n", | |
"classifier.fit(training_vectors, training_labels)\n", | |
"predictions = classifier.predict(testing_vectors)\n", | |
"print('predict: ',predictions)\n", | |
"\n", | |
"# Graphviz viewer: http://www.webgraphviz.com/\n", | |
"# get_feature_names() was removed in scikit-learn 1.2; prefer\n", | |
"# get_feature_names_out() and fall back only on old releases. The result is\n", | |
"# converted to a plain list before it is handed to export_graphviz.\n", | |
"if hasattr(vectorizer, 'get_feature_names_out'):\n", | |
"    feature_names = list(vectorizer.get_feature_names_out())\n", | |
"else:\n", | |
"    feature_names = list(vectorizer.get_feature_names())\n", | |
"\n", | |
"tree.export_graphviz(\n", | |
"    classifier,\n", | |
"    out_file='maxtree.dot',\n", | |
"    feature_names=feature_names,\n", | |
")\n", | |
"\n", | |
"# Create a linear SVM classifier\n", | |
"clf = svm.SVC(kernel='linear')\n", | |
"# Train it on the same vectors and labels as the tree\n", | |
"clf.fit(training_vectors, training_labels)\n", | |
"\n", | |
"# Make predictions on unseen test data\n", | |
"clf_predictions = clf.predict(testing_vectors)\n", | |
"print('predict SVC: ',clf_predictions)\n", | |
"# Hand-assigned ground truth for test_texts, in the same order\n", | |
"test_labels=['pos','pos','neg','pos','neg']\n", | |
"print(\"Accuracy: {}%\".format(clf.score(testing_vectors,test_labels)*100))\n", | |
"\n", | |
"y_test = test_labels\n", | |
"y_pred = clf_predictions\n", | |
"\n", | |
"print('actual:',y_test)\n", | |
"print('predic:',y_pred)\n", | |
"\n", | |
"print('confusion matrix - classification report \\n')\n", | |
"print(confusion_matrix(y_test, y_pred))\n", | |
"print(classification_report(y_test, y_pred))" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"testset: ['they love bad max', 'they are good', 'why do you hate mary', 'they are almost always good', 'we are very bad']\n", | |
"vocabulary: {'we': 10, 'hate': 3, 'you': 11, 'they': 8, 'us': 9, 'are': 0, 'bad': 1, 'he': 4, 'is': 5, 'max': 7, 'love': 6, 'good': 2}\n", | |
"unique words: {'love', 'good', 'max', 'are', 'mary', 'almost', 'very', 'you', 'bad', 'hate', 'we', 'always', 'they', 'why', 'do'}\n", | |
"predict: ['pos' 'pos' 'neg' 'pos' 'neg']\n", | |
"predict SVC: ['neg' 'pos' 'neg' 'pos' 'neg']\n", | |
"Accuracy: 80.0%\n", | |
"actual: ['pos', 'pos', 'neg', 'pos', 'neg']\n", | |
"predic: ['neg' 'pos' 'neg' 'pos' 'neg']\n", | |
"confusion matrix - classification report \n", | |
"\n", | |
"[[2 0]\n", | |
" [1 2]]\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" neg 0.67 1.00 0.80 2\n", | |
" pos 1.00 0.67 0.80 3\n", | |
"\n", | |
" accuracy 0.80 5\n", | |
" macro avg 0.83 0.83 0.80 5\n", | |
"weighted avg 0.87 0.80 0.80 5\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "fRVScZB4-uDt" | |
}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZWnuFOm--XX5" | |
}, | |
"source": [ | |
"Because `love` and `hate` each occur 3 times in the training set, they are easier to use as discriminators than `good` and `bad`, which occur only 2 times each.\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "LG5br5Vu95Ac" | |
}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zJcRJ6-_9Q_r" | |
}, | |
"source": [ | |
"# Sentiment Tree Classifier with Support Vector Machine and Decision Tree Graph" | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment