Last active
February 14, 2020 17:29
-
-
Save invegat/bb437d222f9d8eed2b07fa4f10247bfc to your computer and use it in GitHub Desktop.
Twitter NLP Classifiers.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Twitter NLP Classifiers.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
}, | |
"latex_envs": { | |
"LaTeX_envs_menu_present": true, | |
"autoclose": false, | |
"autocomplete": true, | |
"bibliofile": "biblio.bib", | |
"cite_by": "apalike", | |
"current_citInitial": 1, | |
"eqLabelWithNumbers": true, | |
"eqNumInitial": 1, | |
"hotkeys": { | |
"equation": "Ctrl-E", | |
"itemize": "Ctrl-I" | |
}, | |
"labels_anchors": false, | |
"latex_user_defs": false, | |
"report_style_numbering": false, | |
"user_envs_cfg": false | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/invegat/bb437d222f9d8eed2b07fa4f10247bfc/twitter-nlp-classifiers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "p84xDZIlsQ9J", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 66 | |
}, | |
"outputId": "aae3e04b-1a0b-4c5a-8998-138116d29c29" | |
}, | |
"source": [ | |
"import nltk\n", | |
"nltk.download('stopwords')" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
"[nltk_data] Package stopwords is already up-to-date!\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 29 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Uf6JRflWq1cz", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import re\n", | |
"import io\n", | |
"import pandas as pd\n", | |
"import requests\n", | |
"import gensim\n", | |
"from gensim.models.word2vec import Word2Vec\n", | |
"from nltk.tokenize import word_tokenize\n", | |
"import string\n", | |
"from nltk.corpus import stopwords\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from xgboost import XGBClassifier\n", | |
"from sklearn.metrics import accuracy_score, roc_auc_score\n", | |
"from sklearn.linear_model import LogisticRegression" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:36.169063Z", | |
"start_time": "2019-03-29T17:50:36.165101Z" | |
}, | |
"id": "Fg-goHQyqOuj", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"url = \"https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv\"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:39.124431Z", | |
"start_time": "2019-03-29T17:50:36.698578Z" | |
}, | |
"id": "TZ_jEGwNqOuo", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"text = requests.get(url).text" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:42.867912Z", | |
"start_time": "2019-03-29T17:50:39.129493Z" | |
}, | |
"id": "bvcSi4PuqOus", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"df = pd.read_csv(url)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:45.513536Z", | |
"start_time": "2019-03-29T17:50:42.869797Z" | |
}, | |
"id": "gPNbgehNqOux", | |
"colab_type": "code", | |
"outputId": "03a945d3-d2cb-4b02-8808-2adfad990467", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 214 | |
} | |
}, | |
"source": [ | |
"stop_words = set(stopwords.words('english'))\n", | |
"\n", | |
"# turn a doc into clean tokens\n", | |
"def clean_doc(doc):\n", | |
"\t# split into tokens by white space\n", | |
"\ttokens = doc.split()\n", | |
"\t# remove punctuation from each token\n", | |
"\ttable = str.maketrans('', '', string.punctuation)\n", | |
"\ttokens = [w.translate(table) for w in tokens]\n", | |
"\t# remove remaining tokens that are not alphabetic\n", | |
"\ttokens = [word for word in tokens if word.isalpha()]\n", | |
"\t# filter out stop words\n", | |
"# \tstop_words = set(stopwords.words('english'))\n", | |
"\ttokens = [w for w in tokens if not w in stop_words]\n", | |
"\t# filter out short tokens\n", | |
"\ttokens = [word.lower() for word in tokens if len(word) > 1]\n", | |
"\treturn tokens\n", | |
"\n", | |
"def clean_sentence(doc):\n", | |
"\t# split into tokens by white space\n", | |
"\ttokens = doc.split()\n", | |
"\t# remove punctuation from each token\n", | |
"\ttable = str.maketrans('', '', string.punctuation)\n", | |
"\ttokens = [w.translate(table) for w in tokens]\n", | |
"\t# remove remaining tokens that are not alphabetic\n", | |
"\ttokens = [word for word in tokens if word.isalpha()]\n", | |
"\t# filter out stop words\n", | |
"# \tstop_words = set(stopwords.words('english'))\n", | |
"\ttokens = [w for w in tokens if not w in stop_words]\n", | |
"\t# filter out short tokens\n", | |
"\ttokens = [word.lower() for word in tokens if len(word) > 1]\n", | |
"\treturn \" \".join(tokens)\n", | |
"\n", | |
"df['cleaned'] = df.SentimentText.apply(clean_doc)\n", | |
"df['cleanedSentence'] = df.SentimentText.apply(clean_sentence)\n", | |
"print(df.shape)\n", | |
"df.head()" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(99989, 4)\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Sentiment</th>\n", | |
" <th>SentimentText</th>\n", | |
" <th>cleaned</th>\n", | |
" <th>cleanedSentence</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>is so sad for my APL frie...</td>\n", | |
" <td>[sad, apl, friend]</td>\n", | |
" <td>sad apl friend</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0</td>\n", | |
" <td>I missed the New Moon trail...</td>\n", | |
" <td>[missed, new, moon, trailer]</td>\n", | |
" <td>missed new moon trailer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>omg its already 7:30 :O</td>\n", | |
" <td>[omg, already]</td>\n", | |
" <td>omg already</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0</td>\n", | |
" <td>.. Omgaga. Im sooo im gunna CRy. I'...</td>\n", | |
" <td>[omgaga, im, sooo, im, gunna, cry, ive, dentis...</td>\n", | |
" <td>omgaga im sooo im gunna cry ive dentist since ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>i think mi bf is cheating on me!!! ...</td>\n", | |
" <td>[think, mi, bf, cheating, tt]</td>\n", | |
" <td>think mi bf cheating tt</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Sentiment ... cleanedSentence\n", | |
"0 0 ... sad apl friend\n", | |
"1 0 ... missed new moon trailer\n", | |
"2 1 ... omg already\n", | |
"3 0 ... omgaga im sooo im gunna cry ive dentist since ...\n", | |
"4 0 ... think mi bf cheating tt\n", | |
"\n", | |
"[5 rows x 4 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 34 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:46.634914Z", | |
"start_time": "2019-03-29T17:50:45.516386Z" | |
}, | |
"id": "e1t8UfPHqOu2", | |
"colab_type": "code", | |
"outputId": "f5cdd776-284c-431c-c5d0-ed083abead75", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 133 | |
} | |
}, | |
"source": [ | |
"vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,1), stop_words='english')\n", | |
"vectorizer.fit(df.cleanedSentence)" | |
], | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.float64'>, encoding='utf-8',\n", | |
" input='content', lowercase=True, max_df=1.0, max_features=10000,\n", | |
" min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n", | |
" smooth_idf=True, stop_words='english', strip_accents=None,\n", | |
" sublinear_tf=False, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, use_idf=True, vocabulary=None)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 35 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:54.165799Z", | |
"start_time": "2019-03-29T17:50:54.162824Z" | |
}, | |
"id": "urSxxHj2qOu9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"X_train = df.cleanedSentence" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:50:57.723419Z", | |
"start_time": "2019-03-29T17:50:55.005352Z" | |
}, | |
"id": "PCvseHfyqOvA", | |
"colab_type": "code", | |
"outputId": "2f8fc629-e5e8-4ce1-8a51-57fc22e033d2", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 263 | |
} | |
}, | |
"source": [ | |
"train_word_counts = vectorizer.transform(X_train)\n", | |
"X_train_vectorized = pd.DataFrame(train_word_counts[0:10000].toarray(), columns=vectorizer.get_feature_names())\n", | |
"X_train_vectorized = X_train_vectorized.fillna(0)\n", | |
"print(X_train_vectorized.shape)\n", | |
"X_train_vectorized.head()" | |
], | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(10000, 10000)\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>aa</th>\n", | |
" <th>aaa</th>\n", | |
" <th>aaah</th>\n", | |
" <th>aaahh</th>\n", | |
" <th>aafreen</th>\n", | |
" <th>aah</th>\n", | |
" <th>aahhh</th>\n", | |
" <th>aalaap</th>\n", | |
" <th>aamyhaanson</th>\n", | |
" <th>aaron</th>\n", | |
" <th>aaronfuller</th>\n", | |
" <th>aaronob</th>\n", | |
" <th>aaronrenfree</th>\n", | |
" <th>aaronrgillespie</th>\n", | |
" <th>aaw</th>\n", | |
" <th>aaww</th>\n", | |
" <th>aawww</th>\n", | |
" <th>ab</th>\n", | |
" <th>abandoned</th>\n", | |
" <th>abbaks</th>\n", | |
" <th>abbey</th>\n", | |
" <th>abbiefletcher</th>\n", | |
" <th>abbsound</th>\n", | |
" <th>abby</th>\n", | |
" <th>abbybradz</th>\n", | |
" <th>abbyharenberg</th>\n", | |
" <th>abbyyyy</th>\n", | |
" <th>abc</th>\n", | |
" <th>abcdefglynis</th>\n", | |
" <th>abcmsaj</th>\n", | |
" <th>abduzeedo</th>\n", | |
" <th>abeeliever</th>\n", | |
" <th>aber</th>\n", | |
" <th>abiban</th>\n", | |
" <th>abideedles</th>\n", | |
" <th>abiface</th>\n", | |
" <th>abigaelettuce</th>\n", | |
" <th>abigaill</th>\n", | |
" <th>ability</th>\n", | |
" <th>abirtmo</th>\n", | |
" <th>...</th>\n", | |
" <th>youu</th>\n", | |
" <th>youuu</th>\n", | |
" <th>youuuu</th>\n", | |
" <th>youuuuu</th>\n", | |
" <th>youve</th>\n", | |
" <th>youyou</th>\n", | |
" <th>yr</th>\n", | |
" <th>yrs</th>\n", | |
" <th>yt</th>\n", | |
" <th>yu</th>\n", | |
" <th>yuck</th>\n", | |
" <th>yucky</th>\n", | |
" <th>yuh</th>\n", | |
" <th>yuk</th>\n", | |
" <th>yum</th>\n", | |
" <th>yumm</th>\n", | |
" <th>yummm</th>\n", | |
" <th>yummmy</th>\n", | |
" <th>yummy</th>\n", | |
" <th>yun</th>\n", | |
" <th>yung</th>\n", | |
" <th>yup</th>\n", | |
" <th>yupp</th>\n", | |
" <th>yur</th>\n", | |
" <th>yw</th>\n", | |
" <th>zac</th>\n", | |
" <th>zach</th>\n", | |
" <th>zack</th>\n", | |
" <th>ze</th>\n", | |
" <th>zealand</th>\n", | |
" <th>zero</th>\n", | |
" <th>zip</th>\n", | |
" <th>zombie</th>\n", | |
" <th>zombies</th>\n", | |
" <th>zomg</th>\n", | |
" <th>zone</th>\n", | |
" <th>zones</th>\n", | |
" <th>zoo</th>\n", | |
" <th>zoom</th>\n", | |
" <th>zune</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>...</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>...</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>...</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>...</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>...</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 10000 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" aa aaa aaah aaahh aafreen aah ... zomg zone zones zoo zoom zune\n", | |
"0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n", | |
"1 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n", | |
"2 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n", | |
"3 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n", | |
"4 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n", | |
"\n", | |
"[5 rows x 10000 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 37 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:51:09.809785Z", | |
"start_time": "2019-03-29T17:51:09.761256Z" | |
}, | |
"colab_type": "code", | |
"id": "TX8OEgUP_3ee", | |
"colab": {} | |
}, | |
"source": [ | |
"dfs = df.sample(frac=0.1)\n", | |
"X = dfs.cleanedSentence\n", | |
"y = dfs.Sentiment.values\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:51:11.508193Z", | |
"start_time": "2019-03-29T17:51:11.504530Z" | |
}, | |
"id": "h4YsonNhqOvL", | |
"colab_type": "code", | |
"outputId": "59a68def-cac0-4e16-9c30-403e7f65db1b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 83 | |
} | |
}, | |
"source": [ | |
"print(X_train.shape)\n", | |
"print(X_test.shape)\n", | |
"print(y_train.shape)\n", | |
"print(y_test.shape)" | |
], | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(7999,)\n", | |
"(2000,)\n", | |
"(7999,)\n", | |
"(2000,)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:51:16.482859Z", | |
"start_time": "2019-03-29T17:51:16.361378Z" | |
}, | |
"id": "AlvWW5wRqOvP", | |
"colab_type": "code", | |
"outputId": "a1b2d96c-4f69-43f0-8f36-a4d8f8dcf099", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 116 | |
} | |
}, | |
"source": [ | |
"vectorizer = CountVectorizer(max_features=1000, ngram_range=(1,1), stop_words='english')\n", | |
"vectorizer.fit(X_train)" | |
], | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=1000, min_df=1,\n", | |
" ngram_range=(1, 1), preprocessor=None, stop_words='english',\n", | |
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, vocabulary=None)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 40 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:51:18.167707Z", | |
"start_time": "2019-03-29T17:51:18.059599Z" | |
}, | |
"id": "MkQUL4jzqOvW", | |
"colab_type": "code", | |
"outputId": "f1146396-5ef0-4133-c5f8-84df0bfd6aad", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 50 | |
} | |
}, | |
"source": [ | |
"train_word_counts = vectorizer.transform(X_train)\n", | |
"X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())\n", | |
"\n", | |
"test_word_counts = vectorizer.transform(X_test)\n", | |
"X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())\n", | |
"print(X_train_vectorized.shape)\n", | |
"print(X_test_vectorized.shape)" | |
], | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(7999, 1000)\n", | |
"(2000, 1000)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:51:21.222904Z", | |
"start_time": "2019-03-29T17:51:21.218619Z" | |
}, | |
"id": "jIUocZm5qOva", | |
"colab_type": "code", | |
"outputId": "d9377399-8851-4a32-c45d-e691a01ded47", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 33 | |
} | |
}, | |
"source": [ | |
"df.Sentiment.unique()" | |
], | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([0, 1])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 42 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:48:51.962715Z", | |
"start_time": "2019-03-29T17:48:31.500744Z" | |
}, | |
"id": "acEs8nlnqOve", | |
"colab_type": "code", | |
"outputId": "c05aa698-dfcf-4a65-a877-8b30f244d7a1", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 83 | |
} | |
}, | |
"source": [ | |
"XGB = XGBClassifier(n_estimators=200, objective=\"binary:logistic\").fit(X_train_vectorized, y_train)\n", | |
"train_predictions = XGB.predict(X_train_vectorized)\n", | |
"test_predictions = XGB.predict(X_test_vectorized)\n", | |
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n", | |
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n", | |
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n", | |
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')" | |
], | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train Accuracy: 0.700962620327541\n", | |
"Test Accuracy: 0.6615\n", | |
"Train Roc Auc: 0.6712842399415706\n", | |
"Test Roc Auc: 0.6423723426511531\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:49:33.866072Z", | |
"start_time": "2019-03-29T17:48:53.730994Z" | |
}, | |
"id": "5JIC_D9hqOvi", | |
"colab_type": "code", | |
"outputId": "9efa2958-961a-439e-e5b5-48e76e0a3d2a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 83 | |
} | |
}, | |
"source": [ | |
"RFC = RandomForestClassifier(n_estimators=200).fit(X_train_vectorized, y_train)\n", | |
"\n", | |
"train_predictions = RFC.predict(X_train_vectorized)\n", | |
"test_predictions = RFC.predict(X_test_vectorized)\n", | |
"\n", | |
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n", | |
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n", | |
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n", | |
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')" | |
], | |
"execution_count": 44, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train Accuracy: 0.9577447180897612\n", | |
"Test Accuracy: 0.6755\n", | |
"Train Roc Auc: 0.9540209266139531\n", | |
"Test Roc Auc: 0.670136709634851\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-29T17:49:38.271311Z", | |
"start_time": "2019-03-29T17:49:36.806563Z" | |
}, | |
"id": "KWeYS1RuqOvo", | |
"colab_type": "code", | |
"outputId": "c08a65cf-487c-4c26-8f0d-168b6d223d58", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 83 | |
} | |
}, | |
"source": [ | |
"LR = LogisticRegression(random_state=42, solver=\"newton-cg\").fit(X_train_vectorized, y_train)\n", | |
"\n", | |
"train_predictions = LR.predict(X_train_vectorized)\n", | |
"test_predictions = LR.predict(X_test_vectorized)\n", | |
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n", | |
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n", | |
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n", | |
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')" | |
], | |
"execution_count": 45, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train Accuracy: 0.7727215901987748\n", | |
"Test Accuracy: 0.687\n", | |
"Train Roc Auc: 0.7609365215458385\n", | |
"Test Roc Auc: 0.6786840792417\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6thJvUHBqOwD", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment