invegat · February 14, 2020 17:29
diff --git a/twitter-nlp-classifiers.ipynb b/twitter-nlp-classifiers.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Twitter NLP Classifiers.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.6"
    },
    "latex_envs": {
      "LaTeX_envs_menu_present": true,
      "autoclose": false,
      "autocomplete": true,
      "bibliofile": "biblio.bib",
      "cite_by": "apalike",
      "current_citInitial": 1,
      "eqLabelWithNumbers": true,
      "eqNumInitial": 1,
      "hotkeys": {
        "equation": "Ctrl-E",
        "itemize": "Ctrl-I"
      },
      "labels_anchors": false,
      "latex_user_defs": false,
      "report_style_numbering": false,
      "user_envs_cfg": false
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/invegat/bb437d222f9d8eed2b07fa4f10247bfc/twitter-nlp-classifiers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "p84xDZIlsQ9J",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66
        },
        "outputId": "aae3e04b-1a0b-4c5a-8998-138116d29c29"
      },
      "source": [
        "import nltk\n",
        "nltk.download('stopwords')"
      ],
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 29
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Uf6JRflWq1cz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import re\n",
        "import io\n",
        "import pandas as pd\n",
        "import requests\n",
        "import gensim\n",
        "from gensim.models.word2vec import Word2Vec\n",
        "from nltk.tokenize import word_tokenize\n",
        "import string\n",
        "from nltk.corpus import stopwords\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from xgboost import XGBClassifier\n",
        "from sklearn.metrics import accuracy_score, roc_auc_score\n",
        "from sklearn.linear_model import LogisticRegression"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:36.169063Z",
          "start_time": "2019-03-29T17:50:36.165101Z"
        },
        "id": "Fg-goHQyqOuj",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "url = \"https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:39.124431Z",
          "start_time": "2019-03-29T17:50:36.698578Z"
        },
        "id": "TZ_jEGwNqOuo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "text = requests.get(url).text"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:42.867912Z",
          "start_time": "2019-03-29T17:50:39.129493Z"
        },
        "id": "bvcSi4PuqOus",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "df = pd.read_csv(url)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:45.513536Z",
          "start_time": "2019-03-29T17:50:42.869797Z"
        },
        "id": "gPNbgehNqOux",
        "colab_type": "code",
        "outputId": "03a945d3-d2cb-4b02-8808-2adfad990467",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 214
        }
      },
      "source": [
        "stop_words = set(stopwords.words('english'))\n",
        "\n",
        "# turn a doc into clean tokens\n",
        "def clean_doc(doc):\n",
        "\t# split into tokens by white space\n",
        "\ttokens = doc.split()\n",
        "\t# remove punctuation from each token\n",
        "\ttable = str.maketrans('', '', string.punctuation)\n",
        "\ttokens = [w.translate(table) for w in tokens]\n",
        "\t# remove remaining tokens that are not alphabetic\n",
        "\ttokens = [word for word in tokens if word.isalpha()]\n",
        "\t# filter out stop words\n",
        "# \tstop_words = set(stopwords.words('english'))\n",
        "\ttokens = [w for w in tokens if not w in stop_words]\n",
        "\t# filter out short tokens\n",
        "\ttokens = [word.lower() for word in tokens if len(word) > 1]\n",
        "\treturn tokens\n",
        "\n",
        "def clean_sentence(doc):\n",
        "\t# split into tokens by white space\n",
        "\ttokens = doc.split()\n",
        "\t# remove punctuation from each token\n",
        "\ttable = str.maketrans('', '', string.punctuation)\n",
        "\ttokens = [w.translate(table) for w in tokens]\n",
        "\t# remove remaining tokens that are not alphabetic\n",
        "\ttokens = [word for word in tokens if word.isalpha()]\n",
        "\t# filter out stop words\n",
        "# \tstop_words = set(stopwords.words('english'))\n",
        "\ttokens = [w for w in tokens if not w in stop_words]\n",
        "\t# filter out short tokens\n",
        "\ttokens = [word.lower() for word in tokens if len(word) > 1]\n",
        "\treturn \" \".join(tokens)\n",
        "\n",
        "df['cleaned'] = df.SentimentText.apply(clean_doc)\n",
        "df['cleanedSentence'] = df.SentimentText.apply(clean_sentence)\n",
        "print(df.shape)\n",
        "df.head()"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(99989, 4)\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Sentiment</th>\n",
              "      <th>SentimentText</th>\n",
              "      <th>cleaned</th>\n",
              "      <th>cleanedSentence</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>is so sad for my APL frie...</td>\n",
              "      <td>[sad, apl, friend]</td>\n",
              "      <td>sad apl friend</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>0</td>\n",
              "      <td>I missed the New Moon trail...</td>\n",
              "      <td>[missed, new, moon, trailer]</td>\n",
              "      <td>missed new moon trailer</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>omg its already 7:30 :O</td>\n",
              "      <td>[omg, already]</td>\n",
              "      <td>omg already</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>0</td>\n",
              "      <td>.. Omgaga. Im sooo  im gunna CRy. I'...</td>\n",
              "      <td>[omgaga, im, sooo, im, gunna, cry, ive, dentis...</td>\n",
              "      <td>omgaga im sooo im gunna cry ive dentist since ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0</td>\n",
              "      <td>i think mi bf is cheating on me!!!   ...</td>\n",
              "      <td>[think, mi, bf, cheating, tt]</td>\n",
              "      <td>think mi bf cheating tt</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   Sentiment  ...                                    cleanedSentence\n",
              "0          0  ...                                     sad apl friend\n",
              "1          0  ...                            missed new moon trailer\n",
              "2          1  ...                                        omg already\n",
              "3          0  ...  omgaga im sooo im gunna cry ive dentist since ...\n",
              "4          0  ...                            think mi bf cheating tt\n",
              "\n",
              "[5 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 34
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:46.634914Z",
          "start_time": "2019-03-29T17:50:45.516386Z"
        },
        "id": "e1t8UfPHqOu2",
        "colab_type": "code",
        "outputId": "f5cdd776-284c-431c-c5d0-ed083abead75",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 133
        }
      },
      "source": [
        "vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,1), stop_words='english')\n",
        "vectorizer.fit(df.cleanedSentence)"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
              "                dtype=<class 'numpy.float64'>, encoding='utf-8',\n",
              "                input='content', lowercase=True, max_df=1.0, max_features=10000,\n",
              "                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n",
              "                smooth_idf=True, stop_words='english', strip_accents=None,\n",
              "                sublinear_tf=False, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
              "                tokenizer=None, use_idf=True, vocabulary=None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 35
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:54.165799Z",
          "start_time": "2019-03-29T17:50:54.162824Z"
        },
        "id": "urSxxHj2qOu9",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "X_train = df.cleanedSentence"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:50:57.723419Z",
          "start_time": "2019-03-29T17:50:55.005352Z"
        },
        "id": "PCvseHfyqOvA",
        "colab_type": "code",
        "outputId": "2f8fc629-e5e8-4ce1-8a51-57fc22e033d2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 263
        }
      },
      "source": [
        "train_word_counts = vectorizer.transform(X_train)\n",
        "X_train_vectorized = pd.DataFrame(train_word_counts[0:10000].toarray(), columns=vectorizer.get_feature_names())\n",
        "X_train_vectorized = X_train_vectorized.fillna(0)\n",
        "print(X_train_vectorized.shape)\n",
        "X_train_vectorized.head()"
      ],
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(10000, 10000)\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>aa</th>\n",
              "      <th>aaa</th>\n",
              "      <th>aaah</th>\n",
              "      <th>aaahh</th>\n",
              "      <th>aafreen</th>\n",
              "      <th>aah</th>\n",
              "      <th>aahhh</th>\n",
              "      <th>aalaap</th>\n",
              "      <th>aamyhaanson</th>\n",
              "      <th>aaron</th>\n",
              "      <th>aaronfuller</th>\n",
              "      <th>aaronob</th>\n",
              "      <th>aaronrenfree</th>\n",
              "      <th>aaronrgillespie</th>\n",
              "      <th>aaw</th>\n",
              "      <th>aaww</th>\n",
              "      <th>aawww</th>\n",
              "      <th>ab</th>\n",
              "      <th>abandoned</th>\n",
              "      <th>abbaks</th>\n",
              "      <th>abbey</th>\n",
              "      <th>abbiefletcher</th>\n",
              "      <th>abbsound</th>\n",
              "      <th>abby</th>\n",
              "      <th>abbybradz</th>\n",
              "      <th>abbyharenberg</th>\n",
              "      <th>abbyyyy</th>\n",
              "      <th>abc</th>\n",
              "      <th>abcdefglynis</th>\n",
              "      <th>abcmsaj</th>\n",
              "      <th>abduzeedo</th>\n",
              "      <th>abeeliever</th>\n",
              "      <th>aber</th>\n",
              "      <th>abiban</th>\n",
              "      <th>abideedles</th>\n",
              "      <th>abiface</th>\n",
              "      <th>abigaelettuce</th>\n",
              "      <th>abigaill</th>\n",
              "      <th>ability</th>\n",
              "      <th>abirtmo</th>\n",
              "      <th>...</th>\n",
              "      <th>youu</th>\n",
              "      <th>youuu</th>\n",
              "      <th>youuuu</th>\n",
              "      <th>youuuuu</th>\n",
              "      <th>youve</th>\n",
              "      <th>youyou</th>\n",
              "      <th>yr</th>\n",
              "      <th>yrs</th>\n",
              "      <th>yt</th>\n",
              "      <th>yu</th>\n",
              "      <th>yuck</th>\n",
              "      <th>yucky</th>\n",
              "      <th>yuh</th>\n",
              "      <th>yuk</th>\n",
              "      <th>yum</th>\n",
              "      <th>yumm</th>\n",
              "      <th>yummm</th>\n",
              "      <th>yummmy</th>\n",
              "      <th>yummy</th>\n",
              "      <th>yun</th>\n",
              "      <th>yung</th>\n",
              "      <th>yup</th>\n",
              "      <th>yupp</th>\n",
              "      <th>yur</th>\n",
              "      <th>yw</th>\n",
              "      <th>zac</th>\n",
              "      <th>zach</th>\n",
              "      <th>zack</th>\n",
              "      <th>ze</th>\n",
              "      <th>zealand</th>\n",
              "      <th>zero</th>\n",
              "      <th>zip</th>\n",
              "      <th>zombie</th>\n",
              "      <th>zombies</th>\n",
              "      <th>zomg</th>\n",
              "      <th>zone</th>\n",
              "      <th>zones</th>\n",
              "      <th>zoo</th>\n",
              "      <th>zoom</th>\n",
              "      <th>zune</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>5 rows × 10000 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "    aa  aaa  aaah  aaahh  aafreen  aah  ...  zomg  zone  zones  zoo  zoom  zune\n",
              "0  0.0  0.0   0.0    0.0      0.0  0.0  ...   0.0   0.0    0.0  0.0   0.0   0.0\n",
              "1  0.0  0.0   0.0    0.0      0.0  0.0  ...   0.0   0.0    0.0  0.0   0.0   0.0\n",
              "2  0.0  0.0   0.0    0.0      0.0  0.0  ...   0.0   0.0    0.0  0.0   0.0   0.0\n",
              "3  0.0  0.0   0.0    0.0      0.0  0.0  ...   0.0   0.0    0.0  0.0   0.0   0.0\n",
              "4  0.0  0.0   0.0    0.0      0.0  0.0  ...   0.0   0.0    0.0  0.0   0.0   0.0\n",
              "\n",
              "[5 rows x 10000 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 37
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:51:09.809785Z",
          "start_time": "2019-03-29T17:51:09.761256Z"
        },
        "colab_type": "code",
        "id": "TX8OEgUP_3ee",
        "colab": {}
      },
      "source": [
        "dfs = df.sample(frac=0.1)\n",
        "X = dfs.cleanedSentence\n",
        "y = dfs.Sentiment.values\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:51:11.508193Z",
          "start_time": "2019-03-29T17:51:11.504530Z"
        },
        "id": "h4YsonNhqOvL",
        "colab_type": "code",
        "outputId": "59a68def-cac0-4e16-9c30-403e7f65db1b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 83
        }
      },
      "source": [
        "print(X_train.shape)\n",
        "print(X_test.shape)\n",
        "print(y_train.shape)\n",
        "print(y_test.shape)"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(7999,)\n",
            "(2000,)\n",
            "(7999,)\n",
            "(2000,)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:51:16.482859Z",
          "start_time": "2019-03-29T17:51:16.361378Z"
        },
        "id": "AlvWW5wRqOvP",
        "colab_type": "code",
        "outputId": "a1b2d96c-4f69-43f0-8f36-a4d8f8dcf099",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 116
        }
      },
      "source": [
        "vectorizer = CountVectorizer(max_features=1000, ngram_range=(1,1), stop_words='english')\n",
        "vectorizer.fit(X_train)"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
              "                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
              "                lowercase=True, max_df=1.0, max_features=1000, min_df=1,\n",
              "                ngram_range=(1, 1), preprocessor=None, stop_words='english',\n",
              "                strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
              "                tokenizer=None, vocabulary=None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 40
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:51:18.167707Z",
          "start_time": "2019-03-29T17:51:18.059599Z"
        },
        "id": "MkQUL4jzqOvW",
        "colab_type": "code",
        "outputId": "f1146396-5ef0-4133-c5f8-84df0bfd6aad",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 50
        }
      },
      "source": [
        "train_word_counts = vectorizer.transform(X_train)\n",
        "X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())\n",
        "\n",
        "test_word_counts = vectorizer.transform(X_test)\n",
        "X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())\n",
        "print(X_train_vectorized.shape)\n",
        "print(X_test_vectorized.shape)"
      ],
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(7999, 1000)\n",
            "(2000, 1000)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:51:21.222904Z",
          "start_time": "2019-03-29T17:51:21.218619Z"
        },
        "id": "jIUocZm5qOva",
        "colab_type": "code",
        "outputId": "d9377399-8851-4a32-c45d-e691a01ded47",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 33
        }
      },
      "source": [
        "df.Sentiment.unique()"
      ],
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([0, 1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 42
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:48:51.962715Z",
          "start_time": "2019-03-29T17:48:31.500744Z"
        },
        "id": "acEs8nlnqOve",
        "colab_type": "code",
        "outputId": "c05aa698-dfcf-4a65-a877-8b30f244d7a1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 83
        }
      },
      "source": [
        "XGB = XGBClassifier(n_estimators=200, objective=\"binary:logistic\").fit(X_train_vectorized, y_train)\n",
        "train_predictions = XGB.predict(X_train_vectorized)\n",
        "test_predictions = XGB.predict(X_test_vectorized)\n",
        "print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
        "print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
        "print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
        "print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
      ],
      "execution_count": 43,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train Accuracy: 0.700962620327541\n",
            "Test Accuracy: 0.6615\n",
            "Train Roc Auc: 0.6712842399415706\n",
            "Test Roc Auc: 0.6423723426511531\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:49:33.866072Z",
          "start_time": "2019-03-29T17:48:53.730994Z"
        },
        "id": "5JIC_D9hqOvi",
        "colab_type": "code",
        "outputId": "9efa2958-961a-439e-e5b5-48e76e0a3d2a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 83
        }
      },
      "source": [
        "RFC = RandomForestClassifier(n_estimators=200).fit(X_train_vectorized, y_train)\n",
        "\n",
        "train_predictions = RFC.predict(X_train_vectorized)\n",
        "test_predictions = RFC.predict(X_test_vectorized)\n",
        "\n",
        "print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
        "print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
        "print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
        "print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
      ],
      "execution_count": 44,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train Accuracy: 0.9577447180897612\n",
            "Test Accuracy: 0.6755\n",
            "Train Roc Auc: 0.9540209266139531\n",
            "Test Roc Auc: 0.670136709634851\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "ExecuteTime": {
          "end_time": "2019-03-29T17:49:38.271311Z",
          "start_time": "2019-03-29T17:49:36.806563Z"
        },
        "id": "KWeYS1RuqOvo",
        "colab_type": "code",
        "outputId": "c08a65cf-487c-4c26-8f0d-168b6d223d58",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 83
        }
      },
      "source": [
        "LR = LogisticRegression(random_state=42, solver=\"newton-cg\").fit(X_train_vectorized, y_train)\n",
        "\n",
        "train_predictions = LR.predict(X_train_vectorized)\n",
        "test_predictions = LR.predict(X_test_vectorized)\n",
        "print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
        "print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
        "print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
        "print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train Accuracy: 0.7727215901987748\n",
            "Test Accuracy: 0.687\n",
            "Train Roc Auc: 0.7609365215458385\n",
            "Test Roc Auc: 0.6786840792417\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6thJvUHBqOwD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
 }