akhileshravi · December 26, 2019 19:33
diff --git a/ReadMe.txt b/ReadMe.txt
 Name: Akhilesh Ravi
 Roll No.: 16110007
diff --git a/NLP Assignment 3 16110007.pdf b/NLP Assignment 3 16110007.pdf
diff --git a/NLP_Assignment3_16110007.ipynb b/NLP_Assignment3_16110007.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "NLP_Assignment3_16110007",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "9dTzSCUXYQyj",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "c335ef41-cb62-450e-a2d8-4bbfbb599a7d"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/gdrive')"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YJ_e3TKg9U_N",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "path = \"/content/gdrive/My Drive/Semester 7/NLP/Assignment3/\"\n",
        "with open(path + \"train.txt\", 'r') as ftrain:\n",
        "    train_text = ftrain.read()\n",
        "with open(path + \"test.txt\", 'r') as ftest:\n",
        "    test_text = ftest.read()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IIFVs3aW78NQ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import nltk\n",
        "# nltk.download('stopwords')\n",
        "from nltk.corpus import stopwords\n",
        "import re"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ac5LT9gO8b44",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "help(stopwords)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zUoUl3RN86fc",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a6ac02d2-1312-4aff-cfe9-d8ad82ae4f3a"
      },
      "source": [
        "stopwords_en = stopwords.words('english')\n",
        "exclude = ['very', 'not', 'never', 'no', 'ever', 'nothing', 'really', 'extremely']\n",
        "for i in exclude:\n",
        "    if i not in stopwords_en:\n",
        "        print(i, end=' ')"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "never ever nothing really extremely "
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2im06gLjauis",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "emojis = ['😂', '❤', '♥', '😍', '😭', '😘', '😊', '👌', '💕', '👏', '😁', '☺', '♡', '👍', '😩', '🙏', '✌', '😏', '😉', '🙌',\n",
        "     '🙈', '💪', '😄', '😒', '💃', '💖', '😃', '😔', '😱', '🎉', '😜', '☯', '🌸', '💜', '💙', '✨', '😳', '💗', '★',\n",
        "     '☀', '😡', '😎', '😢', '💋', '😋', '🙊', '😴', '🎶', '💞', '😌']\n",
        "emoji_dict = {emojis[i]: i for i in range(len(emojis))}\n",
        "# 50 most frequently used emojis from https://www.kaggle.com/thomasseleck/emoji-sentiment-data"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kb4JWfw9-dvK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_id = []\n",
        "train_data = []\n",
        "train_hin, train_eng, train_o, train_labels = [], [], [], []\n",
        "# emoji_train = []\n",
        "for sample in train_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    # print(lines_sample[0])\n",
        "    try:\n",
        "        train_labels.append(lines_sample[0].split()[2])\n",
        "        train_id.append(lines_sample[0].split()[1])\n",
        "    except IndexError:\n",
        "        del train_id[-1]\n",
        "        continue\n",
        "    temp, temp_eng, temp_hin, temp_o = [], [], [], []\n",
        "    \n",
        "    \n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "\n",
        "        if t[1] != 'O':\n",
        "            t[0]=re.sub('[\\W_]+', '', t[0])\n",
        "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
        "            continue\n",
        "        if 'http' in t[0]:\n",
        "            continue\n",
        "        temp.append(t[0])\n",
        "        if t[1] == 'Eng':\n",
        "            temp_eng.append(t[0])\n",
        "        elif t[1] == 'Hin':\n",
        "            temp_hin.append(t[0])\n",
        "        elif t[1] == 'O':\n",
        "            temp_o.append(t[0])\n",
        "    if temp == []:\n",
        "        continue\n",
        "    train_data.append(temp)\n",
        "    train_eng.append(temp_eng)\n",
        "    train_hin.append(temp_hin)\n",
        "    train_o.append(temp_o)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Vgrdip_g_Fej",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "bf6b5e72-10b6-4075-9f93-49a79dace711"
      },
      "source": [
        "print(len(train_text.split('\\n\\n')))"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "15132\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fZu39o6-0L2i",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a2488ce9-89d1-4b49-c9d9-df756a66352c"
      },
      "source": [
        "len(train_data)"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "15131"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "X3elTZuw0gQb",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_id = []\n",
        "test_data = []\n",
        "test_hin, test_eng, test_o, test_labels = [], [], [], []\n",
        "\n",
        "for sample in test_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    # print(lines_sample[0])\n",
        "    try:\n",
        "        test_labels.append(lines_sample[0].split()[2])\n",
        "        test_id.append(lines_sample[0].split()[1])\n",
        "    except IndexError:\n",
        "        del test_id[-1]\n",
        "        continue\n",
        "    temp, temp_eng, temp_hin, temp_o = [], [], [], []\n",
        "    \n",
        "    \n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "        t[0] = t[0].lower()\n",
        "        if t[1] != 'O':\n",
        "            t[0]=re.sub('[\\W_]+', '', t[0])\n",
        "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
        "            continue\n",
        "        if 'http' in t[0]:\n",
        "            continue\n",
        "        temp.append(t[0])\n",
        "        if t[1] == 'Eng':\n",
        "            temp_eng.append(t[0])\n",
        "        elif t[1] == 'Hin':\n",
        "            temp_hin.append(t[0])\n",
        "        elif t[1] == 'O':\n",
        "            temp_o.append(t[0])\n",
        "    if temp == []:\n",
        "        continue\n",
        "    test_data.append(temp)\n",
        "    test_eng.append(temp_eng)\n",
        "    test_hin.append(temp_hin)\n",
        "    test_o.append(temp_o)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KoLNJ4VA0wCF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_tweets = [' '.join(i) for i in train_data]\n",
        "test_tweets = [' '.join(i) for i in test_data]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DA0RtaGc36nz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_tweets_dict = {}\n",
        "test_tweets_dict = {}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3wseReG-4oib",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_tweets_dict['eng'] = [' '.join(i) for i in train_eng]\n",
        "test_tweets_dict['eng'] = [' '.join(i) for i in test_eng]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uC29oWZv36k9",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_tweets_dict['hin'] = [' '.join(i) for i in train_hin]\n",
        "test_tweets_dict['hin'] = [' '.join(i) for i in test_hin]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5K_BLtFW38SK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_tweets_dict['o'] = [' '.join(i) for i in train_o]\n",
        "test_tweets_dict['o'] = [' '.join(i) for i in test_o]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sNh20-xu2Yq7",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "7fc16ef6-f5ce-4b31-afca-1858dc7d81b2"
      },
      "source": [
        "np.unique(train_labels)"
      ],
      "execution_count": 122,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array(['negative', 'neutral', 'positive'], dtype='<U8')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 122
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "n3u6_cLo07yx",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 81
        },
        "outputId": "13537e4a-5ddc-461a-a1db-ed66b752f647"
      },
      "source": [
        "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "import pandas as pd\n",
        "import random\n",
        "import numpy as np\n",
        "from keras.preprocessing import sequence\n",
        "from keras.utils import np_utils\n",
        "\n",
        "from keras.models import Sequential\n",
        "from keras.layers.core import Dense, Dropout, Activation, Lambda\n",
        "from keras.layers.embeddings import Embedding\n",
        "from keras.layers.recurrent import LSTM, SimpleRNN, GRU\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "from keras import optimizers"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Using TensorFlow backend.\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "<p style=\"color: red;\">\n",
              "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n",
              "We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n",
              "or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n",
              "<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZJGGWOMY2cBR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "label_values = {'negative':0, 'neutral':1, 'positive':2}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OKygtMNS2KVn",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "y_train = np.array([label_values[i] for i in train_labels])\n",
        "y_test = np.array([label_values[i] for i in test_labels])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CDe4kCvp76Ez",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WbKqX45L1K9p",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "max_features = 20000\n",
        "tokenizer1 = Tokenizer(num_words=max_features)\n",
        "tokenizer1.fit_on_texts(train_tweets)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kqUXn83O1i0V",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "outputId": "5c7ed4a4-f86f-491b-dc28-794d11738843"
      },
      "source": [
        "max_len = 250\n",
        "num_classes = 3\n",
        "\n",
        "sequences_train = tokenizer1.texts_to_sequences(train_tweets)\n",
        "sequences_test = tokenizer1.texts_to_sequences(test_tweets)\n",
        "\n",
        "X_train = sequence.pad_sequences(sequences_train, maxlen=max_len)\n",
        "X_test = sequence.pad_sequences(sequences_test, maxlen=max_len)\n",
        "\n",
        "Y_train = np_utils.to_categorical(y_train, num_classes)\n",
        "Y_test = np_utils.to_categorical(y_test, num_classes)\n",
        "\n",
        "print('X_train shape:', X_train.shape)\n",
        "print('X_test shape:', X_test.shape)"
      ],
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "X_train shape: (15131, 250)\n",
            "X_test shape: (1869, 250)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AA6THqVE3Elf",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_dim = X_train.shape[1]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nQJDXF6T22KB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_dim = X_train.shape[1]\n",
        "model1 = Sequential()\n",
        "\n",
        "model1.add(Dense(input_dim))\n",
        "model1.add(Dropout(0.2))\n",
        "model1.add(Activation('relu'))\n",
        "model1.add(Dense(200))\n",
        "model1.add(Dropout(0.2))\n",
        "model1.add(Activation('tanh'))\n",
        "model1.add(Dense(100))\n",
        "model1.add(Dropout(0.2))\n",
        "model1.add(Activation('sigmoid'))\n",
        "model1.add(Dense(3))\n",
        "model1.add(Activation('softmax'))\n",
        "\n",
        "adam = optimizers.Adam(lr=0.01, decay=1e-6)\n",
        "\n",
        "model1.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Cz5Qg-hI2rk_",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "f121d048-6d4b-419d-82de-a50d24ea7cce"
      },
      "source": [
        "model1.fit(X_train, Y_train, batch_size = 256, epochs=1)"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/1\n",
            "15131/15131 [==============================] - 1s 70us/step - loss: 0.6535 - acc: 0.6507\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f220a2c18>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 34
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IUh9VqF13a8q",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "81e6d0c1-ab8f-46ea-ae70-cc547f187fa5"
      },
      "source": [
        "preds = model1.predict_classes(X_test, verbose=0)\n",
        "np.sum(preds==y_test)/len(y_test)"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.4002140181915463"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 35
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ILc7YMLjJsaF",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 538
        },
        "outputId": "8d5c648b-16cf-4850-b3fd-875fa4a5c1e4"
      },
      "source": [
        "model1.summary()"
      ],
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_2\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "dense_5 (Dense)              (None, 250)               62750     \n",
            "_________________________________________________________________\n",
            "dropout_4 (Dropout)          (None, 250)               0         \n",
            "_________________________________________________________________\n",
            "activation_5 (Activation)    (None, 250)               0         \n",
            "_________________________________________________________________\n",
            "dense_6 (Dense)              (None, 200)               50200     \n",
            "_________________________________________________________________\n",
            "dropout_5 (Dropout)          (None, 200)               0         \n",
            "_________________________________________________________________\n",
            "activation_6 (Activation)    (None, 200)               0         \n",
            "_________________________________________________________________\n",
            "dense_7 (Dense)              (None, 100)               20100     \n",
            "_________________________________________________________________\n",
            "dropout_6 (Dropout)          (None, 100)               0         \n",
            "_________________________________________________________________\n",
            "activation_7 (Activation)    (None, 100)               0         \n",
            "_________________________________________________________________\n",
            "dense_8 (Dense)              (None, 3)                 303       \n",
            "_________________________________________________________________\n",
            "activation_8 (Activation)    (None, 3)                 0         \n",
            "=================================================================\n",
            "Total params: 133,353\n",
            "Trainable params: 133,353\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8fDdsocU4zd4",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "tokenizers = {}\n",
        "for v in ['eng', 'hin', 'o']:\n",
        "    tokenizers[v] = Tokenizer(num_words=max_features)\n",
        "    tokenizers[v].fit_on_texts(train_tweets_dict[v])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gp4HxqeK5NHm",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "outputId": "abb433b3-1990-439f-b712-331f6a101cd5"
      },
      "source": [
        "sequences_train_dict, sequences_test_dict = {}, {}\n",
        "X_train_dict, X_test_dict = {}, {}\n",
        "for v in ['eng', 'hin', 'o']:\n",
        "    sequences_train_dict[v] = tokenizers[v].texts_to_sequences(train_tweets)\n",
        "    sequences_test_dict[v] = tokenizers[v].texts_to_sequences(test_tweets)\n",
        "\n",
        "    X_train_dict[v] = sequence.pad_sequences(sequences_train_dict[v], maxlen=max_len)\n",
        "    X_test_dict[v] = sequence.pad_sequences(sequences_test_dict[v], maxlen=max_len)\n",
        "\n",
        "# Y_train = np_utils.to_categorical(y_train, num_classes)\n",
        "# Y_test = np_utils.to_categorical(y_test, num_classes)\n",
        "\n",
        "X_train2 = np.hstack(tuple([X_train] + [X_train_dict[v] for v in ['eng', 'hin', 'o']]))\n",
        "X_test2 = np.hstack(tuple([X_test] + [X_test_dict[v] for v in ['eng', 'hin', 'o']]))\n",
        "\n",
        "print('X_train2 shape:', X_train2.shape)\n",
        "print('X_test2 shape:', X_test2.shape)"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "X_train2 shape: (15131, 1000)\n",
            "X_test2 shape: (1869, 1000)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q01Pe0BFXwXl",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# sum(np.sum(emoji_train, axis=0) > 0)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "w66AmyT36fNY",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_dim2 = X_train2.shape[1]\n",
        "model2 = Sequential()\n",
        "\n",
        "model2.add(Dense(input_dim))\n",
        "model2.add(Dropout(0.2))\n",
        "model2.add(Activation('relu'))\n",
        "model2.add(Dense(200))\n",
        "model2.add(Dropout(0.2))\n",
        "model2.add(Activation('tanh'))\n",
        "model2.add(Dense(100))\n",
        "model2.add(Dropout(0.2))\n",
        "model2.add(Activation('sigmoid'))\n",
        "model2.add(Dense(3))\n",
        "model2.add(Activation('softmax'))\n",
        "\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "\n",
        "model2.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "k__XRSio68V4",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "ebcb5fc3-d773-4e93-f679-f244b84c78c4"
      },
      "source": [
        "model2.fit(X_train2, Y_train, batch_size = 256, epochs=1)"
      ],
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/1\n",
            "15131/15131 [==============================] - 2s 113us/step - loss: 0.6664 - acc: 0.6436\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f1a1f4cf8>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 42
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sOXvqSjh7DO1",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "199fef9a-949b-421c-bce9-45a689ee336d"
      },
      "source": [
        "preds2 = model2.predict_classes(X_test2, verbose=0)\n",
        "np.sum(preds2==y_test)/len(y_test)"
      ],
      "execution_count": 43,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.4071696094168004"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 43
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6pAujGiH7T0N",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "82579038-25f0-459f-d89e-3ef10e10a1d0"
      },
      "source": [
        "preds2[:10]"
      ],
      "execution_count": 44,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 44
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SmkqnBdkLMr6",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "outputId": "6077f179-a37a-4102-c20d-c72029482ebe"
      },
      "source": [
        "X_train3 = X_train.copy()\n",
        "X_test3 = X_test.copy()\n",
        "\n",
        "emoji_train = np.zeros((X_train.shape[0], 50))\n",
        "emoji_test = np.zeros((X_test.shape[0], 50))\n",
        "\n",
        "i = 0\n",
        "for sample in train_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    try:\n",
        "        tmp = lines_sample[0].split()[2]\n",
        "        tmp = lines_sample[0].split()[1]\n",
        "    except IndexError:\n",
        "        continue    \n",
        "    \n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "        for ch in t[0]:\n",
        "            if ch in emojis:\n",
        "                emoji_train[i][emoji_dict[ch]] += 1\n",
        "    i += 1\n",
        "\n",
        "i = 0\n",
        "for sample in test_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    try:\n",
        "        tmp = lines_sample[0].split()[2]\n",
        "        tmp = lines_sample[0].split()[1]\n",
        "    except IndexError:\n",
        "        continue\n",
        "\n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "        for ch in t[0]:\n",
        "            if ch in emojis:\n",
        "                emoji_test[i][emoji_dict[ch]] += 1\n",
        "    i += 1\n",
        "\n",
        "X_train3 = np.hstack((X_train, emoji_train))\n",
        "X_test3 = np.hstack((X_test, emoji_test))\n",
        "\n",
        "print('X_train3 shape:', X_train3.shape)\n",
        "print('X_test3 shape:', X_test3.shape)"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "X_train3 shape: (15131, 300)\n",
            "X_test3 shape: (1869, 300)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vH7e1na5aZT4",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "7dfc1f1e-31aa-4644-8640-ada6a9a61b1d"
      },
      "source": [
        "sum(np.sum(emoji_train, axis=1)>0)"
      ],
      "execution_count": 46,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "2112"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 46
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K7X1G-3VOwnT",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_dim3 = X_train3.shape[1]\n",
        "model3 = Sequential()\n",
        "\n",
        "model3.add(Dense(input_dim))\n",
        "model3.add(Dropout(0.2))\n",
        "model3.add(Activation('relu'))\n",
        "model3.add(Dense(200))\n",
        "model3.add(Dropout(0.2))\n",
        "model3.add(Activation('tanh'))\n",
        "model3.add(Dense(100))\n",
        "model3.add(Dropout(0.2))\n",
        "model3.add(Activation('sigmoid'))\n",
        "model3.add(Dense(3))\n",
        "model3.add(Activation('softmax'))\n",
        "\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "\n",
        "model3.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sXYYCFNDPCRI",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 173
        },
        "outputId": "fe744d46-436c-4319-ae93-90adffb4572a"
      },
      "source": [
        "model3.fit(X_train3, Y_train, batch_size = 512, epochs=4)"
      ],
      "execution_count": 60,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/4\n",
            "15131/15131 [==============================] - 2s 115us/step - loss: 0.7066 - acc: 0.6207\n",
            "Epoch 2/4\n",
            "15131/15131 [==============================] - 1s 38us/step - loss: 0.6396 - acc: 0.6603\n",
            "Epoch 3/4\n",
            "15131/15131 [==============================] - 1s 38us/step - loss: 0.6359 - acc: 0.6626\n",
            "Epoch 4/4\n",
            "15131/15131 [==============================] - 1s 38us/step - loss: 0.6343 - acc: 0.6631\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f19749be0>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 60
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ETjxCbb8PHMl",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "bc8196c9-d391-4208-ac33-1e8aaf325677"
      },
      "source": [
        "preds3 = model3.predict_classes(X_test3, verbose=0)\n",
        "np.sum(preds3==y_test)/len(y_test)"
      ],
      "execution_count": 61,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.32691278758694486"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 61
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FGYJbkjcYtSn",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "ede4745b-1786-47fd-b332-b33b39553ca7"
      },
      "source": [
        "sum(sum(emoji_train))"
      ],
      "execution_count": 62,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "5102.0"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 62
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8lGOcNgnPNKM",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "b799dff6-1721-418d-b43a-017a0c0e269c"
      },
      "source": [
        "X_train.shape"
      ],
      "execution_count": 149,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(15131, 250)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 149
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "avVAaVEX3l1I",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.tree import DecisionTreeClassifier"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oBhhwjWO3pvX",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        },
        "outputId": "1f996a56-f2af-4c3f-8f73-ff032c845a49"
      },
      "source": [
        "dtree1 = DecisionTreeClassifier()\n",
        "dtree1.fit(X_train, Y_train)"
      ],
      "execution_count": 65,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
              "                       max_features=None, max_leaf_nodes=None,\n",
              "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
              "                       min_samples_leaf=1, min_samples_split=2,\n",
              "                       min_weight_fraction_leaf=0.0, presort=False,\n",
              "                       random_state=None, splitter='best')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 65
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l5rwDIWGKi7T",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "3a3a69e5-afea-456d-f21e-4d02da6c0a9f"
      },
      "source": [
        "predsd1 = dtree1.predict(X_test)\n",
        "predsd1 = np.argmax(predsd1, axis=1)\n",
        "np.sum(predsd1==y_test)/len(y_test)"
      ],
      "execution_count": 66,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.3911182450508293"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 66
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "T2eJDbzBK9fq",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        },
        "outputId": "3e561970-f213-4824-87a6-a23fab886df6"
      },
      "source": [
        "dtree2 = DecisionTreeClassifier()\n",
        "dtree2.fit(X_train2, Y_train)"
      ],
      "execution_count": 67,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
              "                       max_features=None, max_leaf_nodes=None,\n",
              "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
              "                       min_samples_leaf=1, min_samples_split=2,\n",
              "                       min_weight_fraction_leaf=0.0, presort=False,\n",
              "                       random_state=None, splitter='best')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 67
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2IPi6lqNK8Ct",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "c63065ce-d635-4d59-a87e-605dd94c86c8"
      },
      "source": [
        "predsd2 = dtree2.predict(X_test2)\n",
        "predsd2 = np.argmax(predsd2, axis=1)\n",
        "np.sum(predsd2==y_test)/len(y_test)"
      ],
      "execution_count": 68,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.38095238095238093"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 68
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CKjk91n8P0Ju",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        },
        "outputId": "963755c1-6807-4845-9db5-b43eb9f1a1d1"
      },
      "source": [
        "dtree3 = DecisionTreeClassifier()\n",
        "dtree3.fit(X_train3, Y_train)"
      ],
      "execution_count": 69,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
              "                       max_features=None, max_leaf_nodes=None,\n",
              "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
              "                       min_samples_leaf=1, min_samples_split=2,\n",
              "                       min_weight_fraction_leaf=0.0, presort=False,\n",
              "                       random_state=None, splitter='best')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 69
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Q7R4-ZYoP2_L",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "c7aa9058-d994-4adb-c4f2-1303c84effda"
      },
      "source": [
        "predsd3 = dtree3.predict(X_test3)\n",
        "predsd3 = np.argmax(predsd3, axis=1)\n",
        "np.sum(predsd3==y_test)/len(y_test)"
      ],
      "execution_count": 70,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.39058319957196364"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 70
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f8grLdKPQRGN",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "876bd815-1b44-443d-c8fb-00440ae0cc5d"
      },
      "source": [
        "a = X_test3[(np.sum(emoji_test, axis=1)>0)]\n",
        "p = y_test[(np.sum(emoji_test, axis=1)>0)]\n",
        "predsd4 = dtree3.predict(a)\n",
        "predsd4 = np.argmax(predsd4, axis=1)\n",
        "np.sum(predsd4==p)/len(p)"
      ],
      "execution_count": 78,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.4262295081967213"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 78
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4A1riiirhQ5p",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from keras.layers.normalization import BatchNormalization\n",
        "from keras.layers import SpatialDropout1D\n",
        "from keras.models import Model\n",
        "from keras.layers import Input,Flatten, Dense, Embedding, RNN, Conv1D, BatchNormalization, MaxPooling1D, Activation, Dropout, concatenate, Lambda\n",
        "from keras import optimizers\n",
        "from keras.layers.convolutional import Convolution1D\n",
        "from keras import backend as K"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gqCJcTt9hEoD",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        },
        "outputId": "8b529ddd-651f-4f8e-d8c6-f3e96cee1811"
      },
      "source": [
        "nb_filter = 300\n",
        "filter_length = 3\n",
        "hidden_dims = 300 # 250\n",
        "nb_epoch = 2\n",
        "\n",
        "\n",
        "cmodel1 = Sequential()\n",
        "cmodel1.add(Embedding(max_features, 300))\n",
        "cmodel1.add(SpatialDropout1D(0.2))\n",
        "# we add a Convolution1D, which will learn nb_filter\n",
        "# word group filters of size filter_length:\n",
        "cmodel1.add(Convolution1D(nb_filter=nb_filter,\n",
        "                       filter_length=filter_length,\n",
        "                       border_mode='valid',\n",
        "                       activation='tanh',\n",
        "                       subsample_length=1))\n",
        "\n",
        "#cmodel1.add(BatchNormalization())\n",
        "from keras import optimizers\n",
        "def max_1d(X):\n",
        "    return K.max(X, axis=1)\n",
        "\n",
        "cmodel1.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
        "cmodel1.add(Dense(hidden_dims))\n",
        "cmodel1.add(Dropout(0.2))\n",
        "cmodel1.add(Activation('relu'))\n",
        "cmodel1.add(Dense(num_classes))\n",
        "cmodel1.add(Activation('sigmoid'))\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "cmodel1.compile(loss='binary_crossentropy',\n",
        "             optimizer=adam,\n",
        "             metrics=['accuracy'])"
      ],
      "execution_count": 106,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
            "  app.launch_new_instance()\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5SIgsZezhsJv",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "d31955e9-b6dd-459f-a3ec-51f9fbd2f776"
      },
      "source": [
        "cmodel1.fit(X_train3, Y_train, epochs = 1)"
      ],
      "execution_count": 107,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/1\n",
            "15131/15131 [==============================] - 237s 16ms/step - loss: 0.5342 - acc: 0.7211\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f16bf6748>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 107
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bPB2nA7wixq8",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "ae705a5a-4024-42d4-90b1-1460caaa1ea2"
      },
      "source": [
        "predsc1 = cmodel1.predict_classes(X_test3, verbose=0)\n",
        "np.sum(predsc1==y_test)/len(y_test)"
      ],
      "execution_count": 109,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.565008025682183"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 109
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cFj06lng5S12",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "f9b8a66d-bb98-47a2-dd14-e02571cd691b"
      },
      "source": [
        "prf(y_test, predsc1, average='micro')"
      ],
      "execution_count": 167,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.565008025682183, 0.565008025682183, 0.565008025682183, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 167
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OV8yft9S5n2p",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "8463406e-a582-4ac1-d737-62a6a45dee8f"
      },
      "source": [
        "prf(y_test, predsc1, average='macro')"
      ],
      "execution_count": 168,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.5659138903963613, 0.5840501910447199, 0.5662953882918141, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 168
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LFglzGiw5rVT",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "d8262ee5-fb84-4959-a23d-3245894fa599"
      },
      "source": [
        "prf(y_test, predsc1, average='weighted')"
      ],
      "execution_count": 169,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.5675574871168725, 0.565008025682183, 0.5569762553083624, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 169
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iKXK20SYjOZj",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 469
        },
        "outputId": "df05a7c1-d2ac-45a9-bec8-44a0b7673c43"
      },
      "source": [
        "cmodel1.summary()"
      ],
      "execution_count": 108,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_20\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_12 (Embedding)     (None, None, 300)         6000000   \n",
            "_________________________________________________________________\n",
            "spatial_dropout1d_11 (Spatia (None, None, 300)         0         \n",
            "_________________________________________________________________\n",
            "conv1d_9 (Conv1D)            (None, None, 300)         270300    \n",
            "_________________________________________________________________\n",
            "lambda_9 (Lambda)            (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_46 (Dense)             (None, 300)               90300     \n",
            "_________________________________________________________________\n",
            "dropout_32 (Dropout)         (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "activation_46 (Activation)   (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_47 (Dense)             (None, 3)                 903       \n",
            "_________________________________________________________________\n",
            "activation_47 (Activation)   (None, 3)                 0         \n",
            "=================================================================\n",
            "Total params: 6,361,503\n",
            "Trainable params: 6,361,503\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NWkJByaVs_ow",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        },
        "outputId": "351f49b0-1849-47cc-ba7f-c09ab01aef60"
      },
      "source": [
        "nb_filter = 300\n",
        "filter_length = 3\n",
        "hidden_dims = 300 # 250\n",
        "nb_epoch = 2\n",
        "\n",
        "\n",
        "cmodel1a = Sequential()\n",
        "cmodel1a.add(Embedding(max_features, 300))\n",
        "cmodel1a.add(SpatialDropout1D(0.2))\n",
        "# we add a Convolution1D, which will learn nb_filter\n",
        "# word group filters of size filter_length:\n",
        "cmodel1a.add(Convolution1D(nb_filter=nb_filter,\n",
        "                       filter_length=filter_length,\n",
        "                       border_mode='valid',\n",
        "                       activation='tanh',\n",
        "                       subsample_length=1))\n",
        "\n",
        "cmodel1a.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
        "cmodel1a.add(Dense(hidden_dims))\n",
        "cmodel1a.add(Dropout(0.2))\n",
        "cmodel1a.add(Activation('relu'))\n",
        "cmodel1a.add(Dense(num_classes))\n",
        "cmodel1a.add(Activation('sigmoid'))\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "cmodel1a.compile(loss='binary_crossentropy',\n",
        "             optimizer=adam,\n",
        "             metrics=['accuracy'])"
      ],
      "execution_count": 145,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
            "  app.launch_new_instance()\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OgoK_J2cjtic",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104
        },
        "outputId": "7c41bb80-bfb6-4548-f280-7254855740a8"
      },
      "source": [
        "cmodel1a.fit(X_train3, Y_train, epochs = 2, batch_size=256)"
      ],
      "execution_count": 146,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/2\n",
            "15131/15131 [==============================] - 173s 11ms/step - loss: 0.5879 - acc: 0.6856\n",
            "Epoch 2/2\n",
            "15131/15131 [==============================] - 168s 11ms/step - loss: 0.4583 - acc: 0.7792\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f1368c940>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 146
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SgnkbkcPtJHB",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "d4ae2a11-387e-4178-fe2a-a2df44faa7c5"
      },
      "source": [
        "predsc1a = cmodel1a.predict_classes(X_test3, verbose=0)\n",
        "np.sum(predsc1a==y_test)/len(y_test)"
      ],
      "execution_count": 147,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.5644729802033173"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 147
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VcAy5qNE207S",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.metrics import precision_recall_fscore_support as prf"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mvjiLm4z3CDa",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "2e407171-f2e3-436c-9f5f-c873a290962e"
      },
      "source": [
        "prf(y_test, predsc1a, average='micro')"
      ],
      "execution_count": 163,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.5644729802033173, 0.5644729802033173, 0.5644729802033173, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 163
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RRe5rJKX4IPQ",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a4045c61-c53d-41f5-a146-1f37c97a9743"
      },
      "source": [
        "prf(y_test, predsc1a, average='macro')"
      ],
      "execution_count": 164,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.5708475748562786, 0.563563693418115, 0.5666094135651139, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 164
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KUOgZQBe4NQK",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "e9451770-bfdb-45e0-bf94-8b7973afbf0f"
      },
      "source": [
        "prf(y_test, predsc1a, average='weighted')"
      ],
      "execution_count": 166,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(0.5663970904202026, 0.5644729802033173, 0.5648397339912556, None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 166
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "outputId": "b8acaefe-008c-4714-9919-28d2f304d49a",
        "id": "anNM5TySjt77",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        }
      },
      "source": [
        "cmodel2 = Sequential()\n",
        "cmodel2.add(Embedding(max_features, 500))\n",
        "cmodel2.add(SpatialDropout1D(0.2))\n",
        "# we add a Convolution1D, which will learn nb_filter\n",
        "# word group filters of size filter_length:\n",
        "cmodel2.add(Convolution1D(nb_filter=nb_filter,\n",
        "                       filter_length=filter_length,\n",
        "                       border_mode='valid',\n",
        "                       activation='tanh',\n",
        "                       subsample_length=1))\n",
        "\n",
        "cmodel2.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
        "cmodel2.add(Dense(hidden_dims))\n",
        "cmodel2.add(Dropout(0.2))\n",
        "cmodel2.add(Activation('relu'))\n",
        "cmodel2.add(Dense(num_classes))\n",
        "cmodel2.add(Activation('sigmoid'))\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "cmodel2.compile(loss='binary_crossentropy',\n",
        "             optimizer=adam,\n",
        "             metrics=['accuracy'])"
      ],
      "execution_count": 132,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
            "  # Remove the CWD from sys.path while we load stuff.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "L3s7q3rqj50A",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "0ff1d71c-e282-4cc7-b846-bacfae25dbb1"
      },
      "source": [
        "cmodel2.fit(X_train2, Y_train, epochs = 1)"
      ],
      "execution_count": 133,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/1\n",
            "15131/15131 [==============================] - 986s 65ms/step - loss: 0.5525 - acc: 0.7080\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f13d670b8>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 133
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oB7-3ZwTkGBE",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "5d6905c3-5f8b-4d1d-e381-632aaee0069e"
      },
      "source": [
        "predsc2 = cmodel2.predict_classes(X_test3, verbose=0)\n",
        "np.sum(predsc2==y_test)/len(y_test)"
      ],
      "execution_count": 134,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.46441947565543074"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 134
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ch95QsenkLAY",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 469
        },
        "outputId": "f235abbe-3114-4524-8194-cb9cdd49259b"
      },
      "source": [
        "cmodel2.summary()"
      ],
      "execution_count": 117,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_22\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_14 (Embedding)     (None, None, 500)         10000000  \n",
            "_________________________________________________________________\n",
            "spatial_dropout1d_13 (Spatia (None, None, 500)         0         \n",
            "_________________________________________________________________\n",
            "conv1d_11 (Conv1D)           (None, None, 300)         450300    \n",
            "_________________________________________________________________\n",
            "lambda_11 (Lambda)           (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_50 (Dense)             (None, 300)               90300     \n",
            "_________________________________________________________________\n",
            "dropout_34 (Dropout)         (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "activation_50 (Activation)   (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_51 (Dense)             (None, 3)                 903       \n",
            "_________________________________________________________________\n",
            "activation_51 (Activation)   (None, 3)                 0         \n",
            "=================================================================\n",
            "Total params: 10,541,503\n",
            "Trainable params: 10,541,503\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cItt6ON9TKMF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install emoji\n",
        "import emoji"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F8jVacTFkbu2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "new_train_data = []\n",
        "\n",
        "for sample in train_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    try:\n",
        "        tmp = (lines_sample[0].split()[2])\n",
        "        tmp = (lines_sample[0].split()[1])\n",
        "    except IndexError:\n",
        "        continue\n",
        "    temp = []\n",
        "    \n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "\n",
        "        if t[1] != 'O':\n",
        "            t[0]=re.sub('[\\W_]+', '', t[0])\n",
        "            new = ''\n",
        "            for ch in t[0]:\n",
        "                if ch in emojis:\n",
        "                    new += ' ' + emoji.demojize(ch) + ' '\n",
        "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
        "            continue\n",
        "        if 'http' in t[0]:\n",
        "            continue\n",
        "        temp.append(t[0])\n",
        "    if temp == []:\n",
        "        continue\n",
        "    new_train_data.append(temp)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fZOiuRo1k2M_",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "new_test_data = []\n",
        "# emoji_test = []\n",
        "for sample in test_text.split('\\n\\n'):\n",
        "    \n",
        "    lines_sample = sample.split('\\n')\n",
        "    try:\n",
        "        tmp = (lines_sample[0].split()[2])\n",
        "        tmp = (lines_sample[0].split()[1])\n",
        "    except IndexError:\n",
        "        continue\n",
        "    temp = []\n",
        "    \n",
        "    for line in lines_sample[1:]:\n",
        "        t = line.split('\\t')\n",
        "\n",
        "        if t[1] != 'O':\n",
        "            t[0]=re.sub('[\\W_]+', '', t[0])\n",
        "            new = ''\n",
        "            for ch in t[0]:\n",
        "                if ch in emojis:\n",
        "                    new += ' ' + emoji.demojize(ch) + ' '\n",
        "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
        "            continue\n",
        "        if 'http' in t[0]:\n",
        "            continue\n",
        "        temp.append(t[0])\n",
        "    if temp == []:\n",
        "        continue\n",
        "    new_test_data.append(temp)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gAVZFFL5kSyE",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "new_train_tweets = [' '.join(i) for i in new_train_data]\n",
        "new_test_tweets = [' '.join(i) for i in new_test_data]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "siNMAS_0mUfV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "max_features = 20000\n",
        "tokenizer2 = Tokenizer(num_words=max_features)\n",
        "tokenizer2.fit_on_texts(new_train_tweets)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m5U5FfAKkTh_",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "outputId": "e4529ffe-5fe1-4879-ea03-5896cb3ed70c"
      },
      "source": [
        "max_len = 250\n",
        "num_classes = 3\n",
        "\n",
        "new_sequences_train = tokenizer2.texts_to_sequences(new_train_tweets)\n",
        "new_sequences_test = tokenizer2.texts_to_sequences(new_test_tweets)\n",
        "\n",
        "X_train4 = sequence.pad_sequences(new_sequences_train, maxlen=max_len)\n",
        "X_test4 = sequence.pad_sequences(new_sequences_test, maxlen=max_len)\n",
        "\n",
        "# Y_train = np_utils.to_categorical(y_train, num_classes)\n",
        "# Y_test = np_utils.to_categorical(y_test, num_classes)\n",
        "\n",
        "print('X_train4 shape:', X_train4.shape)\n",
        "print('X_test4 shape:', X_test4.shape)"
      ],
      "execution_count": 126,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "X_train4 shape: (15131, 250)\n",
            "X_test4 shape: (1869, 250)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fNAfKJRYmyaR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "outputId": "2d4f0ab4-1e04-431e-e3c2-20e0f26ca7cc",
        "id": "mZODp4rXm0G3",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        }
      },
      "source": [
        "cmodel3 = Sequential()\n",
        "cmodel3.add(Embedding(max_features, 300))\n",
        "cmodel3.add(SpatialDropout1D(0.2))\n",
        "# we add a Convolution1D, which will learn nb_filter\n",
        "# word group filters of size filter_length:\n",
        "cmodel3.add(Convolution1D(nb_filter=nb_filter,\n",
        "                       filter_length=filter_length,\n",
        "                       border_mode='valid',\n",
        "                       activation='tanh',\n",
        "                       subsample_length=1))\n",
        "\n",
        "cmodel3.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
        "cmodel3.add(Dense(hidden_dims))\n",
        "cmodel3.add(Dropout(0.2))\n",
        "cmodel3.add(Activation('relu'))\n",
        "cmodel3.add(Dense(num_classes))\n",
        "cmodel3.add(Activation('sigmoid'))\n",
        "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
        "cmodel3.compile(loss='binary_crossentropy',\n",
        "             optimizer=adam,\n",
        "             metrics=['accuracy'])"
      ],
      "execution_count": 127,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
            "  # Remove the CWD from sys.path while we load stuff.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "69J1sfpOm9ug",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "ad67b870-dbc5-4ad4-d162-509a99a116f5"
      },
      "source": [
        "cmodel3.fit(X_train4, Y_train, epochs = 1)"
      ],
      "execution_count": 129,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/1\n",
            "15131/15131 [==============================] - 216s 14ms/step - loss: 0.5303 - acc: 0.7264\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f0f143d9898>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 129
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HWM5IozSnrQb",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "5c630355-d5e8-42b8-826a-01d5621d4cbb"
      },
      "source": [
        "predsc3 = cmodel3.predict_classes(X_test4, verbose=0)\n",
        "np.sum(predsc3==y_test)/len(y_test)"
      ],
      "execution_count": 130,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.5521669341894061"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 130
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OSVeBC5uoD_d",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 469
        },
        "outputId": "ae0b085c-9980-42ba-f598-8307bd3210a3"
      },
      "source": [
        "cmodel3.summary()"
      ],
      "execution_count": 131,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_23\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_15 (Embedding)     (None, None, 300)         6000000   \n",
            "_________________________________________________________________\n",
            "spatial_dropout1d_14 (Spatia (None, None, 300)         0         \n",
            "_________________________________________________________________\n",
            "conv1d_12 (Conv1D)           (None, None, 300)         270300    \n",
            "_________________________________________________________________\n",
            "lambda_12 (Lambda)           (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_52 (Dense)             (None, 300)               90300     \n",
            "_________________________________________________________________\n",
            "dropout_35 (Dropout)         (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "activation_52 (Activation)   (None, 300)               0         \n",
            "_________________________________________________________________\n",
            "dense_53 (Dense)             (None, 3)                 903       \n",
            "_________________________________________________________________\n",
            "activation_53 (Activation)   (None, 3)                 0         \n",
            "=================================================================\n",
            "Total params: 6,361,503\n",
            "Trainable params: 6,361,503\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5-B6qEc6BJJJ",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 191
        },
        "outputId": "6e4a8b2b-0fbc-429c-a907-2caa8eac5d8e"
      },
      "source": [
        "print(train_text[:100])"
      ],
      "execution_count": 126,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "meta\t3\tnegative\n",
            "@\tO\n",
            "AdilNisarButt\tHin\n",
            "pakistan\tHin\n",
            "ka\tHin\n",
            "ghra\tHin\n",
            "tauq\tHin\n",
            "he\tEng\n",
            "Pakistan\tEng\n",
            "Isra\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XHN9MSnd5wcC",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        },
        "outputId": "282da6de-3570-453a-9c8b-31166287f94e"
      },
      "source": [
        "a = np.array([[1,1,1,1], [2,2,2,2]])\n",
        "b = np.array([[3,3],[4,4]])\n",
        "print(a)\n",
        "print(b)\n",
        "print(np.hstack((a,b)))"
      ],
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[[1 1 1 1]\n",
            " [2 2 2 2]]\n",
            "[[3 3]\n",
            " [4 4]]\n",
            "[[1 1 1 1 3 3]\n",
            " [2 2 2 2 4 4]]\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
 }