Skip to content

Instantly share code, notes, and snippets.

@akhileshravi
Last active December 26, 2019 19:33
Show Gist options
  • Save akhileshravi/ec8e861903072643ca40a4ddd5358f4c to your computer and use it in GitHub Desktop.
Save akhileshravi/ec8e861903072643ca40a4ddd5358f4c to your computer and use it in GitHub Desktop.
16110007 Assignment 3 NLP
Name: Akhilesh Ravi
Roll No.: 16110007
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "NLP_Assignment3_16110007",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "9dTzSCUXYQyj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "c335ef41-cb62-450e-a2d8-4bbfbb599a7d"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/gdrive')"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "YJ_e3TKg9U_N",
"colab_type": "code",
"colab": {}
},
"source": [
"path = \"/content/gdrive/My Drive/Semester 7/NLP/Assignment3/\"\n",
"with open(path + \"train.txt\", 'r') as ftrain:\n",
" train_text = ftrain.read()\n",
"with open(path + \"test.txt\", 'r') as ftest:\n",
" test_text = ftest.read()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "IIFVs3aW78NQ",
"colab_type": "code",
"colab": {}
},
"source": [
"import nltk\n",
"# nltk.download('stopwords')\n",
"from nltk.corpus import stopwords\n",
"import re"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ac5LT9gO8b44",
"colab_type": "code",
"colab": {}
},
"source": [
"help(stopwords)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zUoUl3RN86fc",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "a6ac02d2-1312-4aff-cfe9-d8ad82ae4f3a"
},
"source": [
"stopwords_en = stopwords.words('english')\n",
"exclude = ['very', 'not', 'never', 'no', 'ever', 'nothing', 'really', 'extremely']\n",
"for i in exclude:\n",
" if i not in stopwords_en:\n",
" print(i, end=' ')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"never ever nothing really extremely "
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "2im06gLjauis",
"colab_type": "code",
"colab": {}
},
"source": [
"emojis = ['😂', '❤', '♥', '😍', '😭', '😘', '😊', '👌', '💕', '👏', '😁', '☺', '♡', '👍', '😩', '🙏', '✌', '😏', '😉', '🙌',\n",
" '🙈', '💪', '😄', '😒', '💃', '💖', '😃', '😔', '😱', '🎉', '😜', '☯', '🌸', '💜', '💙', '✨', '😳', '💗', '★',\n",
" '☀', '😡', '😎', '😢', '💋', '😋', '🙊', '😴', '🎶', '💞', '😌']\n",
"emoji_dict = {emojis[i]: i for i in range(len(emojis))}\n",
"# 50 most frequently used emojis from https://www.kaggle.com/thomasseleck/emoji-sentiment-data"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "kb4JWfw9-dvK",
"colab_type": "code",
"colab": {}
},
"source": [
"train_id = []\n",
"train_data = []\n",
"train_hin, train_eng, train_o, train_labels = [], [], [], []\n",
"# emoji_train = []\n",
"for sample in train_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" # print(lines_sample[0])\n",
" try:\n",
" train_labels.append(lines_sample[0].split()[2])\n",
" train_id.append(lines_sample[0].split()[1])\n",
" except IndexError:\n",
" del train_id[-1]\n",
" continue\n",
" temp, temp_eng, temp_hin, temp_o = [], [], [], []\n",
" \n",
" \n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
"\n",
" if t[1] != 'O':\n",
" t[0]=re.sub('[\\W_]+', '', t[0])\n",
" if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
" continue\n",
" if 'http' in t[0]:\n",
" continue\n",
" temp.append(t[0])\n",
" if t[1] == 'Eng':\n",
" temp_eng.append(t[0])\n",
" elif t[1] == 'Hin':\n",
" temp_hin.append(t[0])\n",
" elif t[1] == 'O':\n",
" temp_o.append(t[0])\n",
" if temp == []:\n",
" continue\n",
" train_data.append(temp)\n",
" train_eng.append(temp_eng)\n",
" train_hin.append(temp_hin)\n",
" train_o.append(temp_o)\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Vgrdip_g_Fej",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "bf6b5e72-10b6-4075-9f93-49a79dace711"
},
"source": [
"print(len(train_text.split('\\n\\n')))"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"15132\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fZu39o6-0L2i",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "a2488ce9-89d1-4b49-c9d9-df756a66352c"
},
"source": [
"len(train_data)"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"15131"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "X3elTZuw0gQb",
"colab_type": "code",
"colab": {}
},
"source": [
"test_id = []\n",
"test_data = []\n",
"test_hin, test_eng, test_o, test_labels = [], [], [], []\n",
"\n",
"for sample in test_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" # print(lines_sample[0])\n",
" try:\n",
" test_labels.append(lines_sample[0].split()[2])\n",
" test_id.append(lines_sample[0].split()[1])\n",
" except IndexError:\n",
" del test_id[-1]\n",
" continue\n",
" temp, temp_eng, temp_hin, temp_o = [], [], [], []\n",
" \n",
" \n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
" t[0] = t[0].lower()\n",
" if t[1] != 'O':\n",
" t[0]=re.sub('[\\W_]+', '', t[0])\n",
" if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
" continue\n",
" if 'http' in t[0]:\n",
" continue\n",
" temp.append(t[0])\n",
" if t[1] == 'Eng':\n",
" temp_eng.append(t[0])\n",
" elif t[1] == 'Hin':\n",
" temp_hin.append(t[0])\n",
" elif t[1] == 'O':\n",
" temp_o.append(t[0])\n",
" if temp == []:\n",
" continue\n",
" test_data.append(temp)\n",
" test_eng.append(temp_eng)\n",
" test_hin.append(temp_hin)\n",
" test_o.append(temp_o)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KoLNJ4VA0wCF",
"colab_type": "code",
"colab": {}
},
"source": [
"train_tweets = [' '.join(i) for i in train_data]\n",
"test_tweets = [' '.join(i) for i in test_data]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "DA0RtaGc36nz",
"colab_type": "code",
"colab": {}
},
"source": [
"train_tweets_dict = {}\n",
"test_tweets_dict = {}"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "3wseReG-4oib",
"colab_type": "code",
"colab": {}
},
"source": [
"train_tweets_dict['eng'] = [' '.join(i) for i in train_eng]\n",
"test_tweets_dict['eng'] = [' '.join(i) for i in test_eng]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uC29oWZv36k9",
"colab_type": "code",
"colab": {}
},
"source": [
"train_tweets_dict['hin'] = [' '.join(i) for i in train_hin]\n",
"test_tweets_dict['hin'] = [' '.join(i) for i in test_hin]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5K_BLtFW38SK",
"colab_type": "code",
"colab": {}
},
"source": [
"train_tweets_dict['o'] = [' '.join(i) for i in train_o]\n",
"test_tweets_dict['o'] = [' '.join(i) for i in test_o]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "sNh20-xu2Yq7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "7fc16ef6-f5ce-4b31-afca-1858dc7d81b2"
},
"source": [
"np.unique(train_labels)"
],
"execution_count": 122,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['negative', 'neutral', 'positive'], dtype='<U8')"
]
},
"metadata": {
"tags": []
},
"execution_count": 122
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "n3u6_cLo07yx",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 81
},
"outputId": "13537e4a-5ddc-461a-a1db-ed66b752f647"
},
"source": [
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from keras.preprocessing.text import Tokenizer\n",
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd\n",
"import random\n",
"import numpy as np\n",
"from keras.preprocessing import sequence\n",
"from keras.utils import np_utils\n",
"\n",
"from keras.models import Sequential\n",
"from keras.layers.core import Dense, Dropout, Activation, Lambda\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.layers.recurrent import LSTM, SimpleRNN, GRU\n",
"from keras.preprocessing.text import Tokenizer\n",
"from keras import optimizers"
],
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"<p style=\"color: red;\">\n",
"The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n",
"We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n",
"or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n",
"<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZJGGWOMY2cBR",
"colab_type": "code",
"colab": {}
},
"source": [
"label_values = {'negative':0, 'neutral':1, 'positive':2}"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OKygtMNS2KVn",
"colab_type": "code",
"colab": {}
},
"source": [
"y_train = np.array([label_values[i] for i in train_labels])\n",
"y_test = np.array([label_values[i] for i in test_labels])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CDe4kCvp76Ez",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WbKqX45L1K9p",
"colab_type": "code",
"colab": {}
},
"source": [
"max_features = 20000\n",
"tokenizer1 = Tokenizer(num_words=max_features)\n",
"tokenizer1.fit_on_texts(train_tweets)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "kqUXn83O1i0V",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "5c7ed4a4-f86f-491b-dc28-794d11738843"
},
"source": [
"max_len = 250\n",
"num_classes = 3\n",
"\n",
"sequences_train = tokenizer1.texts_to_sequences(train_tweets)\n",
"sequences_test = tokenizer1.texts_to_sequences(test_tweets)\n",
"\n",
"X_train = sequence.pad_sequences(sequences_train, maxlen=max_len)\n",
"X_test = sequence.pad_sequences(sequences_test, maxlen=max_len)\n",
"\n",
"Y_train = np_utils.to_categorical(y_train, num_classes)\n",
"Y_test = np_utils.to_categorical(y_test, num_classes)\n",
"\n",
"print('X_train shape:', X_train.shape)\n",
"print('X_test shape:', X_test.shape)"
],
"execution_count": 28,
"outputs": [
{
"output_type": "stream",
"text": [
"X_train shape: (15131, 250)\n",
"X_test shape: (1869, 250)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "AA6THqVE3Elf",
"colab_type": "code",
"colab": {}
},
"source": [
"input_dim = X_train.shape[1]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nQJDXF6T22KB",
"colab_type": "code",
"colab": {}
},
"source": [
"input_dim = X_train.shape[1]\n",
"model1 = Sequential()\n",
"\n",
"model1.add(Dense(input_dim))\n",
"model1.add(Dropout(0.2))\n",
"model1.add(Activation('relu'))\n",
"model1.add(Dense(200))\n",
"model1.add(Dropout(0.2))\n",
"model1.add(Activation('tanh'))\n",
"model1.add(Dense(100))\n",
"model1.add(Dropout(0.2))\n",
"model1.add(Activation('sigmoid'))\n",
"model1.add(Dense(3))\n",
"model1.add(Activation('softmax'))\n",
"\n",
"adam = optimizers.Adam(lr=0.01, decay=1e-6)\n",
"\n",
"model1.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Cz5Qg-hI2rk_",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"outputId": "f121d048-6d4b-419d-82de-a50d24ea7cce"
},
"source": [
"model1.fit(X_train, Y_train, batch_size = 256, epochs=1)"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/1\n",
"15131/15131 [==============================] - 1s 70us/step - loss: 0.6535 - acc: 0.6507\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f220a2c18>"
]
},
"metadata": {
"tags": []
},
"execution_count": 34
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "IUh9VqF13a8q",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "81e6d0c1-ab8f-46ea-ae70-cc547f187fa5"
},
"source": [
"preds = model1.predict_classes(X_test, verbose=0)\n",
"np.sum(preds==y_test)/len(y_test)"
],
"execution_count": 35,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.4002140181915463"
]
},
"metadata": {
"tags": []
},
"execution_count": 35
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ILc7YMLjJsaF",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 538
},
"outputId": "8d5c648b-16cf-4850-b3fd-875fa4a5c1e4"
},
"source": [
"model1.summary()"
],
"execution_count": 36,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential_2\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"dense_5 (Dense) (None, 250) 62750 \n",
"_________________________________________________________________\n",
"dropout_4 (Dropout) (None, 250) 0 \n",
"_________________________________________________________________\n",
"activation_5 (Activation) (None, 250) 0 \n",
"_________________________________________________________________\n",
"dense_6 (Dense) (None, 200) 50200 \n",
"_________________________________________________________________\n",
"dropout_5 (Dropout) (None, 200) 0 \n",
"_________________________________________________________________\n",
"activation_6 (Activation) (None, 200) 0 \n",
"_________________________________________________________________\n",
"dense_7 (Dense) (None, 100) 20100 \n",
"_________________________________________________________________\n",
"dropout_6 (Dropout) (None, 100) 0 \n",
"_________________________________________________________________\n",
"activation_7 (Activation) (None, 100) 0 \n",
"_________________________________________________________________\n",
"dense_8 (Dense) (None, 3) 303 \n",
"_________________________________________________________________\n",
"activation_8 (Activation) (None, 3) 0 \n",
"=================================================================\n",
"Total params: 133,353\n",
"Trainable params: 133,353\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "8fDdsocU4zd4",
"colab_type": "code",
"colab": {}
},
"source": [
"tokenizers = {}\n",
"for v in ['eng', 'hin', 'o']:\n",
" tokenizers[v] = Tokenizer(num_words=max_features)\n",
" tokenizers[v].fit_on_texts(train_tweets_dict[v])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gp4HxqeK5NHm",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "abb433b3-1990-439f-b712-331f6a101cd5"
},
"source": [
"sequences_train_dict, sequences_test_dict = {}, {}\n",
"X_train_dict, X_test_dict = {}, {}\n",
"for v in ['eng', 'hin', 'o']:\n",
" sequences_train_dict[v] = tokenizers[v].texts_to_sequences(train_tweets)\n",
" sequences_test_dict[v] = tokenizers[v].texts_to_sequences(test_tweets)\n",
"\n",
" X_train_dict[v] = sequence.pad_sequences(sequences_train_dict[v], maxlen=max_len)\n",
" X_test_dict[v] = sequence.pad_sequences(sequences_test_dict[v], maxlen=max_len)\n",
"\n",
"# Y_train = np_utils.to_categorical(y_train, num_classes)\n",
"# Y_test = np_utils.to_categorical(y_test, num_classes)\n",
"\n",
"X_train2 = np.hstack(tuple([X_train] + [X_train_dict[v] for v in ['eng', 'hin', 'o']]))\n",
"X_test2 = np.hstack(tuple([X_test] + [X_test_dict[v] for v in ['eng', 'hin', 'o']]))\n",
"\n",
"print('X_train2 shape:', X_train2.shape)\n",
"print('X_test2 shape:', X_test2.shape)"
],
"execution_count": 38,
"outputs": [
{
"output_type": "stream",
"text": [
"X_train2 shape: (15131, 1000)\n",
"X_test2 shape: (1869, 1000)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "q01Pe0BFXwXl",
"colab_type": "code",
"colab": {}
},
"source": [
"# sum(np.sum(emoji_train, axis=0) > 0)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "w66AmyT36fNY",
"colab_type": "code",
"colab": {}
},
"source": [
"input_dim2 = X_train2.shape[1]\n",
"model2 = Sequential()\n",
"\n",
"model2.add(Dense(input_dim))\n",
"model2.add(Dropout(0.2))\n",
"model2.add(Activation('relu'))\n",
"model2.add(Dense(200))\n",
"model2.add(Dropout(0.2))\n",
"model2.add(Activation('tanh'))\n",
"model2.add(Dense(100))\n",
"model2.add(Dropout(0.2))\n",
"model2.add(Activation('sigmoid'))\n",
"model2.add(Dense(3))\n",
"model2.add(Activation('softmax'))\n",
"\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"\n",
"model2.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "k__XRSio68V4",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"outputId": "ebcb5fc3-d773-4e93-f679-f244b84c78c4"
},
"source": [
"model2.fit(X_train2, Y_train, batch_size = 256, epochs=1)"
],
"execution_count": 42,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/1\n",
"15131/15131 [==============================] - 2s 113us/step - loss: 0.6664 - acc: 0.6436\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f1a1f4cf8>"
]
},
"metadata": {
"tags": []
},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "sOXvqSjh7DO1",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "199fef9a-949b-421c-bce9-45a689ee336d"
},
"source": [
"preds2 = model2.predict_classes(X_test2, verbose=0)\n",
"np.sum(preds2==y_test)/len(y_test)"
],
"execution_count": 43,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.4071696094168004"
]
},
"metadata": {
"tags": []
},
"execution_count": 43
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "6pAujGiH7T0N",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "82579038-25f0-459f-d89e-3ef10e10a1d0"
},
"source": [
"preds2[:10]"
],
"execution_count": 44,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])"
]
},
"metadata": {
"tags": []
},
"execution_count": 44
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SmkqnBdkLMr6",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "6077f179-a37a-4102-c20d-c72029482ebe"
},
"source": [
"X_train3 = X_train.copy()\n",
"X_test3 = X_test.copy()\n",
"\n",
"emoji_train = np.zeros((X_train.shape[0], 50))\n",
"emoji_test = np.zeros((X_test.shape[0], 50))\n",
"\n",
"i = 0\n",
"for sample in train_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" try:\n",
" tmp = lines_sample[0].split()[2]\n",
" tmp = lines_sample[0].split()[1]\n",
" except IndexError:\n",
" continue \n",
" \n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
" for ch in t[0]:\n",
" if ch in emojis:\n",
" emoji_train[i][emoji_dict[ch]] += 1\n",
" i += 1\n",
"\n",
"i = 0\n",
"for sample in test_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" try:\n",
" tmp = lines_sample[0].split()[2]\n",
" tmp = lines_sample[0].split()[1]\n",
" except IndexError:\n",
" continue\n",
"\n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
" for ch in t[0]:\n",
" if ch in emojis:\n",
" emoji_test[i][emoji_dict[ch]] += 1\n",
" i += 1\n",
"\n",
"X_train3 = np.hstack((X_train, emoji_train))\n",
"X_test3 = np.hstack((X_test, emoji_test))\n",
"\n",
"print('X_train3 shape:', X_train3.shape)\n",
"print('X_test3 shape:', X_test3.shape)"
],
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"text": [
"X_train3 shape: (15131, 300)\n",
"X_test3 shape: (1869, 300)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "vH7e1na5aZT4",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "7dfc1f1e-31aa-4644-8640-ada6a9a61b1d"
},
"source": [
"sum(np.sum(emoji_train, axis=1)>0)"
],
"execution_count": 46,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"2112"
]
},
"metadata": {
"tags": []
},
"execution_count": 46
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "K7X1G-3VOwnT",
"colab_type": "code",
"colab": {}
},
"source": [
"input_dim3 = X_train3.shape[1]\n",
"model3 = Sequential()\n",
"\n",
"model3.add(Dense(input_dim))\n",
"model3.add(Dropout(0.2))\n",
"model3.add(Activation('relu'))\n",
"model3.add(Dense(200))\n",
"model3.add(Dropout(0.2))\n",
"model3.add(Activation('tanh'))\n",
"model3.add(Dense(100))\n",
"model3.add(Dropout(0.2))\n",
"model3.add(Activation('sigmoid'))\n",
"model3.add(Dense(3))\n",
"model3.add(Activation('softmax'))\n",
"\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"\n",
"model3.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "sXYYCFNDPCRI",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 173
},
"outputId": "fe744d46-436c-4319-ae93-90adffb4572a"
},
"source": [
"model3.fit(X_train3, Y_train, batch_size = 512, epochs=4)"
],
"execution_count": 60,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/4\n",
"15131/15131 [==============================] - 2s 115us/step - loss: 0.7066 - acc: 0.6207\n",
"Epoch 2/4\n",
"15131/15131 [==============================] - 1s 38us/step - loss: 0.6396 - acc: 0.6603\n",
"Epoch 3/4\n",
"15131/15131 [==============================] - 1s 38us/step - loss: 0.6359 - acc: 0.6626\n",
"Epoch 4/4\n",
"15131/15131 [==============================] - 1s 38us/step - loss: 0.6343 - acc: 0.6631\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f19749be0>"
]
},
"metadata": {
"tags": []
},
"execution_count": 60
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ETjxCbb8PHMl",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "bc8196c9-d391-4208-ac33-1e8aaf325677"
},
"source": [
"preds3 = model3.predict_classes(X_test3, verbose=0)\n",
"np.sum(preds3==y_test)/len(y_test)"
],
"execution_count": 61,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.32691278758694486"
]
},
"metadata": {
"tags": []
},
"execution_count": 61
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "FGYJbkjcYtSn",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "ede4745b-1786-47fd-b332-b33b39553ca7"
},
"source": [
"sum(sum(emoji_train))"
],
"execution_count": 62,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"5102.0"
]
},
"metadata": {
"tags": []
},
"execution_count": 62
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "8lGOcNgnPNKM",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "b799dff6-1721-418d-b43a-017a0c0e269c"
},
"source": [
"X_train.shape"
],
"execution_count": 149,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(15131, 250)"
]
},
"metadata": {
"tags": []
},
"execution_count": 149
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "avVAaVEX3l1I",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.tree import DecisionTreeClassifier"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "oBhhwjWO3pvX",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 121
},
"outputId": "1f996a56-f2af-4c3f-8f73-ff032c845a49"
},
"source": [
"dtree1 = DecisionTreeClassifier()\n",
"dtree1.fit(X_train, Y_train)"
],
"execution_count": 65,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, presort=False,\n",
" random_state=None, splitter='best')"
]
},
"metadata": {
"tags": []
},
"execution_count": 65
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "l5rwDIWGKi7T",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "3a3a69e5-afea-456d-f21e-4d02da6c0a9f"
},
"source": [
"predsd1 = dtree1.predict(X_test)\n",
"predsd1 = np.argmax(predsd1, axis=1)\n",
"np.sum(predsd1==y_test)/len(y_test)"
],
"execution_count": 66,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.3911182450508293"
]
},
"metadata": {
"tags": []
},
"execution_count": 66
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "T2eJDbzBK9fq",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 121
},
"outputId": "3e561970-f213-4824-87a6-a23fab886df6"
},
"source": [
"dtree2 = DecisionTreeClassifier()\n",
"dtree2.fit(X_train2, Y_train)"
],
"execution_count": 67,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, presort=False,\n",
" random_state=None, splitter='best')"
]
},
"metadata": {
"tags": []
},
"execution_count": 67
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "2IPi6lqNK8Ct",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "c63065ce-d635-4d59-a87e-605dd94c86c8"
},
"source": [
"predsd2 = dtree2.predict(X_test2)\n",
"predsd2 = np.argmax(predsd2, axis=1)\n",
"np.sum(predsd2==y_test)/len(y_test)"
],
"execution_count": 68,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.38095238095238093"
]
},
"metadata": {
"tags": []
},
"execution_count": 68
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "CKjk91n8P0Ju",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 121
},
"outputId": "963755c1-6807-4845-9db5-b43eb9f1a1d1"
},
"source": [
"dtree3 = DecisionTreeClassifier()\n",
"dtree3.fit(X_train3, Y_train)"
],
"execution_count": 69,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, presort=False,\n",
" random_state=None, splitter='best')"
]
},
"metadata": {
"tags": []
},
"execution_count": 69
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Q7R4-ZYoP2_L",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "c7aa9058-d994-4adb-c4f2-1303c84effda"
},
"source": [
"predsd3 = dtree3.predict(X_test3)\n",
"predsd3 = np.argmax(predsd3, axis=1)\n",
"np.sum(predsd3==y_test)/len(y_test)"
],
"execution_count": 70,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.39058319957196364"
]
},
"metadata": {
"tags": []
},
"execution_count": 70
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "f8grLdKPQRGN",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "876bd815-1b44-443d-c8fb-00440ae0cc5d"
},
"source": [
"a = X_test3[(np.sum(emoji_test, axis=1)>0)]\n",
"p = y_test[(np.sum(emoji_test, axis=1)>0)]\n",
"predsd4 = dtree3.predict(a)\n",
"predsd4 = np.argmax(predsd4, axis=1)\n",
"np.sum(predsd4==p)/len(p)"
],
"execution_count": 78,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.4262295081967213"
]
},
"metadata": {
"tags": []
},
"execution_count": 78
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "4A1riiirhQ5p",
"colab_type": "code",
"colab": {}
},
"source": [
"from keras.layers.normalization import BatchNormalization\n",
"from keras.layers import SpatialDropout1D\n",
"from keras.models import Model\n",
"from keras.layers import Input,Flatten, Dense, Embedding, RNN, Conv1D, BatchNormalization, MaxPooling1D, Activation, Dropout, concatenate, Lambda\n",
"from keras import optimizers\n",
"from keras.layers.convolutional import Convolution1D\n",
"from keras import backend as K"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gqCJcTt9hEoD",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
},
"outputId": "8b529ddd-651f-4f8e-d8c6-f3e96cee1811"
},
"source": [
"nb_filter = 300\n",
"filter_length = 3\n",
"hidden_dims = 300 # 250\n",
"nb_epoch = 2\n",
"\n",
"\n",
"cmodel1 = Sequential()\n",
"cmodel1.add(Embedding(max_features, 300))\n",
"cmodel1.add(SpatialDropout1D(0.2))\n",
"# we add a Convolution1D, which will learn nb_filter\n",
"# word group filters of size filter_length:\n",
"cmodel1.add(Convolution1D(nb_filter=nb_filter,\n",
" filter_length=filter_length,\n",
" border_mode='valid',\n",
" activation='tanh',\n",
" subsample_length=1))\n",
"\n",
"#cmodel1.add(BatchNormalization())\n",
"from keras import optimizers\n",
"def max_1d(X):\n",
" return K.max(X, axis=1)\n",
"\n",
"cmodel1.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
"cmodel1.add(Dense(hidden_dims))\n",
"cmodel1.add(Dropout(0.2))\n",
"cmodel1.add(Activation('relu'))\n",
"cmodel1.add(Dense(num_classes))\n",
"cmodel1.add(Activation('sigmoid'))\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"cmodel1.compile(loss='binary_crossentropy',\n",
" optimizer=adam,\n",
" metrics=['accuracy'])"
],
"execution_count": 106,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
" app.launch_new_instance()\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "5SIgsZezhsJv",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"outputId": "d31955e9-b6dd-459f-a3ec-51f9fbd2f776"
},
"source": [
"cmodel1.fit(X_train3, Y_train, epochs = 1)"
],
"execution_count": 107,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/1\n",
"15131/15131 [==============================] - 237s 16ms/step - loss: 0.5342 - acc: 0.7211\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f16bf6748>"
]
},
"metadata": {
"tags": []
},
"execution_count": 107
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bPB2nA7wixq8",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "ae705a5a-4024-42d4-90b1-1460caaa1ea2"
},
"source": [
"predsc1 = cmodel1.predict_classes(X_test3, verbose=0)\n",
"np.sum(predsc1==y_test)/len(y_test)"
],
"execution_count": 109,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.565008025682183"
]
},
"metadata": {
"tags": []
},
"execution_count": 109
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cFj06lng5S12",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "f9b8a66d-bb98-47a2-dd14-e02571cd691b"
},
"source": [
"prf(y_test, predsc1, average='micro')"
],
"execution_count": 167,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.565008025682183, 0.565008025682183, 0.565008025682183, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 167
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "OV8yft9S5n2p",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "8463406e-a582-4ac1-d737-62a6a45dee8f"
},
"source": [
"prf(y_test, predsc1, average='macro')"
],
"execution_count": 168,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.5659138903963613, 0.5840501910447199, 0.5662953882918141, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 168
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LFglzGiw5rVT",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "d8262ee5-fb84-4959-a23d-3245894fa599"
},
"source": [
"prf(y_test, predsc1, average='weighted')"
],
"execution_count": 169,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.5675574871168725, 0.565008025682183, 0.5569762553083624, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 169
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iKXK20SYjOZj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 469
},
"outputId": "df05a7c1-d2ac-45a9-bec8-44a0b7673c43"
},
"source": [
"cmodel1.summary()"
],
"execution_count": 108,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential_20\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_12 (Embedding) (None, None, 300) 6000000 \n",
"_________________________________________________________________\n",
"spatial_dropout1d_11 (Spatia (None, None, 300) 0 \n",
"_________________________________________________________________\n",
"conv1d_9 (Conv1D) (None, None, 300) 270300 \n",
"_________________________________________________________________\n",
"lambda_9 (Lambda) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_46 (Dense) (None, 300) 90300 \n",
"_________________________________________________________________\n",
"dropout_32 (Dropout) (None, 300) 0 \n",
"_________________________________________________________________\n",
"activation_46 (Activation) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_47 (Dense) (None, 3) 903 \n",
"_________________________________________________________________\n",
"activation_47 (Activation) (None, 3) 0 \n",
"=================================================================\n",
"Total params: 6,361,503\n",
"Trainable params: 6,361,503\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "NWkJByaVs_ow",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
},
"outputId": "351f49b0-1849-47cc-ba7f-c09ab01aef60"
},
"source": [
"nb_filter = 300\n",
"filter_length = 3\n",
"hidden_dims = 300 # 250\n",
"nb_epoch = 2\n",
"\n",
"\n",
"cmodel1a = Sequential()\n",
"cmodel1a.add(Embedding(max_features, 300))\n",
"cmodel1a.add(SpatialDropout1D(0.2))\n",
"# we add a Convolution1D, which will learn nb_filter\n",
"# word group filters of size filter_length:\n",
"cmodel1a.add(Convolution1D(nb_filter=nb_filter,\n",
" filter_length=filter_length,\n",
" border_mode='valid',\n",
" activation='tanh',\n",
" subsample_length=1))\n",
"\n",
"cmodel1a.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
"cmodel1a.add(Dense(hidden_dims))\n",
"cmodel1a.add(Dropout(0.2))\n",
"cmodel1a.add(Activation('relu'))\n",
"cmodel1a.add(Dense(num_classes))\n",
"cmodel1a.add(Activation('sigmoid'))\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"cmodel1a.compile(loss='binary_crossentropy',\n",
" optimizer=adam,\n",
" metrics=['accuracy'])"
],
"execution_count": 145,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
" app.launch_new_instance()\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "OgoK_J2cjtic",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "7c41bb80-bfb6-4548-f280-7254855740a8"
},
"source": [
"cmodel1a.fit(X_train3, Y_train, epochs = 2, batch_size=256)"
],
"execution_count": 146,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/2\n",
"15131/15131 [==============================] - 173s 11ms/step - loss: 0.5879 - acc: 0.6856\n",
"Epoch 2/2\n",
"15131/15131 [==============================] - 168s 11ms/step - loss: 0.4583 - acc: 0.7792\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f1368c940>"
]
},
"metadata": {
"tags": []
},
"execution_count": 146
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SgnkbkcPtJHB",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "d4ae2a11-387e-4178-fe2a-a2df44faa7c5"
},
"source": [
"predsc1a = cmodel1a.predict_classes(X_test3, verbose=0)\n",
"np.sum(predsc1a==y_test)/len(y_test)"
],
"execution_count": 147,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.5644729802033173"
]
},
"metadata": {
"tags": []
},
"execution_count": 147
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "VcAy5qNE207S",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.metrics import precision_recall_fscore_support as prf"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "mvjiLm4z3CDa",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "2e407171-f2e3-436c-9f5f-c873a290962e"
},
"source": [
"prf(y_test, predsc1a, average='micro')"
],
"execution_count": 163,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.5644729802033173, 0.5644729802033173, 0.5644729802033173, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 163
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RRe5rJKX4IPQ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "a4045c61-c53d-41f5-a146-1f37c97a9743"
},
"source": [
"prf(y_test, predsc1a, average='macro')"
],
"execution_count": 164,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.5708475748562786, 0.563563693418115, 0.5666094135651139, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 164
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KUOgZQBe4NQK",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "e9451770-bfdb-45e0-bf94-8b7973afbf0f"
},
"source": [
"prf(y_test, predsc1a, average='weighted')"
],
"execution_count": 166,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.5663970904202026, 0.5644729802033173, 0.5648397339912556, None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 166
}
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"outputId": "b8acaefe-008c-4714-9919-28d2f304d49a",
"id": "anNM5TySjt77",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"source": [
"cmodel2 = Sequential()\n",
"cmodel2.add(Embedding(max_features, 500))\n",
"cmodel2.add(SpatialDropout1D(0.2))\n",
"# we add a Convolution1D, which will learn nb_filter\n",
"# word group filters of size filter_length:\n",
"cmodel2.add(Convolution1D(nb_filter=nb_filter,\n",
" filter_length=filter_length,\n",
" border_mode='valid',\n",
" activation='tanh',\n",
" subsample_length=1))\n",
"\n",
"cmodel2.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
"cmodel2.add(Dense(hidden_dims))\n",
"cmodel2.add(Dropout(0.2))\n",
"cmodel2.add(Activation('relu'))\n",
"cmodel2.add(Dense(num_classes))\n",
"cmodel2.add(Activation('sigmoid'))\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"cmodel2.compile(loss='binary_crossentropy',\n",
" optimizer=adam,\n",
" metrics=['accuracy'])"
],
"execution_count": 132,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
" # Remove the CWD from sys.path while we load stuff.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "L3s7q3rqj50A",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"outputId": "0ff1d71c-e282-4cc7-b846-bacfae25dbb1"
},
"source": [
"cmodel2.fit(X_train2, Y_train, epochs = 1)"
],
"execution_count": 133,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/1\n",
"15131/15131 [==============================] - 986s 65ms/step - loss: 0.5525 - acc: 0.7080\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f13d670b8>"
]
},
"metadata": {
"tags": []
},
"execution_count": 133
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oB7-3ZwTkGBE",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "5d6905c3-5f8b-4d1d-e381-632aaee0069e"
},
"source": [
"predsc2 = cmodel2.predict_classes(X_test3, verbose=0)\n",
"np.sum(predsc2==y_test)/len(y_test)"
],
"execution_count": 134,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.46441947565543074"
]
},
"metadata": {
"tags": []
},
"execution_count": 134
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ch95QsenkLAY",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 469
},
"outputId": "f235abbe-3114-4524-8194-cb9cdd49259b"
},
"source": [
"cmodel2.summary()"
],
"execution_count": 117,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential_22\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_14 (Embedding) (None, None, 500) 10000000 \n",
"_________________________________________________________________\n",
"spatial_dropout1d_13 (Spatia (None, None, 500) 0 \n",
"_________________________________________________________________\n",
"conv1d_11 (Conv1D) (None, None, 300) 450300 \n",
"_________________________________________________________________\n",
"lambda_11 (Lambda) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_50 (Dense) (None, 300) 90300 \n",
"_________________________________________________________________\n",
"dropout_34 (Dropout) (None, 300) 0 \n",
"_________________________________________________________________\n",
"activation_50 (Activation) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_51 (Dense) (None, 3) 903 \n",
"_________________________________________________________________\n",
"activation_51 (Activation) (None, 3) 0 \n",
"=================================================================\n",
"Total params: 10,541,503\n",
"Trainable params: 10,541,503\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cItt6ON9TKMF",
"colab_type": "code",
"colab": {}
},
"source": [
"!pip install emoji\n",
"import emoji"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "F8jVacTFkbu2",
"colab_type": "code",
"colab": {}
},
"source": [
"new_train_data = []\n",
"\n",
"for sample in train_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" try:\n",
" tmp = (lines_sample[0].split()[2])\n",
" tmp = (lines_sample[0].split()[1])\n",
" except IndexError:\n",
" continue\n",
" temp = []\n",
" \n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
"\n",
" if t[1] != 'O':\n",
" t[0]=re.sub('[\\W_]+', '', t[0])\n",
" new = ''\n",
" for ch in t[0]:\n",
" if ch in emojis:\n",
" new += ' ' + emoji.demojize(ch) + ' '\n",
" if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
" continue\n",
" if 'http' in t[0]:\n",
" continue\n",
" temp.append(t[0])\n",
" if temp == []:\n",
" continue\n",
" new_train_data.append(temp)\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fZOiuRo1k2M_",
"colab_type": "code",
"colab": {}
},
"source": [
"new_test_data = []\n",
"# emoji_test = []\n",
"for sample in test_text.split('\\n\\n'):\n",
" \n",
" lines_sample = sample.split('\\n')\n",
" try:\n",
" tmp = (lines_sample[0].split()[2])\n",
" tmp = (lines_sample[0].split()[1])\n",
" except IndexError:\n",
" continue\n",
" temp = []\n",
" \n",
" for line in lines_sample[1:]:\n",
" t = line.split('\\t')\n",
"\n",
" if t[1] != 'O':\n",
" t[0]=re.sub('[\\W_]+', '', t[0])\n",
" new = ''\n",
" for ch in t[0]:\n",
" if ch in emojis:\n",
" new += ' ' + emoji.demojize(ch) + ' '\n",
" if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n",
" continue\n",
" if 'http' in t[0]:\n",
" continue\n",
" temp.append(t[0])\n",
" if temp == []:\n",
" continue\n",
" new_test_data.append(temp)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gAVZFFL5kSyE",
"colab_type": "code",
"colab": {}
},
"source": [
"new_train_tweets = [' '.join(i) for i in new_train_data]\n",
"new_test_tweets = [' '.join(i) for i in new_test_data]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "siNMAS_0mUfV",
"colab_type": "code",
"colab": {}
},
"source": [
"max_features = 20000\n",
"tokenizer2 = Tokenizer(num_words=max_features)\n",
"tokenizer2.fit_on_texts(new_train_tweets)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "m5U5FfAKkTh_",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "e4529ffe-5fe1-4879-ea03-5896cb3ed70c"
},
"source": [
"max_len = 250\n",
"num_classes = 3\n",
"\n",
"new_sequences_train = tokenizer2.texts_to_sequences(new_train_tweets)\n",
"new_sequences_test = tokenizer2.texts_to_sequences(new_test_tweets)\n",
"\n",
"X_train4 = sequence.pad_sequences(new_sequences_train, maxlen=max_len)\n",
"X_test4 = sequence.pad_sequences(new_sequences_test, maxlen=max_len)\n",
"\n",
"# Y_train = np_utils.to_categorical(y_train, num_classes)\n",
"# Y_test = np_utils.to_categorical(y_test, num_classes)\n",
"\n",
"print('X_train4 shape:', X_train4.shape)\n",
"print('X_test4 shape:', X_test4.shape)"
],
"execution_count": 126,
"outputs": [
{
"output_type": "stream",
"text": [
"X_train4 shape: (15131, 250)\n",
"X_test4 shape: (1869, 250)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fNAfKJRYmyaR",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"outputId": "2d4f0ab4-1e04-431e-e3c2-20e0f26ca7cc",
"id": "mZODp4rXm0G3",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"source": [
"cmodel3 = Sequential()\n",
"cmodel3.add(Embedding(max_features, 300))\n",
"cmodel3.add(SpatialDropout1D(0.2))\n",
"# we add a Convolution1D, which will learn nb_filter\n",
"# word group filters of size filter_length:\n",
"cmodel3.add(Convolution1D(nb_filter=nb_filter,\n",
" filter_length=filter_length,\n",
" border_mode='valid',\n",
" activation='tanh',\n",
" subsample_length=1))\n",
"\n",
"cmodel3.add(Lambda(max_1d, output_shape=(nb_filter,)))\n",
"cmodel3.add(Dense(hidden_dims))\n",
"cmodel3.add(Dropout(0.2))\n",
"cmodel3.add(Activation('relu'))\n",
"cmodel3.add(Dense(num_classes))\n",
"cmodel3.add(Activation('sigmoid'))\n",
"adam = optimizers.Adam(lr=0.001, decay=1e-6)\n",
"cmodel3.compile(loss='binary_crossentropy',\n",
" optimizer=adam,\n",
" metrics=['accuracy'])"
],
"execution_count": 127,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n",
" # Remove the CWD from sys.path while we load stuff.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "69J1sfpOm9ug",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"outputId": "ad67b870-dbc5-4ad4-d162-509a99a116f5"
},
"source": [
"cmodel3.fit(X_train4, Y_train, epochs = 1)"
],
"execution_count": 129,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/1\n",
"15131/15131 [==============================] - 216s 14ms/step - loss: 0.5303 - acc: 0.7264\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f0f143d9898>"
]
},
"metadata": {
"tags": []
},
"execution_count": 129
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "HWM5IozSnrQb",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "5c630355-d5e8-42b8-826a-01d5621d4cbb"
},
"source": [
"predsc3 = cmodel3.predict_classes(X_test4, verbose=0)\n",
"np.sum(predsc3==y_test)/len(y_test)"
],
"execution_count": 130,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.5521669341894061"
]
},
"metadata": {
"tags": []
},
"execution_count": 130
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "OSVeBC5uoD_d",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 469
},
"outputId": "ae0b085c-9980-42ba-f598-8307bd3210a3"
},
"source": [
"cmodel3.summary()"
],
"execution_count": 131,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential_23\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_15 (Embedding) (None, None, 300) 6000000 \n",
"_________________________________________________________________\n",
"spatial_dropout1d_14 (Spatia (None, None, 300) 0 \n",
"_________________________________________________________________\n",
"conv1d_12 (Conv1D) (None, None, 300) 270300 \n",
"_________________________________________________________________\n",
"lambda_12 (Lambda) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_52 (Dense) (None, 300) 90300 \n",
"_________________________________________________________________\n",
"dropout_35 (Dropout) (None, 300) 0 \n",
"_________________________________________________________________\n",
"activation_52 (Activation) (None, 300) 0 \n",
"_________________________________________________________________\n",
"dense_53 (Dense) (None, 3) 903 \n",
"_________________________________________________________________\n",
"activation_53 (Activation) (None, 3) 0 \n",
"=================================================================\n",
"Total params: 6,361,503\n",
"Trainable params: 6,361,503\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "5-B6qEc6BJJJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 191
},
"outputId": "6e4a8b2b-0fbc-429c-a907-2caa8eac5d8e"
},
"source": [
"print(train_text[:100])"
],
"execution_count": 126,
"outputs": [
{
"output_type": "stream",
"text": [
"meta\t3\tnegative\n",
"@\tO\n",
"AdilNisarButt\tHin\n",
"pakistan\tHin\n",
"ka\tHin\n",
"ghra\tHin\n",
"tauq\tHin\n",
"he\tEng\n",
"Pakistan\tEng\n",
"Isra\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XHN9MSnd5wcC",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 121
},
"outputId": "282da6de-3570-453a-9c8b-31166287f94e"
},
"source": [
"a = np.array([[1,1,1,1], [2,2,2,2]])\n",
"b = np.array([[3,3],[4,4]])\n",
"print(a)\n",
"print(b)\n",
"print(np.hstack((a,b)))"
],
"execution_count": 57,
"outputs": [
{
"output_type": "stream",
"text": [
"[[1 1 1 1]\n",
" [2 2 2 2]]\n",
"[[3 3]\n",
" [4 4]]\n",
"[[1 1 1 1 3 3]\n",
" [2 2 2 2 4 4]]\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment