Created
August 30, 2020 05:59
-
-
Save sayan1999/d008ef965c72371602c399284b7ab189 to your computer and use it in GitHub Desktop.
word_seq2seq-extended.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"colab": { | |
"name": "word_seq2seq-extended.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"include_colab_link": true | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sayan1999/d008ef965c72371602c399284b7ab189/word_seq2seq-extended.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zMy7gQFI9qXy", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 900 | |
}, | |
"outputId": "a69f4c6c-39b4-4dc4-b481-ffd690d9b25a" | |
}, | |
"source": [ | |
"!pip install bidict\n", | |
"!pip install pixiedust\n", | |
"import nltk\n", | |
"nltk.download('punkt')" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting bidict\n", | |
" Downloading https://files.pythonhosted.org/packages/7a/7a/1fcfc397e61b22091267aa767266d8ab200a00b7dbf3aadead7fd41a74b9/bidict-0.21.0-py2.py3-none-any.whl\n", | |
"Installing collected packages: bidict\n", | |
"Successfully installed bidict-0.21.0\n", | |
"Collecting pixiedust\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/16/ba/7488f06b48238205562f9d63aaae2303c060c5dfd63b1ddd3bd9d4656eb1/pixiedust-1.1.18.tar.gz (197kB)\n", | |
"\u001b[K |████████████████████████████████| 204kB 8.7MB/s \n", | |
"\u001b[?25hCollecting mpld3\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/66/31/89bd2afd21b920e3612996623e7b3aac14d741537aa77600ea5102a34be0/mpld3-0.5.1.tar.gz (1.0MB)\n", | |
"\u001b[K |████████████████████████████████| 1.0MB 16.7MB/s \n", | |
"\u001b[?25hRequirement already satisfied: lxml in /usr/local/lib/python3.6/dist-packages (from pixiedust) (4.2.6)\n", | |
"Collecting geojson\n", | |
" Downloading https://files.pythonhosted.org/packages/e4/8d/9e28e9af95739e6d2d2f8d4bef0b3432da40b7c3588fbad4298c1be09e48/geojson-2.5.0-py2.py3-none-any.whl\n", | |
"Requirement already satisfied: astunparse in /usr/local/lib/python3.6/dist-packages (from pixiedust) (1.6.3)\n", | |
"Requirement already satisfied: markdown in /usr/local/lib/python3.6/dist-packages (from pixiedust) (3.2.2)\n", | |
"Collecting colour\n", | |
" Downloading https://files.pythonhosted.org/packages/74/46/e81907704ab203206769dee1385dc77e1407576ff8f50a0681d0a6b541be/colour-0.1.5-py2.py3-none-any.whl\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pixiedust) (2.23.0)\n", | |
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from mpld3->pixiedust) (2.11.2)\n", | |
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from mpld3->pixiedust) (3.2.2)\n", | |
"Requirement already satisfied: six<2.0,>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from astunparse->pixiedust) (1.15.0)\n", | |
"Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from astunparse->pixiedust) (0.35.1)\n", | |
"Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown->pixiedust) (1.7.0)\n", | |
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pixiedust) (1.24.3)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pixiedust) (2.10)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pixiedust) (2020.6.20)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pixiedust) (3.0.4)\n", | |
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->mpld3->pixiedust) (1.1.1)\n", | |
"Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib->mpld3->pixiedust) (1.18.5)\n", | |
"Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->mpld3->pixiedust) (2.8.1)\n", | |
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->mpld3->pixiedust) (0.10.0)\n", | |
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->mpld3->pixiedust) (1.2.0)\n", | |
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->mpld3->pixiedust) (2.4.7)\n", | |
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown->pixiedust) (3.1.0)\n", | |
"Building wheels for collected packages: pixiedust, mpld3\n", | |
" Building wheel for pixiedust (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for pixiedust: filename=pixiedust-1.1.18-cp36-none-any.whl size=321727 sha256=aca85894b80a6fe25fb05217aa1095c664d98c2cb4fdd4eb17715d867b26db89\n", | |
" Stored in directory: /root/.cache/pip/wheels/e8/b1/86/c2f2e16e6bf9bfe556f9dbf8adb9f41816c476d73078c7d0eb\n", | |
" Building wheel for mpld3 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for mpld3: filename=mpld3-0.5.1-cp36-none-any.whl size=364064 sha256=6ca44fc5e92e2085a5b69fe24c1291795c168a4c20cd1f7e5e4d6066281036b0\n", | |
" Stored in directory: /root/.cache/pip/wheels/38/68/06/d119af6c3f9a2d1e123c1f72d276576b457131b3a7bf94e402\n", | |
"Successfully built pixiedust mpld3\n", | |
"Installing collected packages: mpld3, geojson, colour, pixiedust\n", | |
"Successfully installed colour-0.1.5 geojson-2.5.0 mpld3-0.5.1 pixiedust-1.1.18\n", | |
"[nltk_data] Downloading package punkt to /root/nltk_data...\n", | |
"[nltk_data] Unzipping tokenizers/punkt.zip.\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 1 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aLt_9sNL9qX9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import random\n", | |
"import re, os, difflib\n", | |
"from matplotlib import pyplot as plt\n", | |
"from sklearn.utils import shuffle\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Bidirectional, Concatenate\n", | |
"from tensorflow.keras.models import Model, Sequential\n", | |
"import tensorflow as tf\n", | |
"from nltk import word_tokenize\n", | |
"from gensim.models import Word2Vec\n", | |
"from sklearn.preprocessing import OneHotEncoder\n", | |
"from bidict import bidict\n", | |
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n", | |
"from sklearn.metrics import classification_report\n", | |
"from tensorflow.keras.callbacks import TensorBoard\n", | |
"from tensorflow.keras.utils import plot_model\n", | |
"from tensorflow.keras import backend as K" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "osRUcUVb9qYA", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 92 | |
}, | |
"outputId": "a6171656-ac7d-4778-8cd5-9cb57693895e" | |
}, | |
"source": [ | |
"# parameters\n", | |
"# if running on colab turn this false, and select GPU runtime\n", | |
"batch_size=32 if not tf.test.is_gpu_available() else 256\n", | |
"colab=True\n", | |
"training=True\n", | |
"validation=True\n", | |
"ctx_vec_len=128\n", | |
"embedding_dim=128\n", | |
"epochs=25\n", | |
"# either length or list of index such as range(1, 2200)\n", | |
"training_samples=100\n", | |
"dropout=0.2\n", | |
"weight_file='word-seq2seq.hdf5'" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:From <ipython-input-3-3badf1bfea45>:3: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Use `tf.config.list_physical_devices('GPU')` instead.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wf7r24jQ9qYH", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 225 | |
}, | |
"outputId": "71b10c13-8b88-4a4b-e0c2-fc1267cf6437" | |
}, | |
"source": [ | |
"from IPython.display import display, Markdown\n", | |
"if not colab:\n", | |
" display(Markdown('''## Architecture For Neural Machine Trans\n", | |
"![Architecture Neural Machine Trans](image/NeuralMachineTrans.jpg)'''))\n", | |
" \n", | |
"else:\n", | |
" display(Markdown('''## Architecture For Neural Machine Trans\n", | |
"![Architecture Neural Machine Trans]()'''))" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/markdown": "## Architecture For Neural Machine Trans\n![Architecture Neural Machine Trans]()", | |
"text/plain": [ | |
"<IPython.core.display.Markdown object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cJ9Cfl5D9qYN", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if not colab:\n", | |
" # if on local machine \n", | |
" root_dir='.'\n", | |
" \n", | |
"else:\n", | |
" # if using google colab use this code\n", | |
" from google.colab import drive\n", | |
" drive.mount('/content/drive')\n", | |
" root_dir = \"/content/drive/My Drive/Colab Notebooks\"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Ct0UBp8y9qYS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"data_path = os.path.join(root_dir, \"fra.csv\")\n", | |
"doc = pd.read_csv(data_path, nrows=training_samples)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tvVhQmsq9qYY", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# replace contracted forms for english words\n", | |
"contracted_dict={\"won't\" : \"will not\", \"can\\'t\" : \"can not\", \"n\\'t\" : \" not\", \"\\'re\" : \" are\", \"\\'s\" : \" is\", \"\\'d\" : \" would\", \"\\'ll\" : \" will\", \"\\'t\" : \" not\", \"\\'ve\" : \" have\", \"\\'m\" : \" am\"}\n", | |
"\n", | |
"def replace_contracted(text):\n", | |
"\n", | |
" regex = re.compile(\"|\".join(map(re.escape, contracted_dict.keys( ))))\n", | |
" return regex.sub(lambda match: contracted_dict[match.group(0)], text)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WPWsAj-u9qYd", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# apply decontraction and lowercase\n", | |
"doc=doc.apply(np.vectorize(lambda sent : replace_contracted(str(sent).strip().lower())))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "54f6A85j9qYg", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# tokenize sentences and add start_ and _end keyword to target sentences\n", | |
"source_sents=doc.Source.apply(lambda x: x + ' _END').apply(lambda sent: word_tokenize(sent))\n", | |
"target_sents=doc.Target.apply(lambda x : 'START_ '+ x + ' _END').apply(lambda sent: word_tokenize(sent))\n", | |
"temp = list(zip(source_sents, target_sents)) \n", | |
"random.shuffle(temp) \n", | |
"source_sents, target_sents = zip(*temp)\n", | |
"source_sents, target_sents = pd.Series(source_sents), pd.Series(target_sents)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"scrolled": true, | |
"id": "c0PwKMAA9qYj", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"del(doc)\n", | |
"# building the vocabulary\n", | |
"source_vocab=set().union(*source_sents)\n", | |
"target_vocab=set().union(*target_sents)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "KSuPWE2C9qYo", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# max sentence length for each language in the dataset\n", | |
"max_source_len=max(source_sents.apply(len))\n", | |
"max_target_len=max(target_sents.apply(len))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Yhpt_X7Y9qYr", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# numeric identity for each word in vocab\n", | |
"source_wordint_rel=bidict(enumerate(source_vocab, 1))\n", | |
"temp={0:'paddingZero'}\n", | |
"temp.update(dict(enumerate(target_vocab, 1)))\n", | |
"target_wordint_rel=bidict(temp)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "h0_0s05X9qYv", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# prepare inputs and outputs\n", | |
"encoder_source_arr=[list(map(lambda word : source_wordint_rel.inv[word], sent)) for sent in source_sents]\n", | |
"decoder_source_arr=[list(map(lambda word : target_wordint_rel.inv[word], sent)) for sent in target_sents]\n", | |
"decoder_output_arr=[list(map(lambda word : target_wordint_rel.inv[word], sent[1:])) for sent in target_sents]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"scrolled": true, | |
"id": "WtaBqNMZ9qYy", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# pad the inputs and outputs to max length\n", | |
"padded_encoder_source_arr=pad_sequences(encoder_source_arr, maxlen=max_source_len, padding='post')\n", | |
"padded_decoder_source_arr=pad_sequences(decoder_source_arr, maxlen=max_target_len, padding='post')\n", | |
"padded_decoder_output_arr=pad_sequences(decoder_output_arr, maxlen=max_target_len, padding='post')\n", | |
"onehotted_decoder_output_arr=tf.one_hot(padded_decoder_output_arr, len(target_vocab)+1).numpy()\n", | |
"\n", | |
"del encoder_source_arr, decoder_source_arr, decoder_output_arr, padded_decoder_output_arr" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "T9AF9kdc9qY1", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Model Preparation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "YH3L903Z9qY2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# context-vector length\n", | |
"latent_dim=ctx_vec_len\n", | |
"\n", | |
"# this is the source languge consumtion layer\n", | |
"encoder_inputs = Input(shape=(None,), name='encoder_sources')\n", | |
"# embed the 2-d source into 3-d\n", | |
"enc_emb = Embedding(len(source_vocab)+1, embedding_dim, mask_zero = True, name='enc_emb')(encoder_inputs)\n", | |
"\n", | |
"# LSTM layer to encode the source sentence into context-vector representation\n", | |
"encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True, return_sequences=True, name='encoder_lstm1', dropout=dropout), name='encoder_bi-lstm1', merge_mode=\"concat\")\n", | |
"\n", | |
"encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_emb)\n", | |
"encoder_states = [forward_h, forward_c, backward_h, backward_c]\n", | |
"\n", | |
"encoder_lstm1 = Bidirectional(LSTM(latent_dim, return_state=True, name='encoder_lstm2', dropout=dropout), name='encoder_bi-lstm2', merge_mode=\"concat\")\n", | |
"encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm1(encoder_outputs, initial_state=encoder_states)\n", | |
"\n", | |
"state_h = Concatenate()([forward_h, backward_h])\n", | |
"state_c = Concatenate()([forward_c, backward_c])\n", | |
"# encoded-states tensor stores the context-vector\n", | |
"encoder_states = [state_h, state_c]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WOssNVT79qY6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# this is the target languge consumtion layer\n", | |
"decoder_inputs = Input(shape=(None,), name='decoder_sources')\n", | |
"# embed the 2-d source into 3-d\n", | |
"dec_emb_layer = Embedding(len(target_vocab)+1, embedding_dim, mask_zero = True, name='dec_emb_layer')\n", | |
"dec_emb = dec_emb_layer(decoder_inputs)\n", | |
"\n", | |
"# decoder LSTM, this takes in the context-vector and starting or so-far decoded part of the target sentence\n", | |
"decoder_lstm1 = LSTM(latent_dim, return_sequences=True, name='decoder_lstm1', dropout=dropout)\n", | |
"decoder_outputs11 = decoder_lstm1(dec_emb)\n", | |
"decoder_lstm2 = LSTM(latent_dim, return_sequences=True, name='decoder_lstm2', dropout=dropout)\n", | |
"decoder_outputs12 = decoder_lstm2(decoder_outputs11)\n", | |
"decoder_lstm3 = LSTM(latent_dim*2, return_sequences=True, return_state=True, name='decoder_lstm', dropout=dropout)\n", | |
"decoder_outputs13, _, _ = decoder_lstm3(decoder_outputs12, initial_state=encoder_states)\n", | |
"\n", | |
"# final layer that gives a probabilty distribution of the next possible words\n", | |
"decoder_dense = Dense(len(target_vocab)+1, activation='softmax', name='decoder_dense')\n", | |
"decoder_outputs14 = decoder_dense(decoder_outputs13)\n", | |
"\n", | |
"# Encode the source sequence to get the \"Context vectors\"\n", | |
"encoder_model = Model(encoder_inputs, encoder_states, name='Model_Encoder')\n", | |
"encoder_model.summary()\n", | |
"plot_model(encoder_model, show_shapes=True, show_layer_names=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rYR6yV_KZgvG", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Custom Loss Function to get rid of padding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VNE30xzaZZX7", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"vocab_len=len(onehotted_decoder_output_arr[0][0])\n", | |
"\n", | |
"def PaddedCategoricalCrossentropy(eps=1e-12):\n", | |
" def loss(y_true, y_pred):\n", | |
" mask_value = np.zeros((vocab_len))\n", | |
" mask_value[0] = 1\n", | |
" # find out which timesteps in `y_true` are not the padding character \n", | |
" mask = K.equal(y_true, mask_value)\n", | |
" mask = 1 - K.cast(mask, K.floatx())\n", | |
" mask = K.sum(mask,axis=2)/2\n", | |
" # multplying the loss by the mask. the loss for padding will be zero\n", | |
" loss = tf.keras.layers.multiply([K.categorical_crossentropy(y_true, y_pred), mask])\n", | |
" return K.sum(loss) / K.sum(mask)\n", | |
" return loss" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tiYu7bVY9qY9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# model building and summary\n", | |
"model = Model([encoder_inputs, decoder_inputs], decoder_outputs14, name='Model_Translation')\n", | |
"model.compile(optimizer='Adam', loss=PaddedCategoricalCrossentropy(), metrics=['acc'])\n", | |
"model.summary()\n", | |
"plot_model(model, show_shapes=True, show_layer_names=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "wjYkSeMjVz04", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Training" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "g_dxY6fL9qZB", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# TensorBoard Callback \n", | |
"tbCallBack = TensorBoard(log_dir=os.path.join(root_dir, 'Graph'), histogram_freq=0, write_graph=True, write_images=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"scrolled": true, | |
"id": "6SsaUZXF9qZF", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if training:\n", | |
" # train the model\n", | |
" history=model.fit([padded_encoder_source_arr, padded_decoder_source_arr], onehotted_decoder_output_arr, epochs=epochs, validation_split=0.02, callbacks=[tbCallBack], batch_size=batch_size)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "a6-S3Z9QH3iX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if training:\n", | |
" model.save_weights(os.path.join(root_dir, weight_file))\n", | |
" with plt.style.context('dark_background'):\n", | |
" plt.plot(history.history['acc'])\n", | |
" plt.plot(history.history['val_acc'])\n", | |
" plt.title('model accuracy')\n", | |
" plt.ylabel('accuracy')\n", | |
" plt.xlabel('epoch')\n", | |
" plt.legend(['train', 'val'], loc='upper left')\n", | |
" plt.show()\n", | |
" plt.plot(history.history['loss'])\n", | |
" plt.plot(history.history['val_loss'])\n", | |
" plt.title('model loss')\n", | |
" plt.ylabel('loss')\n", | |
" plt.xlabel('epoch')\n", | |
" plt.legend(['train', 'val'], loc='upper left')\n", | |
" plt.show()\n", | |
" print(f'Accuracy while saving is {model.evaluate([padded_encoder_source_arr, padded_decoder_source_arr], onehotted_decoder_output_arr)}')" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "CZpJP5Fk9qZI", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Decoder Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Ksm-Vbao9qZJ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Decoder setup\n", | |
"# Below tensors will hold the states of the previous time step\n", | |
"state_h = Input(shape=(latent_dim*2,))\n", | |
"state_c = Input(shape=(latent_dim*2,))\n", | |
"\n", | |
"\n", | |
"decoder_state_input = [state_h, state_c]\n", | |
"# Get the embeddings of the decoder sequence\n", | |
"dec_emb2= dec_emb_layer(decoder_inputs)\n", | |
"# To predict the next word in the sequence, set the initial states to the states from the previous time step\n", | |
"decoder_outputs21 = decoder_lstm1(dec_emb2)\n", | |
"decoder_outputs22 = decoder_lstm2(decoder_outputs21)\n", | |
"decoder_outputs23, state_h2, state_c2 = decoder_lstm3(decoder_outputs22, initial_state=decoder_state_input)\n", | |
"decoder_states2 = [state_h2, state_c2]\n", | |
"# A dense softmax layer to generate prob dist. over the target vocabulary\n", | |
"decoder_outputs24 = decoder_dense(decoder_outputs23)\n", | |
"# Final decoder model\n", | |
"decoder_model = Model(\n", | |
" [decoder_inputs] + decoder_state_input,\n", | |
" [decoder_outputs24] + decoder_states2, name='Model_Decoder')\n", | |
"decoder_model.summary()\n", | |
"plot_model(decoder_model, show_shapes=True, show_layer_names=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5yRFPYsS9qZM", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Decoding Logic" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_hAps5WG9qZN", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def decode_sequence(source_seq):\n", | |
" \n", | |
" # Encode the source as state vectors.\n", | |
" states_value = encoder_model.predict(source_seq)\n", | |
" # Generate empty target sequence of length 1.\n", | |
" target_seq = np.zeros((1,1))\n", | |
" # Populate the first character of \n", | |
" #target sequence with the start character.\n", | |
" target_seq[0, 0] = target_wordint_rel.inv['START_']\n", | |
" # Sampling loop for a batch of sequences\n", | |
" # (to simplify, here we assume a batch of size 1).\n", | |
" stop_condition = False\n", | |
" decoded_sentence = []\n", | |
" while not stop_condition:\n", | |
" output_tokens, h, c = decoder_model.predict([target_seq] + states_value)\n", | |
" # Sample a token\n", | |
" sampled_token_index = np.argmax(output_tokens[0, -1, :])\n", | |
" sampled_word =target_wordint_rel[sampled_token_index]\n", | |
" decoded_sentence += [sampled_word]\n", | |
" # Exit condition: either hit max length\n", | |
" # or find stop character.\n", | |
" if (sampled_word == '_END' or\n", | |
" len(decoded_sentence) > 50):\n", | |
" stop_condition = True\n", | |
" # Update the target sequence (of length 1).\n", | |
" target_seq = np.zeros((1,1))\n", | |
" target_seq[0, 0] = sampled_token_index\n", | |
" # Update states\n", | |
" states_value = [h, c]\n", | |
"\n", | |
" return decoded_sentence" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EZFK5mg_9qZQ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Prediction" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"scrolled": false, | |
"id": "mWwKlX5a9qZQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"start=1000\n", | |
"offset=100\n", | |
"def calc_strdiff(true, pred):\n", | |
" # return sum([1 for char in list(difflib.ndiff(true, pred)) if '+ ' in char or '- ' in char])/(len(true))\n", | |
" return nltk.translate.bleu_score.sentence_bleu([word_tokenize(true)], word_tokenize(pred))\n", | |
" \n", | |
"if validation:\n", | |
" \n", | |
" model.load_weights(os.path.join(root_dir, weight_file))\n", | |
" print(f'Accuracy after loading is {model.evaluate([padded_encoder_source_arr, padded_decoder_source_arr], onehotted_decoder_output_arr)}')\n", | |
" y_truePred = [(' '.join(source_sents[seq_index][:-1]), ' '.join(target_sents[seq_index][1:-1]), ' '.join(decode_sequence(padded_encoder_source_arr[seq_index:seq_index+1])[:-1])) for seq_index, _ in enumerate(padded_encoder_source_arr[start:start+offset], start)]\n", | |
" bleu_score=[calc_strdiff(true, pred) for _, true, pred, in y_truePred]\n", | |
" print(f'Bleu Scores are {bleu_score}')\n", | |
" print(f'Avg bleu score for {len(y_truePred)} tests was {sum(bleu_score)/len(y_truePred)}.')\n", | |
" print(f\"{pd.DataFrame(y_truePred, columns=['Source', 'Expected', 'Predicted'])}\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Nsddt0BCzEJz", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"pd.DataFrame(y_truePred, columns=['Source', 'Expected', 'Predicted']).to_excel(os.path.join(root_dir, 'review.xlsx'))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment