Created
August 21, 2019 04:14
-
-
Save suyash/b77334af071113606b26db4aaefd5154 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
}, | |
"colab": { | |
"name": "spm2.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [] | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2hEa2g4yX0Y5", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 680 | |
}, | |
"outputId": "a84e4aaa-5392-4b29-a2d2-c882562d44dd" | |
}, | |
"source": [ | |
"# Pin every package to the versions this notebook was recorded with so a fresh run resolves the same set.\n", | |
"!pip install tensorflow==2.0.0b1 sentencepiece==0.1.83 tf_sentencepiece==0.1.83" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting tensorflow==2.0.0b1\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/29/6c/2c9a5c4d095c63c2fb37d20def0e4f92685f7aee9243d6aae25862694fd1/tensorflow-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (87.9MB)\n", | |
"\u001b[K |████████████████████████████████| 87.9MB 346kB/s \n", | |
"\u001b[?25hCollecting sentencepiece\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n", | |
"\u001b[K |████████████████████████████████| 1.0MB 35.0MB/s \n", | |
"\u001b[?25hCollecting tf_sentencepiece\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/2c/20800032089a9271757921f3adc1f2c7ec2d294ec9fa07b3115fab9d27c2/tf_sentencepiece-0.1.83-py2.py3-none-manylinux1_x86_64.whl (2.7MB)\n", | |
"\u001b[K |████████████████████████████████| 2.7MB 36.8MB/s \n", | |
"\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.2.2)\n", | |
"Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.11.2)\n", | |
"Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603 (from tensorflow==2.0.0b1)\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422a4f9d749e67c0a8d4f4f0b33a4e5f5f35e27/tb_nightly-1.14.0a20190603-py3-none-any.whl (3.1MB)\n", | |
"\u001b[K |████████████████████████████████| 3.1MB 38.1MB/s \n", | |
"\u001b[?25hRequirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.8.0)\n", | |
"Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (3.7.1)\n", | |
"Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.16.4)\n", | |
"Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n", | |
"Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n", | |
"Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501 (from tensorflow==2.0.0b1)\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)\n", | |
"\u001b[K |████████████████████████████████| 501kB 31.9MB/s \n", | |
"\u001b[?25hRequirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.33.4)\n", | |
"Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.12.0)\n", | |
"Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.15.0)\n", | |
"Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.7.1)\n", | |
"Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.1.7)\n", | |
"Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.0.8)\n", | |
"Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (3.1.1)\n", | |
"Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (0.15.5)\n", | |
"Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (41.0.1)\n", | |
"Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tensorflow==2.0.0b1) (2.8.0)\n", | |
"Installing collected packages: tb-nightly, tf-estimator-nightly, tensorflow, sentencepiece, tf-sentencepiece\n", | |
" Found existing installation: tensorflow 1.14.0\n", | |
" Uninstalling tensorflow-1.14.0:\n", | |
" Successfully uninstalled tensorflow-1.14.0\n", | |
"Successfully installed sentencepiece-0.1.83 tb-nightly-1.14.0a20190603 tensorflow-2.0.0b1 tf-estimator-nightly-1.14.0.dev2019060501 tf-sentencepiece-0.1.83\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mndedWQYX0ZE", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import sentencepiece as spm\n", | |
"import tensorflow as tf\n", | |
"import tf_sentencepiece as tfs" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "X8UBiY3CX0ZL", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 208 | |
}, | |
"outputId": "a8796557-7450-49a0-fbb5-902a031d8c90" | |
}, | |
"source": [ | |
"!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2019-08-21 04:12:26-- https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt\n", | |
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", | |
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 278779 (272K) [text/plain]\n", | |
"Saving to: ‘botchan.txt’\n", | |
"\n", | |
"\rbotchan.txt 0%[ ] 0 --.-KB/s \rbotchan.txt 100%[===================>] 272.25K --.-KB/s in 0.03s \n", | |
"\n", | |
"2019-08-21 04:12:27 (9.21 MB/s) - ‘botchan.txt’ saved [278779/278779]\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gmBlKDIQX0ZS", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "a0732ecd-7f5d-4439-b3ca-946d740487be" | |
}, | |
"source": [ | |
"spm.SentencePieceTrainer.train('--model_prefix=m --input=botchan.txt --vocab_size=1200')" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "AQ3Z5PgNX0ZW", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Get piece size" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "y40NpIjuX0ZY", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "20011571-aa58-4220-c5bd-db9e7f39a4ea" | |
}, | |
"source": [ | |
"size = tfs.piece_size(model_file='m.model')\n", | |
"size" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=0, shape=(), dtype=int32, numpy=1200>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "v35YSLUxX0Zd", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### id_to_piece and piece_to_id (scalar)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ubx0qO1nX0Ze", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "2d3004d9-2b1c-4604-a5af-171a1fddeb11" | |
}, | |
"source": [ | |
"input_ids = tf.constant(100, dtype=tf.int32)\n", | |
"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
"pieces" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'll'>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MSf54BtcX0Zk", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "d46a62e9-7311-4a25-ff83-37f5f16f0e33" | |
}, | |
"source": [ | |
"tfs.piece_to_id(pieces, model_file='m.model')" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=5, shape=(), dtype=int32, numpy=100>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "C8lw1u6BX0Zr", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### id_to_piece and piece_to_id (1D)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IgB4-Kx8X0Zt", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 69 | |
}, | |
"outputId": "b8ce5df7-de3f-458c-e9f9-24679dcdafa5" | |
}, | |
"source": [ | |
"input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n", | |
"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
"pieces" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=8, shape=(6,), dtype=string, numpy=\n", | |
"array([b'<unk>', b'<s>', b'</s>', b',', b'.', b'\\xe2\\x96\\x81the'],\n", | |
" dtype=object)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "TWHri41YX0Z0", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "b54b7643-34c3-4218-f14b-7e635f428ac2" | |
}, | |
"source": [ | |
"ids = tfs.piece_to_id(pieces, model_file='m.model')\n", | |
"ids" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=10, shape=(6,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5], dtype=int32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZiexeLJFX0Z5", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### id_to_piece and piece_to_id (2D)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "KjvvkQ8UX0Z8", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 86 | |
}, | |
"outputId": "7cfdd4ab-6bad-4df8-8d09-73cb9cfa6a30" | |
}, | |
"source": [ | |
"input_ids = tf.constant([[0,1,2,3,4],[5,6,7,8,9]], dtype=tf.int32)\n", | |
"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
"pieces" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=13, shape=(2, 5), dtype=string, numpy=\n", | |
"array([[b'<unk>', b'<s>', b'</s>', b',', b'.'],\n", | |
" [b'\\xe2\\x96\\x81the', b's', b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81',\n", | |
" b'\\xe2\\x96\\x81to']], dtype=object)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "F9twViRVX0aA", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 69 | |
}, | |
"outputId": "488e3a11-cb55-42d3-e812-0ee917d4c401" | |
}, | |
"source": [ | |
"ids = tfs.piece_to_id(pieces, model_file='m.model')\n", | |
"ids" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=15, shape=(2, 5), dtype=int32, numpy=\n", | |
"array([[0, 1, 2, 3, 4],\n", | |
" [5, 6, 7, 8, 9]], dtype=int32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "7HPssPr_X0aF", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Loading the model from a serialized proto (`model_proto`) instead of a file path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GuphTn1sX0aH", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "744a139d-ad32-4087-c38f-707de7c9dd3f" | |
}, | |
"source": [ | |
"# Read the serialized model bytes; the with-block closes the file handle once the read is done.\n", | |
"with tf.io.gfile.GFile('m.model', 'rb') as model_fp:\n", | |
"    proto = model_fp.read()\n", | |
"tfs.piece_size(model_proto=proto)" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=17, shape=(), dtype=int32, numpy=1200>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "S96BAnfMX0aL", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### is_unknown and is_control" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vIJ2q9GsX0aO", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
}, | |
"outputId": "bcb7a5d5-712f-46e8-ebeb-315d25f68d12" | |
}, | |
"source": [ | |
"input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n", | |
"is_unknown = tfs.is_unknown(input_ids, model_file='m.model')\n", | |
"is_control = tfs.is_control(input_ids, model_file='m.model')\n", | |
"is_unknown, is_control" | |
], | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(<tf.Tensor: id=20, shape=(6,), dtype=bool, numpy=array([ True, False, False, False, False, False])>,\n", | |
" <tf.Tensor: id=21, shape=(6,), dtype=bool, numpy=array([False, True, True, False, False, False])>)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "N-HeR_hYX0aS", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### encode, encode_sparse, decode" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jgQtpZRfX0aU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"input_text = ['hello world.', 'I have a dog.', 'I have an apple.', 'this is a problem that we have to solve', 'Suyash is a good boy']" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "f7c9-05ZX0aX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Serialized model bytes reused by the encode/decode cells below; the with-block closes the file handle.\n", | |
"with tf.io.gfile.GFile('m.model', 'rb') as model_fp:\n", | |
"    model_proto = model_fp.read()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "uopMpoumX0ab", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 156 | |
}, | |
"outputId": "345ab5cd-4c0b-41b4-b8d9-aa80944474b7" | |
}, | |
"source": [ | |
"ids, seq_len = tfs.encode(input_text, model_proto=model_proto)\n", | |
"ids, seq_len" | |
], | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(<tf.Tensor: id=27, shape=(5, 13), dtype=int32, numpy=\n", | |
" array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n", | |
" [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n", | |
" [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n", | |
" dtype=int32)>,\n", | |
" <tf.Tensor: id=28, shape=(5,), dtype=int32, numpy=array([ 5, 6, 8, 13, 9], dtype=int32)>)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 16 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zwMD1qpUX0af", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 139 | |
}, | |
"outputId": "122fbe20-24ad-4785-e0f8-e50c9f0b0bea" | |
}, | |
"source": [ | |
"sparse_ids = tfs.encode_sparse(input_text, model_proto=model_proto)\n", | |
"tf.sparse.to_dense(sparse_ids)" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=38, shape=(5, 13), dtype=int32, numpy=\n", | |
"array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n", | |
" [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n", | |
" [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n", | |
" dtype=int32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wEC5G5y5X0ai", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 86 | |
}, | |
"outputId": "f86ff06c-0139-43bc-ae27-1ae76409768b" | |
}, | |
"source": [ | |
"tfs.decode(ids, seq_len, model_proto=model_proto)" | |
], | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=40, shape=(5,), dtype=string, numpy=\n", | |
"array([b'hello world.', b'I have a dog.', b'I have an apple.',\n", | |
" b'this is a problem that we have to solve',\n", | |
" b'Suyash is a good boy'], dtype=object)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 18 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2rUKDjKbYHH8", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 312 | |
}, | |
"outputId": "681aae8d-ae38-4d1c-a163-33e836b24d4c" | |
}, | |
"source": [ | |
"tfs.id_to_piece(ids, model_proto=model_proto)" | |
], | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: id=42, shape=(5, 13), dtype=string, numpy=\n", | |
"array([[b'\\xe2\\x96\\x81he', b'll', b'o', b'\\xe2\\x96\\x81world', b'.',\n", | |
" b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>',\n", | |
" b'<unk>', b'<unk>'],\n", | |
" [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81a',\n", | |
" b'\\xe2\\x96\\x81do', b'g', b'.', b'<unk>', b'<unk>', b'<unk>',\n", | |
" b'<unk>', b'<unk>', b'<unk>', b'<unk>'],\n", | |
" [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81an',\n", | |
" b'\\xe2\\x96\\x81a', b'p', b'p', b'le', b'.', b'<unk>', b'<unk>',\n", | |
" b'<unk>', b'<unk>', b'<unk>'],\n", | |
" [b'\\xe2\\x96\\x81this', b'\\xe2\\x96\\x81is', b'\\xe2\\x96\\x81a',\n", | |
" b'\\xe2\\x96\\x81pro', b'ble', b'm', b'\\xe2\\x96\\x81that',\n", | |
" b'\\xe2\\x96\\x81we', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81to',\n", | |
" b'\\xe2\\x96\\x81so', b'l', b've'],\n", | |
" [b'\\xe2\\x96\\x81S', b'u', b'y', b'ash', b'\\xe2\\x96\\x81is',\n", | |
" b'\\xe2\\x96\\x81a', b'\\xe2\\x96\\x81good', b'\\xe2\\x96\\x81bo', b'y',\n", | |
" b'<unk>', b'<unk>', b'<unk>', b'<unk>']], dtype=object)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 19 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment