Created
December 13, 2019 03:42
-
-
Save csarron/ed68a43e1716db855fbde02b53ecc1bb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true, | |
| "pycharm": { | |
| "is_executing": false | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' \n", | |
| "# assume you installed tensorflow and tensorlow_hub\n", | |
| "# wget https://raw.githubusercontent.com/google-research/ALBERT/master/tokenization.py\n", | |
| "# pip install sentencepiece" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": [ | |
| "['tokenization_info', 'mlm', 'tokens']\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "spm_model_file b'/var/folders/v6/vnz79w0d2dn95fj0mtnqs27m0000gn/T/tfhub_modules/098d91f064a4f53dffc7633d00c3d8e87f3a4716/assets/30k-clean.model'\n", | |
| "INFO:tensorflow:loading sentence piece model\n" | |
| ], | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stderr", | |
| "text": [ | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "INFO:tensorflow:loading sentence piece model\n" | |
| ], | |
| "output_type": "stream" | |
| } | |
| ], | |
| "source": [ | |
| "import tensorflow_hub as hub\n", | |
| "import tensorflow as tf\n", | |
| "\n", | |
| "import tokenization\n", | |
| "\n", | |
| "albert_module = hub.Module(\"https://tfhub.dev/google/albert_base/2\",\n", | |
| " trainable=True)\n", | |
| "\n", | |
| "print(albert_module.get_signature_names())\n", | |
| "\n", | |
| "tokenization_info = albert_module(signature=\"tokenization_info\",\n", | |
| " as_dict=True)\n", | |
| "with tf.Session() as sess:\n", | |
| " spm_model_file, lower_case = sess.run([tokenization_info[\"vocab_file\"],\n", | |
| " tokenization_info[\"do_lower_case\"]])\n", | |
| "print('spm_model_file', spm_model_file)\n", | |
| "tokenizer = tokenization.FullTokenizer(vocab_file=None,\n", | |
| " do_lower_case=lower_case,\n", | |
| " spm_model_file=spm_model_file)" | |
| ], | |
| "metadata": { | |
| "collapsed": false, | |
| "pycharm": { | |
| "name": "#%%\n", | |
| "is_executing": false | |
| } | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": [ | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "input_tokens [['▁', 'N', 'ew', '▁', 'D', 'el', 'hi', '▁is', '▁the', '▁capital', '▁of', '▁', 'I', 'nd', 'ia'], ['▁', 'T', 'he', '▁capital', '▁of', '▁', 'I', 'nd', 'ia', '▁is', '▁', 'D', 'el', 'hi']]\n", | |
| "input_ids [[13, 1, 4460, 13, 1, 532, 1822, 25, 14, 1057, 16, 13, 1, 706, 549], [13, 1, 438, 1057, 16, 13, 1, 706, 549, 25, 13, 1, 532, 1822]]\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "pooled_output [[ 0.9138424 -0.91918063 -0.9750561 ... -0.09777299 -0.6580293\n", | |
| " 0.85734063]\n", | |
| " [ 0.9554146 -0.969018 -0.98720574 ... -0.33370218 -0.9834848\n", | |
| " 0.9518022 ]]\n", | |
| "sequence_output [[[-0.79346025 -0.9684168 0.68525565 ... -1.2617347 0.73567647\n", | |
| " 0.4075023 ]\n", | |
| " [-1.4199482 -1.4559035 0.29453015 ... -1.1323366 0.622609\n", | |
| " -0.10111377]\n", | |
| " [-1.2416221 -1.3402516 0.5318976 ... -1.100372 0.61021024\n", | |
| " -0.0320334 ]\n", | |
| " ...\n", | |
| " [-1.4569476 -1.4031975 0.5440552 ... -1.0294002 0.57723916\n", | |
| " -0.19555172]\n", | |
| " [-1.4182447 -1.2609751 0.8265942 ... -0.83261204 0.5850436\n", | |
| " -0.36173826]\n", | |
| " [-1.3521252 -1.089019 1.0707805 ... -0.7020445 0.7099029\n", | |
| " -0.35090202]]\n", | |
| "\n", | |
| " [[ 0.50215864 -0.8857083 0.21554916 ... 0.10951228 -0.1213384\n", | |
| " 0.29680854]\n", | |
| " [-1.0673571 -2.8334231 0.41561085 ... 1.5424173 -0.46648303\n", | |
| " 0.21906391]\n", | |
| " [-0.9393208 -2.6763246 0.82966477 ... 1.5915289 -0.55339867\n", | |
| " 0.21424139]\n", | |
| " ...\n", | |
| " [-0.90017676 -2.3508196 0.7401754 ... 1.0992681 -0.6233507\n", | |
| " 0.17976789]\n", | |
| " [-0.92405754 -2.305296 0.9214774 ... 1.1633668 -0.49488556\n", | |
| " 0.21457243]\n", | |
| " [-0.9365634 -2.3063788 1.0833699 ... 1.3754547 -0.5631695\n", | |
| " 0.19878839]]]\n" | |
| ], | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stderr", | |
| "text": [ | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" | |
| ], | |
| "output_type": "stream" | |
| } | |
| ], | |
| "source": [ | |
sample_input_texts = ['New Delhi is the capital of India',
                      'The capital of India is Delhi']
input_tokens = [tokenizer.tokenize(t) for t in sample_input_texts]
input_ids = [tokenizer.convert_tokens_to_ids(t) for t in input_tokens]
print('input_tokens', input_tokens)
print('input_ids', input_ids)  # len(input_ids[0])==15, len(input_ids[1]) == 14

# Pad every sequence to the length of the longest one (15 here) instead of
# hardcoding 15.  input_mask is 1 for real tokens and 0 for padding — the
# model simply ignores padded positions (this is attention masking, not an
# MLM "[MASK]" target).  segment_ids are all 0: each input is one sentence.
max_len = max(len(ids) for ids in input_ids)
input_mask = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in input_ids]
for ids in input_ids:
    ids.extend([0] * (max_len - len(ids)))  # pad with id 0 up to max_len
segment_ids = [[0] * max_len for _ in input_ids]

with tf.Session() as sess:
    albert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    # The "tokens" signature takes pre-tokenized ids and returns the
    # pooled ([CLS]-derived) and per-token outputs.
    albert_outputs = albert_module(
        inputs=albert_inputs,
        signature="tokens",
        as_dict=True)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    pooled_output, sequence_output = sess.run(
        [albert_outputs["pooled_output"], albert_outputs["sequence_output"]])
print('pooled_output', pooled_output)
print('sequence_output', sequence_output)
| ], | |
| "metadata": { | |
| "collapsed": false, | |
| "pycharm": { | |
| "name": "#%%\n", | |
| "is_executing": false | |
| } | |
| } | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.6" | |
| }, | |
| "pycharm": { | |
| "stem_cell": { | |
| "cell_type": "raw", | |
| "source": [], | |
| "metadata": { | |
| "collapsed": false | |
| } | |
| } | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment