Created
December 13, 2019 03:42
-
-
Save csarron/ed68a43e1716db855fbde02b53ecc1bb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true, | |
| "pycharm": { | |
| "is_executing": false | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' \n", | |
| "# assume you installed tensorflow and tensorlow_hub\n", | |
| "# wget https://raw.githubusercontent.com/google-research/ALBERT/master/tokenization.py\n", | |
| "# pip install sentencepiece" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": [ | |
| "['tokenization_info', 'mlm', 'tokens']\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "spm_model_file b'/var/folders/v6/vnz79w0d2dn95fj0mtnqs27m0000gn/T/tfhub_modules/098d91f064a4f53dffc7633d00c3d8e87f3a4716/assets/30k-clean.model'\n", | |
| "INFO:tensorflow:loading sentence piece model\n" | |
| ], | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stderr", | |
| "text": [ | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "INFO:tensorflow:loading sentence piece model\n" | |
| ], | |
| "output_type": "stream" | |
| } | |
| ], | |
| "source": [ | |
| "import tensorflow_hub as hub\n", | |
| "import tensorflow as tf\n", | |
| "\n", | |
| "import tokenization\n", | |
| "\n", | |
| "albert_module = hub.Module(\"https://tfhub.dev/google/albert_base/2\",\n", | |
| " trainable=True)\n", | |
| "\n", | |
| "print(albert_module.get_signature_names())\n", | |
| "\n", | |
| "tokenization_info = albert_module(signature=\"tokenization_info\",\n", | |
| " as_dict=True)\n", | |
| "with tf.Session() as sess:\n", | |
| " spm_model_file, lower_case = sess.run([tokenization_info[\"vocab_file\"],\n", | |
| " tokenization_info[\"do_lower_case\"]])\n", | |
| "print('spm_model_file', spm_model_file)\n", | |
| "tokenizer = tokenization.FullTokenizer(vocab_file=None,\n", | |
| " do_lower_case=lower_case,\n", | |
| " spm_model_file=spm_model_file)" | |
| ], | |
| "metadata": { | |
| "collapsed": false, | |
| "pycharm": { | |
| "name": "#%%\n", | |
| "is_executing": false | |
| } | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": [ | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "input_tokens [['▁', 'N', 'ew', '▁', 'D', 'el', 'hi', '▁is', '▁the', '▁capital', '▁of', '▁', 'I', 'nd', 'ia'], ['▁', 'T', 'he', '▁capital', '▁of', '▁', 'I', 'nd', 'ia', '▁is', '▁', 'D', 'el', 'hi']]\n", | |
| "input_ids [[13, 1, 4460, 13, 1, 532, 1822, 25, 14, 1057, 16, 13, 1, 706, 549], [13, 1, 438, 1057, 16, 13, 1, 706, 549, 25, 13, 1, 532, 1822]]\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", | |
| "pooled_output [[ 0.9138424 -0.91918063 -0.9750561 ... -0.09777299 -0.6580293\n", | |
| " 0.85734063]\n", | |
| " [ 0.9554146 -0.969018 -0.98720574 ... -0.33370218 -0.9834848\n", | |
| " 0.9518022 ]]\n", | |
| "sequence_output [[[-0.79346025 -0.9684168 0.68525565 ... -1.2617347 0.73567647\n", | |
| " 0.4075023 ]\n", | |
| " [-1.4199482 -1.4559035 0.29453015 ... -1.1323366 0.622609\n", | |
| " -0.10111377]\n", | |
| " [-1.2416221 -1.3402516 0.5318976 ... -1.100372 0.61021024\n", | |
| " -0.0320334 ]\n", | |
| " ...\n", | |
| " [-1.4569476 -1.4031975 0.5440552 ... -1.0294002 0.57723916\n", | |
| " -0.19555172]\n", | |
| " [-1.4182447 -1.2609751 0.8265942 ... -0.83261204 0.5850436\n", | |
| " -0.36173826]\n", | |
| " [-1.3521252 -1.089019 1.0707805 ... -0.7020445 0.7099029\n", | |
| " -0.35090202]]\n", | |
| "\n", | |
| " [[ 0.50215864 -0.8857083 0.21554916 ... 0.10951228 -0.1213384\n", | |
| " 0.29680854]\n", | |
| " [-1.0673571 -2.8334231 0.41561085 ... 1.5424173 -0.46648303\n", | |
| " 0.21906391]\n", | |
| " [-0.9393208 -2.6763246 0.82966477 ... 1.5915289 -0.55339867\n", | |
| " 0.21424139]\n", | |
| " ...\n", | |
| " [-0.90017676 -2.3508196 0.7401754 ... 1.0992681 -0.6233507\n", | |
| " 0.17976789]\n", | |
| " [-0.92405754 -2.305296 0.9214774 ... 1.1633668 -0.49488556\n", | |
| " 0.21457243]\n", | |
| " [-0.9365634 -2.3063788 1.0833699 ... 1.3754547 -0.5631695\n", | |
| " 0.19878839]]]\n" | |
| ], | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stderr", | |
| "text": [ | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:using sentence piece tokenzier.\n", | |
| "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" | |
| ], | |
| "output_type": "stream" | |
| } | |
| ], | |
| "source": [ | |
sample_input_texts = ['New Delhi is the capital of India',
                      'The capital of India is Delhi']
input_tokens = [tokenizer.tokenize(t) for t in sample_input_texts]
input_ids = [tokenizer.convert_tokens_to_ids(t) for t in input_tokens]
print('input_tokens', input_tokens)
print('input_ids', input_ids)  # len(input_ids[0])==15, len(input_ids[1]) == 14

# Pad every sequence to the length of the longest one (15 here) instead of
# hardcoding 15.  input_mask is 1 for real tokens and 0 for padding — the
# model simply ignores padded positions (this is attention masking, not an
# MLM "[MASK]" target).  segment_ids are all 0: each input is one sentence.
max_len = max(len(ids) for ids in input_ids)
input_mask = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in input_ids]
for ids in input_ids:
    ids.extend([0] * (max_len - len(ids)))  # pad with id 0 up to max_len
segment_ids = [[0] * max_len for _ in input_ids]

with tf.Session() as sess:
    albert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    # The "tokens" signature takes pre-tokenized ids and returns the
    # pooled ([CLS]-derived) and per-token outputs.
    albert_outputs = albert_module(
        inputs=albert_inputs,
        signature="tokens",
        as_dict=True)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    pooled_output, sequence_output = sess.run(
        [albert_outputs["pooled_output"], albert_outputs["sequence_output"]])
print('pooled_output', pooled_output)
print('sequence_output', sequence_output)
| ], | |
| "metadata": { | |
| "collapsed": false, | |
| "pycharm": { | |
| "name": "#%%\n", | |
| "is_executing": false | |
| } | |
| } | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.6" | |
| }, | |
| "pycharm": { | |
| "stem_cell": { | |
| "cell_type": "raw", | |
| "source": [], | |
| "metadata": { | |
| "collapsed": false | |
| } | |
| } | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment