{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import os\n",
"os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' \n",
"# assume you installed tensorflow and tensorlow_hub\n",
"# wget https://raw.githubusercontent.com/google-research/ALBERT/master/tokenization.py\n",
"# pip install sentencepiece"
]
},
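{
"cell_type": "markdown",
"metadata": {},
"source": [
"This gist uses the TF1-style `hub.Module` API, so it assumes TensorFlow 1.x. A minimal environment check (a sketch, not part of the original gist):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# environment check: hub.Module below requires the TF1 API\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"print('tensorflow', tf.__version__)\n",
"print('tensorflow_hub', hub.__version__)"
]
},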
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"text": [
"['tokenization_info', 'mlm', 'tokens']\n",
"INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n",
"spm_model_file b'/var/folders/v6/vnz79w0d2dn95fj0mtnqs27m0000gn/T/tfhub_modules/098d91f064a4f53dffc7633d00c3d8e87f3a4716/assets/30k-clean.model'\n",
"INFO:tensorflow:loading sentence piece model\n"
],
"output_type": "stream"
},
{
"name": "stderr",
"text": [
"INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n",
"INFO:tensorflow:loading sentence piece model\n"
],
"output_type": "stream"
}
],
"source": [
"import tensorflow_hub as hub\n",
"import tensorflow as tf\n",
"\n",
"import tokenization\n",
"\n",
"albert_module = hub.Module(\"https://tfhub.dev/google/albert_base/2\",\n",
" trainable=True)\n",
"\n",
"print(albert_module.get_signature_names())\n",
"\n",
"tokenization_info = albert_module(signature=\"tokenization_info\",\n",
" as_dict=True)\n",
"with tf.Session() as sess:\n",
" spm_model_file, lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
" tokenization_info[\"do_lower_case\"]])\n",
"print('spm_model_file', spm_model_file)\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=None,\n",
" do_lower_case=lower_case,\n",
" spm_model_file=spm_model_file)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
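{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick round-trip sanity check on the SentencePiece tokenizer (a sketch; it assumes `convert_ids_to_tokens` from the downloaded `tokenization.py`, which should invert `convert_tokens_to_ids`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# round-trip check on the tokenizer built above\n",
"pieces = tokenizer.tokenize('The capital of India is Delhi')\n",
"ids = tokenizer.convert_tokens_to_ids(pieces)\n",
"print('pieces', pieces)\n",
"print('ids   ', ids)\n",
"# convert_ids_to_tokens should map the ids back to the same pieces\n",
"print('back  ', tokenizer.convert_ids_to_tokens(ids))"
]
},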
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"text": [
"INFO:tensorflow:using sentence piece tokenzier.\n",
"INFO:tensorflow:using sentence piece tokenzier.\n",
"input_tokens [['▁', 'N', 'ew', '▁', 'D', 'el', 'hi', '▁is', '▁the', '▁capital', '▁of', '▁', 'I', 'nd', 'ia'], ['▁', 'T', 'he', '▁capital', '▁of', '▁', 'I', 'nd', 'ia', '▁is', '▁', 'D', 'el', 'hi']]\n",
"input_ids [[13, 1, 4460, 13, 1, 532, 1822, 25, 14, 1057, 16, 13, 1, 706, 549], [13, 1, 438, 1057, 16, 13, 1, 706, 549, 25, 13, 1, 532, 1822]]\n",
"INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n",
"pooled_output [[ 0.9138424 -0.91918063 -0.9750561 ... -0.09777299 -0.6580293\n",
" 0.85734063]\n",
" [ 0.9554146 -0.969018 -0.98720574 ... -0.33370218 -0.9834848\n",
" 0.9518022 ]]\n",
"sequence_output [[[-0.79346025 -0.9684168 0.68525565 ... -1.2617347 0.73567647\n",
" 0.4075023 ]\n",
" [-1.4199482 -1.4559035 0.29453015 ... -1.1323366 0.622609\n",
" -0.10111377]\n",
" [-1.2416221 -1.3402516 0.5318976 ... -1.100372 0.61021024\n",
" -0.0320334 ]\n",
" ...\n",
" [-1.4569476 -1.4031975 0.5440552 ... -1.0294002 0.57723916\n",
" -0.19555172]\n",
" [-1.4182447 -1.2609751 0.8265942 ... -0.83261204 0.5850436\n",
" -0.36173826]\n",
" [-1.3521252 -1.089019 1.0707805 ... -0.7020445 0.7099029\n",
" -0.35090202]]\n",
"\n",
" [[ 0.50215864 -0.8857083 0.21554916 ... 0.10951228 -0.1213384\n",
" 0.29680854]\n",
" [-1.0673571 -2.8334231 0.41561085 ... 1.5424173 -0.46648303\n",
" 0.21906391]\n",
" [-0.9393208 -2.6763246 0.82966477 ... 1.5915289 -0.55339867\n",
" 0.21424139]\n",
" ...\n",
" [-0.90017676 -2.3508196 0.7401754 ... 1.0992681 -0.6233507\n",
" 0.17976789]\n",
" [-0.92405754 -2.305296 0.9214774 ... 1.1633668 -0.49488556\n",
" 0.21457243]\n",
" [-0.9365634 -2.3063788 1.0833699 ... 1.3754547 -0.5631695\n",
" 0.19878839]]]\n"
],
"output_type": "stream"
},
{
"name": "stderr",
"text": [
"INFO:tensorflow:using sentence piece tokenzier.\n",
"INFO:tensorflow:using sentence piece tokenzier.\n",
"INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
],
"output_type": "stream"
}
],
"source": [
"sample_input_texts = ['New Delhi is the capital of India',\n",
" 'The capital of India is Delhi']\n",
"input_tokens = [tokenizer.tokenize(t) for t in sample_input_texts]\n",
"input_ids = [tokenizer.convert_tokens_to_ids(t) for t in input_tokens]\n",
"print('input_tokens', input_tokens)\n",
"print('input_ids', input_ids) # len(input_ids[0])==15, len(input_ids[1]) == 14\n",
"#\n",
"input_ids[1].append(0) # pad second sentence tokens to 15\n",
"segment_ids = [[0] * 15, [0] * 15]\n",
"\n",
"# only last token of the second sentence is masked\n",
"input_mask = [[1] * 15, [1] * 14 + [0]]\n",
"with tf.Session() as sess:\n",
" albert_inputs = dict(\n",
" input_ids=input_ids,\n",
" input_mask=input_mask,\n",
" segment_ids=segment_ids)\n",
" albert_outputs = albert_module(\n",
" inputs=albert_inputs,\n",
" signature=\"tokens\",\n",
" as_dict=True)\n",
" sess.run(tf.global_variables_initializer())\n",
" sess.run(tf.tables_initializer())\n",
" pooled_output, sequence_output = sess.run(\n",
" [albert_outputs[\"pooled_output\"], albert_outputs[\"sequence_output\"]])\n",
"print('pooled_output', pooled_output)\n",
"print('sequence_output', sequence_output)\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
}
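,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the two sentences are near-paraphrases, their pooled vectors should be close. A minimal check (a sketch, assuming numpy is installed; reuses `pooled_output` and `sequence_output` from the cell above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"print('pooled_output shape:', pooled_output.shape)  # (2, 768) for albert_base\n",
"print('sequence_output shape:', sequence_output.shape)  # (2, 15, 768)\n",
"\n",
"def cosine_similarity(a, b):\n",
"    # cosine similarity between two 1-D vectors\n",
"    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\n",
"\n",
"print('cosine similarity of pooled outputs:',\n",
"      cosine_similarity(pooled_output[0], pooled_output[1]))"
]
}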
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}