Created
April 24, 2020 06:31
-
-
Save ceshine/0549086d8c59efb1d706f6e369b8e136 to your computer and use it in GitHub Desktop.
Tensorflow Profiler with Custom Training Loop
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/data/miniconda3/envs/tf/lib/python3.7/site-packages/tf_helper_bot/bot.py:9: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", | |
" from tqdm.autonotebook import tqdm\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"import logging\n", | |
"\n", | |
"import numpy as np\n", | |
"import tensorflow as tf\n", | |
"from tensorflow.keras.mixed_precision import experimental as amp\n", | |
"from tf_helper_bot import BaseBot, MovingAverageStatsTrackerCallback\n", | |
"#from tf_helper_bot.schedulers import CosineDecayWithWarmup\n", | |
"from tf_helper_bot.mixup import mixup_loss_fn\n", | |
"from tf_helper_bot.optimizers import RAdam\n", | |
"from tensorflow.python.profiler import profiler_v2 as profiler\n", | |
"\n", | |
"from cliff.model import get_model\n", | |
"from cliff.dataset import tfrecord_dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Runtime environment: TensorFlow log verbosity and GPU thread mode.\n", | |
"# NOTE(review): TF_GPU_THREAD_MODE presumably has to be set before TensorFlow\n", | |
"# initializes its GPU devices -- confirm nothing imported above touches the GPU.\n", | |
"logging.getLogger(\"tensorflow\").setLevel(logging.INFO)\n", | |
"os.environ[\"TF_GPU_THREAD_MODE\"] = \"gpu_private\"\n", | |
"\n", | |
"# Experiment settings read by the cells below.\n", | |
"ARCH = \"b3\"\n", | |
"BATCH_SIZE = 24\n", | |
"GRAD_ACCU = 1\n", | |
"MIXED_PRECISION = True\n", | |
"TRAIN_FOLDER = \"../data/tfrecords/train/\"\n", | |
"# VALID_FOLDER = \"../data/tfrecords/valid/\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Compute dtype: float16\n", | |
"Variable dtype: float32\n" | |
] | |
} | |
], | |
"source": [ | |
"# Install the global Keras mixed-precision policy (float16 compute, float32\n", | |
"# variables, as echoed below) when MIXED_PRECISION is enabled.\n", | |
"if MIXED_PRECISION:\n", | |
"    policy = amp.Policy('mixed_float16')\n", | |
"    amp.set_policy(policy)\n", | |
"    print(f'Compute dtype: {policy.compute_dtype}')\n", | |
"    print(f'Variable dtype: {policy.variable_dtype}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Six-class model for the configured architecture.\n", | |
"model = get_model(ARCH, n_classes=6)\n", | |
"\n", | |
"# Input pipeline: every file under TRAIN_FOLDER, the current tf.distribute\n", | |
"# strategy, one weight per class, mixup and cutmix switched off.\n", | |
"class_weights = np.array([1., 1., 3., 2., 3., 2.])\n", | |
"strategy = tf.distribute.get_strategy()\n", | |
"train_files = tf.io.gfile.glob(TRAIN_FOLDER + \"*\")\n", | |
"train_dataset, train_steps = tfrecord_dataset(\n", | |
"    train_files,\n", | |
"    BATCH_SIZE,\n", | |
"    is_train=True,\n", | |
"    strategy=strategy,\n", | |
"    return_sample_weights=False,\n", | |
"    class_weights=class_weights,\n", | |
"    mixup_alpha=-1,  # disabled\n", | |
"    cutmix_alpha=-1,  # disabled\n", | |
")\n", | |
"\n", | |
"# RAdam, wrapped with dynamic loss scaling when mixed precision is on.\n", | |
"base_optimizer = RAdam(learning_rate=2e-4, epsilon=1e-6)\n", | |
"optimizer = (\n", | |
"    amp.LossScaleOptimizer(base_optimizer, loss_scale='dynamic')\n", | |
"    if MIXED_PRECISION\n", | |
"    else base_optimizer\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Training-stats logger; the recorded runs below show one log line every 2 steps.\n", | |
"# NOTE(review): avg_window=3 is presumably the moving-average length -- confirm.\n", | |
"stats_tracker = MovingAverageStatsTrackerCallback(avg_window=3, log_interval=2)\n", | |
"\n", | |
"# Training-only bot: no validation dataset and no extra metrics.\n", | |
"bot = BaseBot(\n", | |
"    model=model,\n", | |
"    criterion=mixup_loss_fn,\n", | |
"    optimizer=optimizer,\n", | |
"    train_dataset=train_dataset,\n", | |
"    valid_dataset=None,\n", | |
"    steps_per_epoch=train_steps,\n", | |
"    gradient_accumulation_steps=GRAD_ACCU,\n", | |
"    callbacks=(stats_tracker,),\n", | |
"    metrics=(),\n", | |
"    mixed_precision=MIXED_PRECISION,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO][04/24/2020 14:23:12] Step 2 | loss 1.9551 | lr 2.00e-04 | 24.209s per step\n", | |
"[INFO][04/24/2020 14:23:13] Step 4 | loss 1.8648 | lr 2.00e-04 | 0.340s per step\n", | |
"[INFO][04/24/2020 14:23:14] Step 6 | loss 1.8709 | lr 2.00e-04 | 0.358s per step\n", | |
"[INFO][04/24/2020 14:23:14] Step 8 | loss 1.8832 | lr 2.00e-04 | 0.363s per step\n", | |
"[INFO][04/24/2020 14:23:15] Step 10 | loss 1.8606 | lr 2.00e-04 | 0.342s per step\n" | |
] | |
} | |
], | |
"source": [ | |
"# Warm-up: train a few un-profiled steps so the graph gets compiled here. The\n", | |
"# first logged interval is far slower than the rest, and that one-off cost\n", | |
"# should stay out of the profiles captured in the following cells.\n", | |
"WARMUP_STEPS = 10\n", | |
"bot.train(checkpoint_interval=1000, n_steps=WARMUP_STEPS)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO][04/24/2020 14:23:19] Step 12 | loss 1.7825 | lr 2.00e-04 | 2.058s per step\n", | |
"[INFO][04/24/2020 14:23:20] Step 14 | loss 1.7328 | lr 2.00e-04 | 0.459s per step\n", | |
"[INFO][04/24/2020 14:23:21] Step 16 | loss 1.8026 | lr 2.00e-04 | 0.434s per step\n", | |
"[INFO][04/24/2020 14:23:22] Step 18 | loss 1.8207 | lr 2.00e-04 | 0.451s per step\n", | |
"[INFO][04/24/2020 14:23:23] Step 20 | loss 1.8043 | lr 2.00e-04 | 0.412s per step\n" | |
] | |
} | |
], | |
"source": [ | |
"# Profile ten training steps with the public profiler API.\n", | |
"# The try/finally guarantees the profiling session is closed even when training\n", | |
"# raises or is interrupted; a session left open would make the next start()\n", | |
"# call in this kernel fail.\n", | |
"tf.profiler.experimental.start('../cache/tblogdir')\n", | |
"try:\n", | |
"    bot.train(checkpoint_interval=1000, n_steps=10)\n", | |
"finally:\n", | |
"    tf.profiler.experimental.stop()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO][04/24/2020 14:23:40] Step 22 | loss 1.8380 | lr 2.00e-04 | 2.191s per step\n", | |
"[INFO][04/24/2020 14:23:40] Step 24 | loss 1.8116 | lr 2.00e-04 | 0.408s per step\n", | |
"[INFO][04/24/2020 14:23:41] Step 26 | loss 1.7550 | lr 2.00e-04 | 0.431s per step\n", | |
"[INFO][04/24/2020 14:23:42] Step 28 | loss 1.7031 | lr 2.00e-04 | 0.437s per step\n", | |
"[INFO][04/24/2020 14:23:43] Step 30 | loss 1.6449 | lr 2.00e-04 | 0.418s per step\n" | |
] | |
} | |
], | |
"source": [ | |
"# Same measurement through the profiler_v2 module directly, with an explicit\n", | |
"# warmup() before start(). Pattern taken from the Keras callbacks module:\n", | |
"# https://github.com/tensorflow/tensorflow/blob/e02b78e9df4e74161ae9733e038fd978db75901e/tensorflow/python/keras/callbacks.py#L1706\n", | |
"profiler.warmup()\n", | |
"profiler.start(logdir='../cache/tblogdir')\n", | |
"try:\n", | |
"    bot.train(checkpoint_interval=1000, n_steps=10)\n", | |
"finally:\n", | |
"    # Always close the session, even if training fails, so that later\n", | |
"    # profiling runs in this kernel can start a new one.\n", | |
"    profiler.stop()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"Collapsed": "false" | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment