Last active
January 21, 2021 18:04
-
-
Save MikeOfZen/abadf58b9c68acd1b33c6e39af7b3f7a to your computer and use it in GitHub Desktop.
TPU training error .ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "TPU training error .ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"machine_shape": "hm", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "TPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/MikeOfZen/abadf58b9c68acd1b33c6e39af7b3f7a/catsdogs-transfer-learning-inception-tpu-tensorboard.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "P8HTG5Ejqqrq", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"USING_TPU=True #Change this to switch between gpu and tpu (also must be changed for the notebook settings offcours)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gP7aBI2b-EQx", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from google.colab import auth\n", | |
"auth.authenticate_user()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eALMi4wB9fRO", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"%tensorflow_version 2.x\n", | |
"#must use tf 1.x to use TPU properly\n", | |
"import tensorflow as tf\n", | |
"import matplotlib.pyplot as plt\n", | |
"import tensorflow_datasets as tfds\n", | |
"import os\n", | |
"import IPython\n", | |
"print(\"TF version:\",tf.version.GIT_VERSION, tf.version.VERSION)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Z0fIuiLOL--R", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#constants\n", | |
"IMG_HEIGHT=IMG_WIDTH=299\n", | |
"\n", | |
"DATASET_SIZE=23262\n", | |
"BATCH_SIZE=32\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "x2O14Njm-90k", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Dataset init" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "H561tB1H_P1R", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"@tf.function\n", | |
"def convert(image,label):\n", | |
" return (tf.image.convert_image_dtype(image, tf.float32),tf.expand_dims(tf.cast(label,tf.float32),0))\n", | |
"@tf.function\n", | |
"def resize(image,label):\n", | |
" return (tf.image.resize(image,(IMG_HEIGHT,IMG_WIDTH)),label)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "iKEc3TzT-WND", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"test_split, valid_split, train_split = tfds.Split.TRAIN.subsplit([10, 10, 80])\n", | |
"\n", | |
"#THE DATASET BUCKET MUST BE CHANGED FOR IT TO RUN!!!!!!!!!!!\n", | |
"train_ds = (tfds.load(\"cats_vs_dogs\", split=train_split,data_dir=\"gs://datasets_bucket_a/tmp/\", as_supervised=True) \n", | |
" .map(convert)\n", | |
" .map(resize)\n", | |
" .batch(BATCH_SIZE))\n", | |
"\n", | |
"validation_ds = (tfds.load(\"cats_vs_dogs\", split=valid_split,data_dir=\"gs://datasets_bucket_a/tmp/\", as_supervised=True) \n", | |
" .map(convert)\n", | |
" .map(resize)\n", | |
" .batch(BATCH_SIZE))#.cache()\n", | |
"\n", | |
"test_ds = (tfds.load(\"cats_vs_dogs\", split=test_split,data_dir=\"gs://datasets_bucket_a/tmp/\", as_supervised=True) \n", | |
" .map(convert)\n", | |
" .map(resize)\n", | |
" .batch(BATCH_SIZE))#.cache()\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nx-smnX_hgf5", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"training_batches=int(DATASET_SIZE*0.8/BATCH_SIZE)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ENcdVMAP-xpt", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"CLASSES=[\"Cat\",\"Dog\"]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Qh8yrs0s_Qn2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"@tf.function\n", | |
"def augment(images,labels):\n", | |
" #mirror\n", | |
" images=tf.image.random_flip_left_right(images)\n", | |
" #adjust contrast\n", | |
" images=tf.image.random_contrast(images, lower=0.5, upper=1.5)\n", | |
" images=tf.image.random_brightness(images, max_delta=0.2)\n", | |
" images=tf.image.random_hue(images,0.1)\n", | |
" images=tf.image.random_saturation(images,0.8,1.2)\n", | |
" images=tf.clip_by_value(images,0,1) #clipping is required as some of these functions seems to go out of bounds [0..1]\n", | |
" return images,labels" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "aYeI4RPL_YYC", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Prepeare training set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bofEKw7W_dJS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"train_ds_aug=(\n", | |
" train_ds\n", | |
" .take(training_batches) #an attempt to solve dataset cardinality problem, doesnt affect the issue\n", | |
" #.cache() #must be disabled for TF 2 to work, in TPU setting\n", | |
" .repeat()\n", | |
" #.shuffle(30)\n", | |
" .map(augment)\n", | |
" .prefetch(2))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "m-P__gd-_AAY", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Service func" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "sTbcgKJC-8FE", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def show_item(ds):\n", | |
" item=next(iter(ds.take(1)))\n", | |
" plt.imshow(item[0])\n", | |
" _=plt.title(f\"It's a {CLASSES[item[1].numpy()]}\")\n", | |
"def show_batch(ds):\n", | |
" image_batch, label_batch=next(iter(ds))\n", | |
" plt.figure(figsize=(10,10))\n", | |
" for n in range(25):\n", | |
" ax = plt.subplot(5,5,n+1)\n", | |
" plt.imshow(image_batch[n])\n", | |
" plt.axis('off')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "YgdaK1Sw_IiJ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Model definition" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "pZcHLuunwdlU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#snippet to select layers above chosen layer.\n", | |
"def get_layers_above(cutoff_layer,model):\n", | |
"\n", | |
" def get_next_level(layer,model):\n", | |
" def wrap_list(val):\n", | |
" if type(val) is list:\n", | |
" return val\n", | |
" return [val] \n", | |
" r=[]\n", | |
" for output_t in wrap_list(layer.output):\n", | |
" r+=[x for x in model.layers if output_t.name in [y.name for y in wrap_list(x.input)]]\n", | |
" return r\n", | |
"\n", | |
" visited=set()\n", | |
" to_visit=set([cutoff_layer])\n", | |
"\n", | |
" while to_visit:\n", | |
" layer=to_visit.pop()\n", | |
" to_visit.update(get_next_level(layer,model))\n", | |
" visited.add(layer)\n", | |
" return list(visited)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "h292y488KUgU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def create_model():\n", | |
" base_model = tf.keras.applications.InceptionV3(include_top=False,weights='imagenet',input_shape=(IMG_HEIGHT,IMG_WIDTH,3))\n", | |
" upper_layers=get_layers_above(base_model.get_layer('mixed8'),base_model)\n", | |
"\n", | |
" for layer in base_model.layers:\n", | |
" layer.trainable = False\n", | |
" for layer in upper_layers:\n", | |
" layer.trainable=True\n", | |
"\n", | |
" x = tf.keras.layers.Conv2D(448,4,2)(base_model.output)\n", | |
" x = tf.keras.layers.Flatten()(x)\n", | |
" x = tf.keras.layers.Dense(128, activation='relu')(x)\n", | |
" x = tf.keras.layers.Dropout(0.2)(x) \n", | |
" x = tf.keras.layers.Dense (1, activation='sigmoid')(x) \n", | |
"\n", | |
" model = tf.keras.Model(base_model.input, x)\n", | |
" return model" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IyyQ_xEIQ7KD", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#cpu_model=create_model()\n", | |
"#cpu_model.summary()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2aYipz5OJgVB", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#_=tf.keras.utils.plot_model(cpu_model, to_file=\"full_model.png\", show_shapes=True)\n", | |
"#IPython.display.Image(\"model.png\")" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0MWlLSLpR0DS", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# TPU config" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WN_0FujNM6Rb", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if USING_TPU:\n", | |
" try:\n", | |
" os.environ['COLAB_TPU_ADDR']\n", | |
" print(\"TPU Found: \"+os.environ['COLAB_TPU_ADDR'])\n", | |
" except KeyError:\n", | |
" print(\"Must load TPU\")" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cqQbM792R7P4", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if USING_TPU:\n", | |
" resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])\n", | |
" tf.config.experimental_connect_to_cluster(resolver)\n", | |
" tf.tpu.experimental.initialize_tpu_system(resolver)\n", | |
" strategy = tf.distribute.experimental.TPUStrategy(resolver)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jHk1qIhqKb0b", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"if USING_TPU:\n", | |
" with strategy.scope():\n", | |
" model=create_model()\n", | |
" model.compile(\n", | |
" optimizer=tf.keras.optimizers.Adam(),\n", | |
" loss=\"binary_crossentropy\",\n", | |
" metrics=['acc'])\n", | |
"else:\n", | |
" model=create_model()\n", | |
" model.compile(\n", | |
" optimizer=tf.keras.optimizers.Adam(),\n", | |
" loss=\"binary_crossentropy\",\n", | |
" metrics=['acc'])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "58scjZ2eSFgx", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# TRAINING" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "KoTruG99SEhX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"model.fit(train_ds_aug,epochs=4,steps_per_epoch=training_batches)#,validation_data=validation_ds,validation_steps=10,validation_freq=5)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "n3JoQXmETqSc", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"model.evaluate(test_ds)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment