Last active
May 26, 2022 05:07
-
-
Save mohanr/5df47f748d28e7e75a261a41df5b5ff0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9229d8fd-f0a9-4d66-b7c0-8700a7076202", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%writefile task.py\n", | |
"\n", | |
"import os\n", | |
"import numpy as np\n", | |
"import tensorflow as tf\n", | |
"from tensorflow import keras\n", | |
"import time\n", | |
"\n", | |
"# At the begining of the program\n", | |
"distribution = tf.distribute.MultiWorkerMirroredStrategy()\n", | |
"\n", | |
"resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()\n", | |
"print(\"Starting task {}{}\".format(resolver.task_type, resolver.task_id))\n", | |
"\n", | |
"# Only worker #0 will write checkpoints and log to TensorBoard\n", | |
"if resolver.task_id == 0:\n", | |
" root_logdir = os.path.join(os.curdir, # os.getcwd(), #os.curdir,\n", | |
" \"task_logs/\",\n", | |
" \"psworker_logs\"\n", | |
" )\n", | |
" run_id = time.strftime(\"run_%Y_%m_%d-%H_%M_%S\")\n", | |
" run_dir = os.path.join(root_logdir, run_id)\n", | |
" callbacks = [keras.callbacks.TensorBoard(run_dir),\n", | |
" keras.callbacks.ModelCheckpoint(\"psworker_model.h5\",\n", | |
" save_best_only=True\n", | |
" ),\n", | |
" ]\n", | |
"else:\n", | |
" callbacks = []\n", | |
"\n", | |
"# load and prepare the MNIST dataset\n", | |
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n", | |
"X_train_full = X_train_full[..., np.newaxis] / 255.\n", | |
"X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n", | |
"y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n", | |
"\n", | |
"with distribution.scope():\n", | |
" model = keras.models.Sequential([\n", | |
" keras.layers.Conv2D(filters=64, kernel_size=7, activation=\"relu\",\n", | |
" padding=\"same\", input_shape=[28, 28, 1]\n", | |
" ), # (None, 28, 28, 64)\n", | |
" keras.layers.MaxPooling2D(pool_size=2), # (None, 14, 14, 64)\n", | |
"\n", | |
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n", | |
" padding=\"same\"\n", | |
" ), # (None, 14, 14, 128)\n", | |
" keras.layers.Conv2D(filters=128, kernel_size=3, activation=\"relu\",\n", | |
" padding=\"same\"\n", | |
" ),\n", | |
" keras.layers.MaxPooling2D(pool_size=2), # (None, 7, 7, 128)\n", | |
"\n", | |
" keras.layers.Flatten(), # (None, 6272)\n", | |
" keras.layers.Dense(units=64, activation='relu'), # (None, 64)\n", | |
" keras.layers.Dropout(0.5),\n", | |
" keras.layers.Dense(units=10, activation=\"softmax\"),\n", | |
" # (None, 10)\n", | |
" ])\n", | |
" model.compile(loss=\"sparse_categorical_crossentropy\",\n", | |
" optimizer=keras.optimizers.SGD(learning_rate=1e-2),\n", | |
" metrics=[\"accuracy\"]\n", | |
" )\n", | |
"\n", | |
"model.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n", | |
" epochs=10, callbacks=callbacks\n", | |
" )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "8cd842e9-4755-4c75-acd7-f88dfb9f81df", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-05-25 07:44:47.504469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.504765: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.515766: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.515872: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.517581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.517654: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.519971: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
"2022-05-25 07:44:47.520006: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
"2022-05-25 07:44:47.520607: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.520636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.522471: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.522487: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.524314: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:47.524329: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.172215: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.173450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.174346: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.175074: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n", | |
"2022-05-25 07:44:48.176202: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.177006: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.177751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.178593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.179372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.180067: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5\n", | |
"2022-05-25 07:44:48.184702: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.185967: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.186076: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n", | |
"2022-05-25 07:44:48.186390: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9901\n", | |
"2022-05-25 07:44:48.187072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.187988: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n", | |
"2022-05-25 07:44:48.190367: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.191235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.192153: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.192898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.193639: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
"2022-05-25 07:44:48.194373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:worker/replica:0/task:1/device:GPU:0 with 13823 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5\n", | |
"2022-05-25 07:44:48.199645: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:9901, 1 -> 127.0.0.1:9902}\n", | |
"2022-05-25 07:44:48.200143: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:437] Started server with target: grpc://127.0.0.1:9902\n", | |
"2022-05-25 07:44:49.314752: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n", | |
"op: \"FlatMapDataset\"\n", | |
"input: \"PrefetchDataset/_8\"\n", | |
"attr {\n", | |
" key: \"Targuments\"\n", | |
" value {\n", | |
" list {\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"_cardinality\"\n", | |
" value {\n", | |
" i: -2\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"f\"\n", | |
" value {\n", | |
" func {\n", | |
" name: \"__inference_Dataset_flat_map_slice_batch_indices_308\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"metadata\"\n", | |
" value {\n", | |
" s: \"\\n\\020FlatMapDataset:4\"\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_shapes\"\n", | |
" value {\n", | |
" list {\n", | |
" shape {\n", | |
" dim {\n", | |
" size: -1\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_types\"\n", | |
" value {\n", | |
" list {\n", | |
" type: DT_INT64\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental_type {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n", | |
"2022-05-25 07:44:49.316088: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n", | |
"op: \"FlatMapDataset\"\n", | |
"input: \"PrefetchDataset/_8\"\n", | |
"attr {\n", | |
" key: \"Targuments\"\n", | |
" value {\n", | |
" list {\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"_cardinality\"\n", | |
" value {\n", | |
" i: -2\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"f\"\n", | |
" value {\n", | |
" func {\n", | |
" name: \"__inference_Dataset_flat_map_slice_batch_indices_292\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"metadata\"\n", | |
" value {\n", | |
" s: \"\\n\\020FlatMapDataset:4\"\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_shapes\"\n", | |
" value {\n", | |
" list {\n", | |
" shape {\n", | |
" dim {\n", | |
" size: -1\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_types\"\n", | |
" value {\n", | |
" list {\n", | |
" type: DT_INT64\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental_type {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Starting task worker1\n", | |
"Epoch 1/10\n", | |
"Starting task worker0\n", | |
"Epoch 1/10\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-05-25 07:44:52.699962: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n", | |
"2022-05-25 07:44:52.706704: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1719/1719 [==============================] - ETA: 0s - loss: 0.7568 - accuracy: 0.7579" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-05-25 07:45:23.092390: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n", | |
"op: \"FlatMapDataset\"\n", | |
"input: \"PrefetchDataset/_8\"\n", | |
"attr {\n", | |
" key: \"Targuments\"\n", | |
" value {\n", | |
" list {\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"_cardinality\"\n", | |
" value {\n", | |
" i: -2\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"f\"\n", | |
" value {\n", | |
" func {\n", | |
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20246\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"metadata\"\n", | |
" value {\n", | |
" s: \"\\n\\021FlatMapDataset:31\"\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_shapes\"\n", | |
" value {\n", | |
" list {\n", | |
" shape {\n", | |
" dim {\n", | |
" size: -1\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_types\"\n", | |
" value {\n", | |
" list {\n", | |
" type: DT_INT64\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental_type {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n", | |
"2022-05-25 07:45:23.114419: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: \"FlatMapDataset/_9\"\n", | |
"op: \"FlatMapDataset\"\n", | |
"input: \"PrefetchDataset/_8\"\n", | |
"attr {\n", | |
" key: \"Targuments\"\n", | |
" value {\n", | |
" list {\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"_cardinality\"\n", | |
" value {\n", | |
" i: -2\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"f\"\n", | |
" value {\n", | |
" func {\n", | |
" name: \"__inference_Dataset_flat_map_slice_batch_indices_20288\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"metadata\"\n", | |
" value {\n", | |
" s: \"\\n\\021FlatMapDataset:31\"\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_shapes\"\n", | |
" value {\n", | |
" list {\n", | |
" shape {\n", | |
" dim {\n", | |
" size: -1\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"attr {\n", | |
" key: \"output_types\"\n", | |
" value {\n", | |
" list {\n", | |
" type: DT_INT64\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental_type {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" args {\n", | |
" type_id: TFT_DATASET\n", | |
" args {\n", | |
" type_id: TFT_PRODUCT\n", | |
" args {\n", | |
" type_id: TFT_TENSOR\n", | |
" args {\n", | |
" type_id: TFT_INT64\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
". Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n", | |
"1719/1719 [==============================] - 36s 18ms/step - loss: 0.7568 - accuracy: 0.7579 - val_loss: 0.1375 - val_accuracy: 0.9602\n", | |
"Epoch 2/10\n", | |
"Epoch 2/10\n", | |
"1719/1719 [==============================] - 29s 17ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n", | |
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.2274 - accuracy: 0.9339 - val_loss: 0.0958 - val_accuracy: 0.9726\n", | |
"Epoch 3/10\n", | |
"Epoch 3/10\n", | |
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n", | |
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1644 - accuracy: 0.9526 - val_loss: 0.0760 - val_accuracy: 0.9768\n", | |
"Epoch 4/10\n", | |
"Epoch 4/10\n", | |
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n", | |
"1719/1719 [==============================] - 31s 18ms/step - loss: 0.1326 - accuracy: 0.9613 - val_loss: 0.0631 - val_accuracy: 0.9798\n", | |
"Epoch 5/10\n", | |
"Epoch 5/10\n", | |
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n", | |
"1719/1719 [==============================] - 30s 17ms/step - loss: 0.1122 - accuracy: 0.9670 - val_loss: 0.0586 - val_accuracy: 0.9838\n", | |
"Epoch 6/10\n", | |
"Epoch 6/10\n", | |
" 443/1719 [======>.......................] - ETA: 21s - loss: 0.0998 - accuracy: 0.9720" | |
] | |
} | |
], | |
"source": [ | |
"import subprocess\n", | |
"import json\n", | |
"import os\n", | |
"\n", | |
"# tf_config_str = os.environ.get('TF_CONFIG')\n", | |
"# tf_config_dict = json.loads(tf_config_str)\n", | |
"#\n", | |
"# # Convert back to string just for pretty printing\n", | |
"# print(json.dumps(tf_config_dict, indent=2))\n", | |
"\n", | |
"import os\n", | |
"import numpy as np\n", | |
"import tensorflow as tf\n", | |
"from tensorflow import keras\n", | |
"import time\n", | |
"\n", | |
"gpus = tf.config.experimental.list_physical_devices('GPU')\n", | |
"if gpus:\n", | |
" try:\n", | |
" for gpu in gpus:\n", | |
" tf.config.experimental.set_memory_growth(gpu, True)\n", | |
" except RuntimeError as e:\n", | |
" print(e)\n", | |
"# The cluster spec is a dictionary with one key per job,\n", | |
"# and the values are lists of task addresses (IP:port)\n", | |
"cluster_spec = { \"worker\":[\"127.0.0.1:9901\",\n", | |
" \"127.0.0.1:9902\"]\n", | |
" }\n", | |
"\n", | |
"# set the TF_CONFIG environment variable before starting TensorFlow\n", | |
"# JSON-encoded dictionary containing a cluster specification (under the \"cluster\" key)\n", | |
"# and the type and index of the current task (under the \"task\" key)\n", | |
"for index, worker_address in enumerate( cluster_spec[\"worker\"] ):\n", | |
" \n", | |
" os.environ['CUDA_VISIBLE_DEVICES']=str(index)\n", | |
" os.environ[\"TF_CONFIG\"] = json.dumps( { \"cluster\":cluster_spec,\n", | |
" \"task\":{\"type\":\"worker\",\n", | |
" \"index\": index}\n", | |
" } )\n", | |
" subprocess.Popen( \"python /home/jupyter/task.py\",\n", | |
" shell = True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "c610ad33-a7c9-4694-a6b1-55bb3d03d488", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"cluster\": {\n", | |
" \"worker\": [\n", | |
" \"127.0.0.1:9901\",\n", | |
" \"127.0.0.1:9902\"\n", | |
" ]\n", | |
" },\n", | |
" \"task\": {\n", | |
" \"type\": \"worker\",\n", | |
" \"index\": 1\n", | |
" }\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"\n", | |
"tf_config_str = os.environ.get('TF_CONFIG')\n", | |
"tf_config_dict = json.loads(tf_config_str)\n", | |
"\n", | |
"# Convert back to string just for pretty printing\n", | |
"print(json.dumps(tf_config_dict, indent=2))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "97a2fa5b-aec5-4e7d-8fb2-956ef018905d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"environment": { | |
"kernel": "python3", | |
"name": "tf2-gpu.2-8.m92", | |
"type": "gcloud", | |
"uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m92" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment