-
-
Save jfsantos/bc37eb9bc506a6f165152d676e3ccd42 to your computer and use it in GitHub Desktop.
''' | |
A logistic regression example using the meta-graph checkpointing | |
features of Tensorflow. | |
Author: João Felipe Santos, based on code by Aymeric Damien | |
(https://github.com/aymericdamien/TensorFlow-Examples/) | |
''' | |
from __future__ import print_function | |
import tensorflow as tf | |
import numpy as np | |
import argparse | |
# Import MNIST data | |
from tensorflow.examples.tutorials.mnist import input_data | |
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True) | |
# Parameters | |
learning_rate = 0.01 | |
batch_size = 100 | |
display_step = 1 | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--load', default=False) | |
parser.add_argument('--max_epochs', type=int, default=5) | |
args = parser.parse_args() | |
load = args.load | |
training_epochs = args.max_epochs | |
# Instantiate saver | |
if not load: | |
# tf Graph Input | |
x = tf.placeholder(tf.float32, [None, 784], name='x') # mnist data image of shape 28*28=784 | |
y = tf.placeholder(tf.float32, [None, 10], name='y') # 0-9 digits recognition => 10 classes | |
# Set model weights | |
W = tf.Variable(tf.zeros([784, 10]), name='W') | |
b = tf.Variable(tf.zeros([10]), name='b') | |
# Construct model | |
pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax | |
# Minimize error using cross entropy | |
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1)) | |
# Gradient Descent | |
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) | |
init = tf.initialize_all_variables() | |
saver = tf.train.Saver() | |
# In order to be able to easily retrieve variables and ops later, | |
# we add them to collections | |
tf.add_to_collection('train_op', optimizer) | |
tf.add_to_collection('cost_op', cost) | |
tf.add_to_collection('input', x) | |
tf.add_to_collection('target', y) | |
tf.add_to_collection('pred', pred) | |
initial_epoch = 0 | |
else: | |
# Find last executed epoch | |
from glob import glob | |
history = list(map(lambda x: int(x.split('-')[1][:-5]), glob('model.ckpt-*.meta'))) | |
last_epoch = np.max(history) | |
# Instantiate saver object using previously saved meta-graph | |
saver = tf.train.import_meta_graph('model.ckpt-{}.meta'.format(last_epoch)) | |
initial_epoch = last_epoch + 1 | |
# Launch the graph | |
with tf.Session() as sess: | |
if not load: | |
sess.run(init) | |
else: | |
saver.restore(sess, 'model.ckpt') | |
optimizer = tf.get_collection('train_op')[0] | |
cost = tf.get_collection('cost_op')[0] | |
x = tf.get_collection('input')[0] | |
y = tf.get_collection('target')[0] | |
pred = tf.get_collection('pred')[0] | |
# Training cycle | |
for epoch in range(initial_epoch, training_epochs): | |
avg_cost = 0. | |
total_batch = int(mnist.train.num_examples/batch_size) | |
# Loop over all batches | |
for i in range(total_batch): | |
batch_xs, batch_ys = mnist.train.next_batch(batch_size) | |
# Run optimization op (backprop) and cost op (to get loss value) | |
_, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, | |
y: batch_ys}) | |
# Compute average loss | |
avg_cost += c / total_batch | |
# Display logs per epoch step | |
if (epoch+1) % display_step == 0: | |
print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost)) | |
saver.save(sess, 'model.ckpt', global_step=epoch) | |
print("Optimization Finished!") | |
# Test model | |
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)) | |
# Calculate accuracy | |
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) | |
print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})) |
Thanks for this. This is useful.
However, I get an error while trying to resume training after checkpointing.
Here's the stack trace. FYI - I added a few print statements to print out the last epoch, and changed the model name. Just cosmetic.
I see that the .meta files are there, and so are the index files, etc.
FWIW, this is TensorFlow 1.1.0
Any ideas what's incorrect??
tf1 ▶ ~ ▶ Developer ❯ courses ❯ self_driving_car ❯ semantic_segment ▶ $ ▶ python logistic_regression_checkpointing.py --load True --max_epochs 10
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Looking for existing checkpoint files...
history: [0, 1, 2, 3, 4]
last epoch: 4
last checkpoint: ./my_model-4.meta
2017-08-30 12:03:12.359184: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359223: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359229: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359234: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.359238: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-08-30 12:03:12.820262: E tensorflow/stream_executor/cuda/cuda_driver.cc:405] failed call to cuInit: CUDA_ERROR_NO_DEVICE
2017-08-30 12:03:12.820595: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:158] retrieving CUDA diagnostic information for host: Atuls-MacBook-Pro.local
2017-08-30 12:03:12.820608: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:165] hostname: Atuls-MacBook-Pro.local
2017-08-30 12:03:12.820820: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:189] libcuda reported version is: 310.42.25
2017-08-30 12:03:12.821011: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:193] kernel reported version is: Invalid argument: expected %d.%d or %d.%d.%d form for driver version; got ""
-- Restoring model --
2017-08-30 12:03:12.880175: W tensorflow/core/framework/op_kernel.cc:1152] Not found: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
2017-08-30 12:03:12.880800: W tensorflow/core/framework/op_kernel.cc:1152] Not found: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
Traceback (most recent call last):
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1039, in _do_call
return fn(*args)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1021, in _run_fn
status, run_metadata)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/contextlib.py", line 66, in __exit__
next(self.gen)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
[[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "logistic_regression_checkpointing.py", line 96, in <module>
saver.restore(sess, MODEL_FILE)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1457, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 778, in run
run_metadata_ptr)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 982, in _run
feed_dict_string, options, run_metadata)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1032, in _do_run
target_list, options, run_metadata)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
[[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]
Caused by op 'save/RestoreV2', defined at:
File "logistic_regression_checkpointing.py", line 87, in <module>
saver = tf.train.import_meta_graph(latest_ckpt_name_base + '.meta')
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1595, in import_meta_graph
**kwargs)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/meta_graph.py", line 499, in import_scoped_meta_graph
producer_op_list=producer_op_list)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/importer.py", line 308, in import_graph_def
op_def=op_def)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/Users/aa/Developer/miniconda/envs/tf1/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()
NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for my_model
[[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]
Do the parameters of the optimizer get restored, or does the optimizer's initialization op just reset it to its state at the beginning of training?
I ran this file using both commands as mentioned in directions but I ran into an error and then I realized that there is an error in the logic as well. You have not given any information in the saver.restore() function which can help in loading the last saved checkpoint. Line 75 in the file should be changed to the following:
saver.restore(sess, tf.train.latest_checkpoint('./'))
Otherwise, you will get an error when running the second command. Thanks.
How to try this:
1. Run the script a first time (`python logistic_regression_with_checkpointing.py`). It will run for 5 epochs and save checkpoints for each epoch.
2. Run the script again with `python logistic_regression_with_checkpointing.py --load True --max_epochs 10`. The script will detect it has already trained for 5 epochs, and run for another 5 epochs.