# Gist by @lakshmanok, created August 28, 2018.
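# Train/eval loop for a TPUEstimator-based trainer. Assumed context, defined
# elsewhere in the trainer and not shown in this gist: `estimator` is a
# tf.contrib.tpu.TPUEstimator (or tf.estimator.Estimator), `train_input_fn`
# and `eval_input_fn` are input functions, `output_dir` is the model
# directory, `hparams` is a dict of hyperparameters, and STEPS_PER_EVAL is
# the number of training steps between evaluations.
import time

import tensorflow as tf

# A minimal sketch of the checkpoint-step helper called below; this follows
# the pattern used in the tensorflow/tpu reference models, but treat it as
# an assumption rather than part of the original snippet. It reads the
# global step out of the latest checkpoint in the directory, falling back
# to 0 when no checkpoint has been written yet.
def load_global_step_from_checkpoint_dir(checkpoint_dir):
  try:
    checkpoint_reader = tf.train.NewCheckpointReader(
        tf.train.latest_checkpoint(checkpoint_dir))
    return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
  except:  # pylint: disable=bare-except
    return 0
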
# Load the last checkpoint and start from there.
current_step = load_global_step_from_checkpoint_dir(output_dir)
max_steps = hparams['train_steps']  # total number of training steps
steps_per_epoch = hparams['num_train_images'] // hparams['train_batch_size']
tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                ' step %d.',
                max_steps,
                max_steps / steps_per_epoch,
                current_step)

start_timestamp = time.time()  # This time will include compilation time.
while current_step < max_steps:
  # Train for up to STEPS_PER_EVAL steps.
  # At the end of training, a checkpoint will be written to --model_dir.
  next_checkpoint = min(current_step + STEPS_PER_EVAL, max_steps)
  estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
  current_step = next_checkpoint
  tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                  next_checkpoint, int(time.time() - start_timestamp))

  # Evaluate the model on the most recent checkpoint in --model_dir.
  # Since evaluation happens in batches of --eval_batch_size, some images
  # may be excluded modulo the batch size. As long as the batch size is
  # consistent, the evaluated images are also consistent.
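  # For example (illustrative numbers, not from this gist): with
  # num_eval_images=10000 and eval_batch_size=1024, evaluation runs
  # 10000 // 1024 = 9 steps, so the final 10000 - 9*1024 = 784 images
  # are never seen by the evaluator.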
  tf.logging.info('Starting to evaluate at step %d', next_checkpoint)
  eval_results = estimator.evaluate(
      input_fn=eval_input_fn,
      steps=hparams['num_eval_images'] // hparams['eval_batch_size'])
  tf.logging.info('Eval results at step %d: %s', next_checkpoint, eval_results)

elapsed_time = int(time.time() - start_timestamp)
tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                max_steps, elapsed_time)
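
# Note: the explicit train/evaluate loop (rather than a single
# estimator.train() call) is what interleaves periodic evaluation with
# training: each pass trains up to the next checkpoint boundary and then
# evaluates the checkpoint that was just written.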