Created
September 30, 2019 09:37
-
-
Save ChuaCheowHuan/b6d903919547bc713aeaac97a7e1ead9 to your computer and use it in GitHub Desktop.
smk ray 0.6.5 custom env output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Creating tmp8z_8haic_algo-1-vmd10_1 ... | |
Attaching to tmp8z_8haic_algo-1-vmd10_12mdone | |
algo-1-vmd10_1 | 2019-09-30 07:54:22,885 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training | |
algo-1-vmd10_1 | 2019-09-30 07:54:22,892 sagemaker-containers INFO No GPUs detected (normal if no gpus installed) | |
algo-1-vmd10_1 | 2019-09-30 07:54:23,011 sagemaker-containers INFO No GPUs detected (normal if no gpus installed) | |
algo-1-vmd10_1 | 2019-09-30 07:54:23,031 sagemaker-containers INFO No GPUs detected (normal if no gpus installed) | |
algo-1-vmd10_1 | 2019-09-30 07:54:23,045 sagemaker-containers INFO Invoking user script | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | Training Env: | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | { | |
algo-1-vmd10_1 | "additional_framework_parameters": { | |
algo-1-vmd10_1 | "sagemaker_estimator": "RLEstimator" | |
algo-1-vmd10_1 | }, | |
algo-1-vmd10_1 | "channel_input_dirs": {}, | |
algo-1-vmd10_1 | "current_host": "algo-1-vmd10", | |
algo-1-vmd10_1 | "framework_module": "sagemaker_tensorflow_container.training:main", | |
algo-1-vmd10_1 | "hosts": [ | |
algo-1-vmd10_1 | "algo-1-vmd10" | |
algo-1-vmd10_1 | ], | |
algo-1-vmd10_1 | "hyperparameters": { | |
algo-1-vmd10_1 | "s3_bucket": "sagemaker-us-west-2-123456789012", | |
algo-1-vmd10_1 | "rl.training.stop.training_iteration": 2, | |
algo-1-vmd10_1 | "rl.training.checkpoint_freq": 2 | |
algo-1-vmd10_1 | }, | |
algo-1-vmd10_1 | "input_config_dir": "/opt/ml/input/config", | |
algo-1-vmd10_1 | "input_data_config": {}, | |
algo-1-vmd10_1 | "input_dir": "/opt/ml/input", | |
algo-1-vmd10_1 | "is_master": true, | |
algo-1-vmd10_1 | "job_name": "ArrivalSim-2019-09-30-07-53-33-200", | |
algo-1-vmd10_1 | "log_level": 20, | |
algo-1-vmd10_1 | "master_hostname": "algo-1-vmd10", | |
algo-1-vmd10_1 | "model_dir": "/opt/ml/model", | |
algo-1-vmd10_1 | "module_dir": "s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz", | |
algo-1-vmd10_1 | "module_name": "mod_op_train", | |
algo-1-vmd10_1 | "network_interface_name": "eth0", | |
algo-1-vmd10_1 | "num_cpus": 2, | |
algo-1-vmd10_1 | "num_gpus": 0, | |
algo-1-vmd10_1 | "output_data_dir": "/opt/ml/output/data", | |
algo-1-vmd10_1 | "output_dir": "/opt/ml/output", | |
algo-1-vmd10_1 | "output_intermediate_dir": "/opt/ml/output/intermediate", | |
algo-1-vmd10_1 | "resource_config": { | |
algo-1-vmd10_1 | "current_host": "algo-1-vmd10", | |
algo-1-vmd10_1 | "hosts": [ | |
algo-1-vmd10_1 | "algo-1-vmd10" | |
algo-1-vmd10_1 | ] | |
algo-1-vmd10_1 | }, | |
algo-1-vmd10_1 | "user_entry_point": "mod_op_train.py" | |
algo-1-vmd10_1 | } | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | Environment variables: | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | SM_HOSTS=["algo-1-vmd10"] | |
algo-1-vmd10_1 | SM_NETWORK_INTERFACE_NAME=eth0 | |
algo-1-vmd10_1 | SM_HPS={"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"} | |
algo-1-vmd10_1 | SM_USER_ENTRY_POINT=mod_op_train.py | |
algo-1-vmd10_1 | SM_FRAMEWORK_PARAMS={"sagemaker_estimator":"RLEstimator"} | |
algo-1-vmd10_1 | SM_RESOURCE_CONFIG={"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]} | |
algo-1-vmd10_1 | SM_INPUT_DATA_CONFIG={} | |
algo-1-vmd10_1 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data | |
algo-1-vmd10_1 | SM_CHANNELS=[] | |
algo-1-vmd10_1 | SM_CURRENT_HOST=algo-1-vmd10 | |
algo-1-vmd10_1 | SM_MODULE_NAME=mod_op_train | |
algo-1-vmd10_1 | SM_LOG_LEVEL=20 | |
algo-1-vmd10_1 | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main | |
algo-1-vmd10_1 | SM_INPUT_DIR=/opt/ml/input | |
algo-1-vmd10_1 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config | |
algo-1-vmd10_1 | SM_OUTPUT_DIR=/opt/ml/output | |
algo-1-vmd10_1 | SM_NUM_CPUS=2 | |
algo-1-vmd10_1 | SM_NUM_GPUS=0 | |
algo-1-vmd10_1 | SM_MODEL_DIR=/opt/ml/model | |
algo-1-vmd10_1 | SM_MODULE_DIR=s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz | |
algo-1-vmd10_1 | SM_TRAINING_ENV={"additional_framework_parameters":{"sagemaker_estimator":"RLEstimator"},"channel_input_dirs":{},"current_host":"algo-1-vmd10","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-vmd10"],"hyperparameters":{"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"ArrivalSim-2019-09-30-07-53-33-200","log_level":20,"master_hostname":"algo-1-vmd10","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz","module_name":"mod_op_train","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]},"user_entry_point":"mod_op_train.py"} | |
algo-1-vmd10_1 | SM_USER_ARGS=["--rl.training.checkpoint_freq","2","--rl.training.stop.training_iteration","2","--s3_bucket","sagemaker-us-west-2-123456789012"] | |
algo-1-vmd10_1 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate | |
algo-1-vmd10_1 | SM_HP_S3_BUCKET=sagemaker-us-west-2-123456789012 | |
algo-1-vmd10_1 | SM_HP_RL.TRAINING.STOP.TRAINING_ITERATION=2 | |
algo-1-vmd10_1 | SM_HP_RL.TRAINING.CHECKPOINT_FREQ=2 | |
algo-1-vmd10_1 | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python36.zip:/usr/lib/python3.6:/usr/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/dist-packages:/usr/lib/python3/dist-packages | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | Invoking script with the following command: | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | /usr/bin/python mod_op_train.py --rl.training.checkpoint_freq 2 --rl.training.stop.training_iteration 2 --s3_bucket sagemaker-us-west-2-123456789012 | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | {'monitor': False, 'log_level': 'INFO', 'callbacks': {'on_episode_start': None, 'on_episode_step': None, 'on_episode_end': None, 'on_sample_end': None, 'on_train_result': None}, 'ignore_worker_failures': False, 'model': {'conv_filters': None, 'conv_activation': 'relu', 'fcnet_activation': 'tanh', 'fcnet_hiddens': [256, 256], 'free_log_std': False, 'squash_to_range': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_preprocessor': None, 'custom_model': None, 'custom_options': {}}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'env_config': {}, 'env': None, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'num_workers': 2, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'num_envs_per_worker': 1, 'sample_batch_size': 200, 'train_batch_size': 4000, 'batch_mode': 'truncate_episodes', 'sample_async': False, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_evaluator_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'async_remote_worker_envs': False, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policy_graphs': {}, 'policy_mapping_fn': None, 'policies_to_train': None}, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 128, 'num_sgd_iter': 30, 'lr': 5e-05, 'lr_schedule': None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': False, 'straggler_mitigation': False} | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,715 WARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes. | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,716 INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-30_07-54-30_51/logs. | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,823 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:45224 to respond... | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,934 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:39871 to respond... | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,936 INFO services.py:760 -- Starting Redis shard with 0.83 GB max memory. | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,951 WARNING services.py:1261 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'. | |
algo-1-vmd10_1 | 2019-09-30 07:54:30,951 INFO services.py:1384 -- Starting the Plasma object store with 1.24 GB memory using /tmp. | |
algo-1-vmd10_1 | Running experiment with config { | |
algo-1-vmd10_1 | "training": { | |
algo-1-vmd10_1 | "env": "ArrivalSim-v0", | |
algo-1-vmd10_1 | "run": "PPO", | |
algo-1-vmd10_1 | "stop": { | |
algo-1-vmd10_1 | "training_iteration": 2 | |
algo-1-vmd10_1 | }, | |
algo-1-vmd10_1 | "local_dir": "/opt/ml/output/intermediate", | |
algo-1-vmd10_1 | "checkpoint_freq": 10, | |
algo-1-vmd10_1 | "config": { | |
algo-1-vmd10_1 | "num_workers": 1, | |
algo-1-vmd10_1 | "train_batch_size": 128, | |
algo-1-vmd10_1 | "sample_batch_size": 32, | |
algo-1-vmd10_1 | "optimizer": { | |
algo-1-vmd10_1 | "grads_per_step": 10 | |
algo-1-vmd10_1 | } | |
algo-1-vmd10_1 | }, | |
algo-1-vmd10_1 | "checkpoint_at_end": true | |
algo-1-vmd10_1 | } | |
algo-1-vmd10_1 | } | |
algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:64 -- Did not find checkpoint file in /opt/ml/output/intermediate/training. | |
algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:211 -- Starting a new experiment. | |
algo-1-vmd10_1 | == Status == | |
algo-1-vmd10_1 | Using FIFO scheduling algorithm. | |
algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs | |
algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | == Status == | |
algo-1-vmd10_1 | Using FIFO scheduling algorithm. | |
algo-1-vmd10_1 | Resources requested: 2/3 CPUs, 0/0 GPUs | |
algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB | |
algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training | |
algo-1-vmd10_1 | Number of trials: 1 ({'RUNNING': 1}) | |
algo-1-vmd10_1 | RUNNING trials: | |
algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: RUNNING | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,765 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False). | |
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,776 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors) | |
algo-1-vmd10_1 | (pid=72) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. | |
algo-1-vmd10_1 | (pid=72) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. " | |
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:40,860 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0'] | |
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:44,007 INFO ppo.py:105 -- Important! Since 0.7.0, observation normalization is no longer enabled by default. To enable running-mean normalization, set 'observation_filter': 'MeanStdFilter'. You can ignore this message if your environment doesn't require observation normalization. | |
algo-1-vmd10_1 | (pid=97) 2019-09-30 07:54:48,310 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors) | |
algo-1-vmd10_1 | (pid=97) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. | |
algo-1-vmd10_1 | (pid=97) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. " | |
algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306 | |
algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306 | |
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,) | |
algo-1-vmd10_1 | (pid=97) [1.83663046] 0.2754945689088046 False {} | |
algo-1-vmd10_1 | (pid=97) price b = 1.8366304593920306 | |
algo-1-vmd10_1 | (pid=97) price a = 1.5866304593920306 | |
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,) | |
algo-1-vmd10_1 | (pid=97) [1.58663046] 0.2538608735027249 False {} | |
algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306 | |
algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306 | |
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,) | |
algo-1-vmd10_1 | (pid=97) [1.83663046] 0.23876195972096398 False {} | |
algo-1-vmd10_1 | Result for PPO_ArrivalSim-v0_0: | |
algo-1-vmd10_1 | custom_metrics: {} | |
algo-1-vmd10_1 | date: 2019-09-30_07-54-52 | |
algo-1-vmd10_1 | done: true | |
algo-1-vmd10_1 | episode_len_mean: 13.9375 | |
algo-1-vmd10_1 | episode_reward_max: 15.623709832898886 | |
algo-1-vmd10_1 | episode_reward_mean: 2.1431919362241683 | |
algo-1-vmd10_1 | episode_reward_min: 0.0 | |
algo-1-vmd10_1 | episodes_this_iter: 8 | |
algo-1-vmd10_1 | episodes_total: 16 | |
algo-1-vmd10_1 | experiment_id: e401acafb745453a93cd07c23a49719a | |
algo-1-vmd10_1 | hostname: 912222cf3d36 | |
algo-1-vmd10_1 | info: | |
algo-1-vmd10_1 | default: | |
algo-1-vmd10_1 | cur_kl_coeff: 0.30000001192092896 | |
algo-1-vmd10_1 | cur_lr: 4.999999873689376e-05 | |
algo-1-vmd10_1 | entropy: 1.0779298543930054 | |
algo-1-vmd10_1 | kl: 3.3599588871002197 | |
algo-1-vmd10_1 | policy_loss: -0.00849771499633789 | |
algo-1-vmd10_1 | total_loss: 9.052791595458984 | |
algo-1-vmd10_1 | vf_explained_var: 0.06744426488876343 | |
algo-1-vmd10_1 | vf_loss: 8.053301811218262 | |
algo-1-vmd10_1 | grad_time_ms: 1050.643 | |
algo-1-vmd10_1 | load_time_ms: 34.049 | |
algo-1-vmd10_1 | num_steps_sampled: 256 | |
algo-1-vmd10_1 | num_steps_trained: 256 | |
algo-1-vmd10_1 | sample_time_ms: 2921.621 | |
algo-1-vmd10_1 | update_time_ms: 214.194 | |
algo-1-vmd10_1 | iterations_since_restore: 2 | |
algo-1-vmd10_1 | node_ip: 172.18.0.2 | |
algo-1-vmd10_1 | num_healthy_workers: 1 | |
algo-1-vmd10_1 | num_metric_batches_dropped: 0 | |
algo-1-vmd10_1 | off_policy_estimator: {} | |
algo-1-vmd10_1 | pid: 72 | |
algo-1-vmd10_1 | policy_reward_mean: {} | |
algo-1-vmd10_1 | time_since_restore: 8.488733053207397 | |
algo-1-vmd10_1 | time_this_iter_s: 1.0875024795532227 | |
algo-1-vmd10_1 | time_total_s: 8.488733053207397 | |
algo-1-vmd10_1 | timestamp: 1569830092 | |
algo-1-vmd10_1 | timesteps_since_restore: 256 | |
algo-1-vmd10_1 | timesteps_this_iter: 128 | |
algo-1-vmd10_1 | timesteps_total: 256 | |
algo-1-vmd10_1 | training_iteration: 2 | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | 2019-09-30 07:54:52,557 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads. | |
algo-1-vmd10_1 | == Status == | |
algo-1-vmd10_1 | Using FIFO scheduling algorithm. | |
algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs | |
algo-1-vmd10_1 | Memory usage on this node: 2.0/4.1 GB | |
algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training | |
algo-1-vmd10_1 | Number of trials: 1 ({'TERMINATED': 1}) | |
algo-1-vmd10_1 | TERMINATED trials: | |
algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: TERMINATED, [2 CPUs, 0 GPUs], [pid=72], 8 s, 2 iter, 256 ts, 2.14 rew | |
algo-1-vmd10_1 | | |
algo-1-vmd10_1 | Saved model configuration. | |
algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2 as /opt/ml/model/checkpoint | |
algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2.tune_metadata as /opt/ml/model/checkpoint.tune_metadata | |
algo-1-vmd10_1 | 2019-09-30 07:54:57,605 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False). | |
algo-1-vmd10_1 | 2019-09-30 07:54:57,607 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors) | |
algo-1-vmd10_1 | /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. | |
algo-1-vmd10_1 | "Converting sparse IndexedSlices to a dense Tensor of unknown shape. " | |
algo-1-vmd10_1 | 2019-09-30 07:54:58,769 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0'] | |
algo-1-vmd10_1 | (pid=73) 2019-09-30 07:54:59,065 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors) | |
algo-1-vmd10_1 | (pid=73) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. | |
algo-1-vmd10_1 | (pid=73) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. " | |
algo-1-vmd10_1 | Saved TensorFlow serving model! | |
algo-1-vmd10_1 | 2019-09-30 07:55:03,775 sagemaker-containers INFO Reporting training SUCCESS | |
tmp8z_8haic_algo-1-vmd10_1 exited with code 0 | |
Aborting on container exit... | |
--------------------------------------------------------------------------- | |
PermissionError Traceback (most recent call last) | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size) | |
28 try: | |
---> 29 fsrc = open(src, 'rb') | |
30 except OSError as e: | |
PermissionError: [Errno 13] Permission denied: '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log' | |
During handling of the above exception, another exception occurred: | |
DistutilsFileError Traceback (most recent call last) | |
<ipython-input-5-abacdc7913fc> in <module>() | |
34 ) | |
35 | |
---> 36 estimator.fit() | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name) | |
337 self._prepare_for_training(job_name=job_name) | |
338 | |
--> 339 self.latest_training_job = _TrainingJob.start_new(self, inputs) | |
340 if wait: | |
341 self.latest_training_job.wait(logs=logs) | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs) | |
856 cls._add_spot_checkpoint_args(local_mode, estimator, train_args) | |
857 | |
--> 858 estimator.sagemaker_session.train(**train_args) | |
859 | |
860 return cls(estimator.sagemaker_session, estimator._current_job_name) | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path) | |
390 LOGGER.info("Creating training-job with name: %s", job_name) | |
391 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4)) | |
--> 392 self.sagemaker_client.create_training_job(**train_request) | |
393 | |
394 def compile_model( | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs) | |
99 training_job = _LocalTrainingJob(container) | |
100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {} | |
--> 101 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName) | |
102 | |
103 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name) | |
87 | |
88 self.model_artifacts = self.container.train( | |
---> 89 input_data_config, output_data_config, hyperparameters, job_name | |
90 ) | |
91 self.end_time = datetime.datetime.now() | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name) | |
153 raise RuntimeError(msg) | |
154 finally: | |
--> 155 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name) | |
156 | |
157 # free up the training data directory as it may contain | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in retrieve_artifacts(self, compose_data, output_data_config, job_name) | |
253 sagemaker.local.utils.recursive_copy(host_dir, model_artifacts) | |
254 elif container_dir == "/opt/ml/output": | |
--> 255 sagemaker.local.utils.recursive_copy(host_dir, output_artifacts) | |
256 | |
257 # Tar Artifacts -> model.tar.gz and output.tar.gz | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/utils.py in recursive_copy(source, destination) | |
82 """ | |
83 if os.path.isdir(source): | |
---> 84 copy_tree(source, destination) | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run) | |
157 copy_tree(src_name, dst_name, preserve_mode, | |
158 preserve_times, preserve_symlinks, update, | |
--> 159 verbose=verbose, dry_run=dry_run)) | |
160 else: | |
161 copy_file(src_name, dst_name, preserve_mode, | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run) | |
157 copy_tree(src_name, dst_name, preserve_mode, | |
158 preserve_times, preserve_symlinks, update, | |
--> 159 verbose=verbose, dry_run=dry_run)) | |
160 else: | |
161 copy_file(src_name, dst_name, preserve_mode, | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run) | |
157 copy_tree(src_name, dst_name, preserve_mode, | |
158 preserve_times, preserve_symlinks, update, | |
--> 159 verbose=verbose, dry_run=dry_run)) | |
160 else: | |
161 copy_file(src_name, dst_name, preserve_mode, | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run) | |
161 copy_file(src_name, dst_name, preserve_mode, | |
162 preserve_times, update, verbose=verbose, | |
--> 163 dry_run=dry_run) | |
164 outputs.append(dst_name) | |
165 | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in copy_file(src, dst, preserve_mode, preserve_times, update, link, verbose, dry_run) | |
149 # Otherwise (non-Mac, not linking), copy the file contents and | |
150 # (optionally) copy the times and mode. | |
--> 151 _copy_file_contents(src, dst) | |
152 if preserve_mode or preserve_times: | |
153 st = os.stat(src) | |
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size) | |
29 fsrc = open(src, 'rb') | |
30 except OSError as e: | |
---> 31 raise DistutilsFileError("could not open '%s': %s" % (src, e.strerror)) | |
32 | |
33 if os.path.exists(dst): | |
DistutilsFileError: could not open '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log': Permission denied |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment