Skip to content

Instantly share code, notes, and snippets.

@ChuaCheowHuan
Created September 30, 2019 09:37
Show Gist options
  • Save ChuaCheowHuan/b6d903919547bc713aeaac97a7e1ead9 to your computer and use it in GitHub Desktop.
Save ChuaCheowHuan/b6d903919547bc713aeaac97a7e1ead9 to your computer and use it in GitHub Desktop.
SageMaker (local mode) Ray 0.6.5 custom environment training output
Creating tmp8z_8haic_algo-1-vmd10_1 ...
Attaching to tmp8z_8haic_algo-1-vmd10_1 ... done
algo-1-vmd10_1 | 2019-09-30 07:54:22,885 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training
algo-1-vmd10_1 | 2019-09-30 07:54:22,892 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1 | 2019-09-30 07:54:23,011 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1 | 2019-09-30 07:54:23,031 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1 | 2019-09-30 07:54:23,045 sagemaker-containers INFO Invoking user script
algo-1-vmd10_1 |
algo-1-vmd10_1 | Training Env:
algo-1-vmd10_1 |
algo-1-vmd10_1 | {
algo-1-vmd10_1 | "additional_framework_parameters": {
algo-1-vmd10_1 | "sagemaker_estimator": "RLEstimator"
algo-1-vmd10_1 | },
algo-1-vmd10_1 | "channel_input_dirs": {},
algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
algo-1-vmd10_1 | "framework_module": "sagemaker_tensorflow_container.training:main",
algo-1-vmd10_1 | "hosts": [
algo-1-vmd10_1 | "algo-1-vmd10"
algo-1-vmd10_1 | ],
algo-1-vmd10_1 | "hyperparameters": {
algo-1-vmd10_1 | "s3_bucket": "sagemaker-us-west-2-123456789012",
algo-1-vmd10_1 | "rl.training.stop.training_iteration": 2,
algo-1-vmd10_1 | "rl.training.checkpoint_freq": 2
algo-1-vmd10_1 | },
algo-1-vmd10_1 | "input_config_dir": "/opt/ml/input/config",
algo-1-vmd10_1 | "input_data_config": {},
algo-1-vmd10_1 | "input_dir": "/opt/ml/input",
algo-1-vmd10_1 | "is_master": true,
algo-1-vmd10_1 | "job_name": "ArrivalSim-2019-09-30-07-53-33-200",
algo-1-vmd10_1 | "log_level": 20,
algo-1-vmd10_1 | "master_hostname": "algo-1-vmd10",
algo-1-vmd10_1 | "model_dir": "/opt/ml/model",
algo-1-vmd10_1 | "module_dir": "s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz",
algo-1-vmd10_1 | "module_name": "mod_op_train",
algo-1-vmd10_1 | "network_interface_name": "eth0",
algo-1-vmd10_1 | "num_cpus": 2,
algo-1-vmd10_1 | "num_gpus": 0,
algo-1-vmd10_1 | "output_data_dir": "/opt/ml/output/data",
algo-1-vmd10_1 | "output_dir": "/opt/ml/output",
algo-1-vmd10_1 | "output_intermediate_dir": "/opt/ml/output/intermediate",
algo-1-vmd10_1 | "resource_config": {
algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
algo-1-vmd10_1 | "hosts": [
algo-1-vmd10_1 | "algo-1-vmd10"
algo-1-vmd10_1 | ]
algo-1-vmd10_1 | },
algo-1-vmd10_1 | "user_entry_point": "mod_op_train.py"
algo-1-vmd10_1 | }
algo-1-vmd10_1 |
algo-1-vmd10_1 | Environment variables:
algo-1-vmd10_1 |
algo-1-vmd10_1 | SM_HOSTS=["algo-1-vmd10"]
algo-1-vmd10_1 | SM_NETWORK_INTERFACE_NAME=eth0
algo-1-vmd10_1 | SM_HPS={"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"}
algo-1-vmd10_1 | SM_USER_ENTRY_POINT=mod_op_train.py
algo-1-vmd10_1 | SM_FRAMEWORK_PARAMS={"sagemaker_estimator":"RLEstimator"}
algo-1-vmd10_1 | SM_RESOURCE_CONFIG={"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]}
algo-1-vmd10_1 | SM_INPUT_DATA_CONFIG={}
algo-1-vmd10_1 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data
algo-1-vmd10_1 | SM_CHANNELS=[]
algo-1-vmd10_1 | SM_CURRENT_HOST=algo-1-vmd10
algo-1-vmd10_1 | SM_MODULE_NAME=mod_op_train
algo-1-vmd10_1 | SM_LOG_LEVEL=20
algo-1-vmd10_1 | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main
algo-1-vmd10_1 | SM_INPUT_DIR=/opt/ml/input
algo-1-vmd10_1 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config
algo-1-vmd10_1 | SM_OUTPUT_DIR=/opt/ml/output
algo-1-vmd10_1 | SM_NUM_CPUS=2
algo-1-vmd10_1 | SM_NUM_GPUS=0
algo-1-vmd10_1 | SM_MODEL_DIR=/opt/ml/model
algo-1-vmd10_1 | SM_MODULE_DIR=s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz
algo-1-vmd10_1 | SM_TRAINING_ENV={"additional_framework_parameters":{"sagemaker_estimator":"RLEstimator"},"channel_input_dirs":{},"current_host":"algo-1-vmd10","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-vmd10"],"hyperparameters":{"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"ArrivalSim-2019-09-30-07-53-33-200","log_level":20,"master_hostname":"algo-1-vmd10","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz","module_name":"mod_op_train","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]},"user_entry_point":"mod_op_train.py"}
algo-1-vmd10_1 | SM_USER_ARGS=["--rl.training.checkpoint_freq","2","--rl.training.stop.training_iteration","2","--s3_bucket","sagemaker-us-west-2-123456789012"]
algo-1-vmd10_1 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
algo-1-vmd10_1 | SM_HP_S3_BUCKET=sagemaker-us-west-2-123456789012
algo-1-vmd10_1 | SM_HP_RL.TRAINING.STOP.TRAINING_ITERATION=2
algo-1-vmd10_1 | SM_HP_RL.TRAINING.CHECKPOINT_FREQ=2
algo-1-vmd10_1 | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python36.zip:/usr/lib/python3.6:/usr/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/dist-packages:/usr/lib/python3/dist-packages
algo-1-vmd10_1 |
algo-1-vmd10_1 | Invoking script with the following command:
algo-1-vmd10_1 |
algo-1-vmd10_1 | /usr/bin/python mod_op_train.py --rl.training.checkpoint_freq 2 --rl.training.stop.training_iteration 2 --s3_bucket sagemaker-us-west-2-123456789012
algo-1-vmd10_1 |
algo-1-vmd10_1 |
algo-1-vmd10_1 | {'monitor': False, 'log_level': 'INFO', 'callbacks': {'on_episode_start': None, 'on_episode_step': None, 'on_episode_end': None, 'on_sample_end': None, 'on_train_result': None}, 'ignore_worker_failures': False, 'model': {'conv_filters': None, 'conv_activation': 'relu', 'fcnet_activation': 'tanh', 'fcnet_hiddens': [256, 256], 'free_log_std': False, 'squash_to_range': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_preprocessor': None, 'custom_model': None, 'custom_options': {}}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'env_config': {}, 'env': None, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'num_workers': 2, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'num_envs_per_worker': 1, 'sample_batch_size': 200, 'train_batch_size': 4000, 'batch_mode': 'truncate_episodes', 'sample_async': False, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_evaluator_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'async_remote_worker_envs': False, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policy_graphs': {}, 'policy_mapping_fn': None, 'policies_to_train': None}, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 
128, 'num_sgd_iter': 30, 'lr': 5e-05, 'lr_schedule': None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': False, 'straggler_mitigation': False}
algo-1-vmd10_1 | 2019-09-30 07:54:30,715 WARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.
algo-1-vmd10_1 | 2019-09-30 07:54:30,716 INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-30_07-54-30_51/logs.
algo-1-vmd10_1 | 2019-09-30 07:54:30,823 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:45224 to respond...
algo-1-vmd10_1 | 2019-09-30 07:54:30,934 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:39871 to respond...
algo-1-vmd10_1 | 2019-09-30 07:54:30,936 INFO services.py:760 -- Starting Redis shard with 0.83 GB max memory.
algo-1-vmd10_1 | 2019-09-30 07:54:30,951 WARNING services.py:1261 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
algo-1-vmd10_1 | 2019-09-30 07:54:30,951 INFO services.py:1384 -- Starting the Plasma object store with 1.24 GB memory using /tmp.
algo-1-vmd10_1 | Running experiment with config {
algo-1-vmd10_1 | "training": {
algo-1-vmd10_1 | "env": "ArrivalSim-v0",
algo-1-vmd10_1 | "run": "PPO",
algo-1-vmd10_1 | "stop": {
algo-1-vmd10_1 | "training_iteration": 2
algo-1-vmd10_1 | },
algo-1-vmd10_1 | "local_dir": "/opt/ml/output/intermediate",
algo-1-vmd10_1 | "checkpoint_freq": 10,
algo-1-vmd10_1 | "config": {
algo-1-vmd10_1 | "num_workers": 1,
algo-1-vmd10_1 | "train_batch_size": 128,
algo-1-vmd10_1 | "sample_batch_size": 32,
algo-1-vmd10_1 | "optimizer": {
algo-1-vmd10_1 | "grads_per_step": 10
algo-1-vmd10_1 | }
algo-1-vmd10_1 | },
algo-1-vmd10_1 | "checkpoint_at_end": true
algo-1-vmd10_1 | }
algo-1-vmd10_1 | }
algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:64 -- Did not find checkpoint file in /opt/ml/output/intermediate/training.
algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:211 -- Starting a new experiment.
algo-1-vmd10_1 | == Status ==
algo-1-vmd10_1 | Using FIFO scheduling algorithm.
algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
algo-1-vmd10_1 |
algo-1-vmd10_1 | == Status ==
algo-1-vmd10_1 | Using FIFO scheduling algorithm.
algo-1-vmd10_1 | Resources requested: 2/3 CPUs, 0/0 GPUs
algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
algo-1-vmd10_1 | Number of trials: 1 ({'RUNNING': 1})
algo-1-vmd10_1 | RUNNING trials:
algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: RUNNING
algo-1-vmd10_1 |
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,765 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,776 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1 | (pid=72) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1 | (pid=72) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:40,860 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:44,007 INFO ppo.py:105 -- Important! Since 0.7.0, observation normalization is no longer enabled by default. To enable running-mean normalization, set 'observation_filter': 'MeanStdFilter'. You can ignore this message if your environment doesn't require observation normalization.
algo-1-vmd10_1 | (pid=97) 2019-09-30 07:54:48,310 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1 | (pid=97) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1 | (pid=97) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1 | (pid=97) [1.83663046] 0.2754945689088046 False {}
algo-1-vmd10_1 | (pid=97) price b = 1.8366304593920306
algo-1-vmd10_1 | (pid=97) price a = 1.5866304593920306
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1 | (pid=97) [1.58663046] 0.2538608735027249 False {}
algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1 | (pid=97) [1.83663046] 0.23876195972096398 False {}
algo-1-vmd10_1 | Result for PPO_ArrivalSim-v0_0:
algo-1-vmd10_1 | custom_metrics: {}
algo-1-vmd10_1 | date: 2019-09-30_07-54-52
algo-1-vmd10_1 | done: true
algo-1-vmd10_1 | episode_len_mean: 13.9375
algo-1-vmd10_1 | episode_reward_max: 15.623709832898886
algo-1-vmd10_1 | episode_reward_mean: 2.1431919362241683
algo-1-vmd10_1 | episode_reward_min: 0.0
algo-1-vmd10_1 | episodes_this_iter: 8
algo-1-vmd10_1 | episodes_total: 16
algo-1-vmd10_1 | experiment_id: e401acafb745453a93cd07c23a49719a
algo-1-vmd10_1 | hostname: 912222cf3d36
algo-1-vmd10_1 | info:
algo-1-vmd10_1 | default:
algo-1-vmd10_1 | cur_kl_coeff: 0.30000001192092896
algo-1-vmd10_1 | cur_lr: 4.999999873689376e-05
algo-1-vmd10_1 | entropy: 1.0779298543930054
algo-1-vmd10_1 | kl: 3.3599588871002197
algo-1-vmd10_1 | policy_loss: -0.00849771499633789
algo-1-vmd10_1 | total_loss: 9.052791595458984
algo-1-vmd10_1 | vf_explained_var: 0.06744426488876343
algo-1-vmd10_1 | vf_loss: 8.053301811218262
algo-1-vmd10_1 | grad_time_ms: 1050.643
algo-1-vmd10_1 | load_time_ms: 34.049
algo-1-vmd10_1 | num_steps_sampled: 256
algo-1-vmd10_1 | num_steps_trained: 256
algo-1-vmd10_1 | sample_time_ms: 2921.621
algo-1-vmd10_1 | update_time_ms: 214.194
algo-1-vmd10_1 | iterations_since_restore: 2
algo-1-vmd10_1 | node_ip: 172.18.0.2
algo-1-vmd10_1 | num_healthy_workers: 1
algo-1-vmd10_1 | num_metric_batches_dropped: 0
algo-1-vmd10_1 | off_policy_estimator: {}
algo-1-vmd10_1 | pid: 72
algo-1-vmd10_1 | policy_reward_mean: {}
algo-1-vmd10_1 | time_since_restore: 8.488733053207397
algo-1-vmd10_1 | time_this_iter_s: 1.0875024795532227
algo-1-vmd10_1 | time_total_s: 8.488733053207397
algo-1-vmd10_1 | timestamp: 1569830092
algo-1-vmd10_1 | timesteps_since_restore: 256
algo-1-vmd10_1 | timesteps_this_iter: 128
algo-1-vmd10_1 | timesteps_total: 256
algo-1-vmd10_1 | training_iteration: 2
algo-1-vmd10_1 |
algo-1-vmd10_1 | 2019-09-30 07:54:52,557 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
algo-1-vmd10_1 | == Status ==
algo-1-vmd10_1 | Using FIFO scheduling algorithm.
algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-vmd10_1 | Memory usage on this node: 2.0/4.1 GB
algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
algo-1-vmd10_1 | Number of trials: 1 ({'TERMINATED': 1})
algo-1-vmd10_1 | TERMINATED trials:
algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: TERMINATED, [2 CPUs, 0 GPUs], [pid=72], 8 s, 2 iter, 256 ts, 2.14 rew
algo-1-vmd10_1 |
algo-1-vmd10_1 | Saved model configuration.
algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2 as /opt/ml/model/checkpoint
algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2.tune_metadata as /opt/ml/model/checkpoint.tune_metadata
algo-1-vmd10_1 | 2019-09-30 07:54:57,605 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-vmd10_1 | 2019-09-30 07:54:57,607 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1 | /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1 | "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1 | 2019-09-30 07:54:58,769 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
algo-1-vmd10_1 | (pid=73) 2019-09-30 07:54:59,065 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1 | (pid=73) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1 | (pid=73) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1 | Saved TensorFlow serving model!
algo-1-vmd10_1 | 2019-09-30 07:55:03,775 sagemaker-containers INFO Reporting training SUCCESS
tmp8z_8haic_algo-1-vmd10_1 exited with code 0
Aborting on container exit...
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
28 try:
---> 29 fsrc = open(src, 'rb')
30 except OSError as e:
PermissionError: [Errno 13] Permission denied: '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log'
During handling of the above exception, another exception occurred:
DistutilsFileError Traceback (most recent call last)
<ipython-input-5-abacdc7913fc> in <module>()
34 )
35
---> 36 estimator.fit()
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
337 self._prepare_for_training(job_name=job_name)
338
--> 339 self.latest_training_job = _TrainingJob.start_new(self, inputs)
340 if wait:
341 self.latest_training_job.wait(logs=logs)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs)
856 cls._add_spot_checkpoint_args(local_mode, estimator, train_args)
857
--> 858 estimator.sagemaker_session.train(**train_args)
859
860 return cls(estimator.sagemaker_session, estimator._current_job_name)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)
390 LOGGER.info("Creating training-job with name: %s", job_name)
391 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 392 self.sagemaker_client.create_training_job(**train_request)
393
394 def compile_model(
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
99 training_job = _LocalTrainingJob(container)
100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
--> 101 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
102
103 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
87
88 self.model_artifacts = self.container.train(
---> 89 input_data_config, output_data_config, hyperparameters, job_name
90 )
91 self.end_time = datetime.datetime.now()
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
153 raise RuntimeError(msg)
154 finally:
--> 155 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
156
157 # free up the training data directory as it may contain
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in retrieve_artifacts(self, compose_data, output_data_config, job_name)
253 sagemaker.local.utils.recursive_copy(host_dir, model_artifacts)
254 elif container_dir == "/opt/ml/output":
--> 255 sagemaker.local.utils.recursive_copy(host_dir, output_artifacts)
256
257 # Tar Artifacts -> model.tar.gz and output.tar.gz
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/utils.py in recursive_copy(source, destination)
82 """
83 if os.path.isdir(source):
---> 84 copy_tree(source, destination)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
157 copy_tree(src_name, dst_name, preserve_mode,
158 preserve_times, preserve_symlinks, update,
--> 159 verbose=verbose, dry_run=dry_run))
160 else:
161 copy_file(src_name, dst_name, preserve_mode,
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
157 copy_tree(src_name, dst_name, preserve_mode,
158 preserve_times, preserve_symlinks, update,
--> 159 verbose=verbose, dry_run=dry_run))
160 else:
161 copy_file(src_name, dst_name, preserve_mode,
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
157 copy_tree(src_name, dst_name, preserve_mode,
158 preserve_times, preserve_symlinks, update,
--> 159 verbose=verbose, dry_run=dry_run))
160 else:
161 copy_file(src_name, dst_name, preserve_mode,
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
161 copy_file(src_name, dst_name, preserve_mode,
162 preserve_times, update, verbose=verbose,
--> 163 dry_run=dry_run)
164 outputs.append(dst_name)
165
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in copy_file(src, dst, preserve_mode, preserve_times, update, link, verbose, dry_run)
149 # Otherwise (non-Mac, not linking), copy the file contents and
150 # (optionally) copy the times and mode.
--> 151 _copy_file_contents(src, dst)
152 if preserve_mode or preserve_times:
153 st = os.stat(src)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
29 fsrc = open(src, 'rb')
30 except OSError as e:
---> 31 raise DistutilsFileError("could not open '%s': %s" % (src, e.strerror))
32
33 if os.path.exists(dst):
DistutilsFileError: could not open '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log': Permission denied
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment