Created
December 5, 2017 19:30
-
-
Save aodhan-domhnaill/0bd29247790fc7cfbbdd353a0d3c7e3b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-py2-cpu-{some date and time} | |
..................................................... | |
executing startup script (first run) | |
2017-12-05 18:42:47,306 INFO - root - running container entrypoint | |
2017-12-05 18:42:47,307 INFO - root - starting train task | |
2017-12-05 18:42:47,310 INFO - container_support.environment - starting metrics service | |
2017-12-05T18:42:47Z E! cloudwatch: Error in ListMetrics API call : AccessDenied: User: arn:aws:sts::338197679289:assumed-role/AmazonSageMaker-ExecutionRole-20171205T101659/SageMaker is not authorized to perform: cloudwatch:ListMetrics | |
#011status code: 403, request id: 16bf6256-d9ec-11e7-b9c8-2965f5af2d22 | |
2017-12-05T18:42:47Z E! Failed to connect to output cloudwatch, retrying in 15s, error was 'AccessDenied: User: arn:aws:sts::338197679289:assumed-role/AmazonSageMaker-ExecutionRole-20171205T101659/SageMaker is not authorized to perform: cloudwatch:ListMetrics | |
#011status code: 403, request id: 16bf6256-d9ec-11e7-b9c8-2965f5af2d22' | |
2017-12-05 18:42:48,674 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'enable_cloudwatch_metrics': True, 'available_gpus': 0, 'channels': {u'training': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}}, '_ps_verbose': 0, 'resource_config': {u'current_host': u'algo-1', u'hosts': [u'algo-1']}, 'user_script_name': u'mnist.py', 'input_config_dir': '/opt/ml/input/config', 'channel_dirs': {u'training': u'/opt/ml/input/data/training'}, 'code_dir': '/opt/ml/code', 'output_data_dir': '/opt/ml/output/data/', 'output_dir': '/opt/ml/output', 'model_dir': '/opt/ml/model', 'hyperparameters': {u'sagemaker_program': u'mnist.py', u'learning_rate': 0.1, u'batch_size': 100, u'epochs': 20, u'log_interval': 100, u'sagemaker_region': u'us-east-2', u'sagemaker_enable_cloudwatch_metrics': True, u'sagemaker_job_name': u'sagemaker-mxnet-py2-cpu-2017-12-05-18-38-29-317', u'sagemaker_container_log_level': 20, u'momentum': 0.9, u'sagemaker_submit_directory': u's3://sagemaker-us-east-2-338197679289/sagemaker-mxnet-py2-cpu-2017-12-05-18-38-29-317/source/sourcedir.tar.gz'}, 'hosts': [u'algo-1'], '_ps_port': 8000, 'user_script_archive': u's3://sagemaker-us-east-2-338197679289/sagemaker-mxnet-py2-cpu-2017-12-05-18-38-29-317/source/sourcedir.tar.gz', 'sagemaker_region': u'us-east-2', 'input_dir': '/opt/ml/input', 'current_host': u'algo-1', 'container_log_level': 20, 'available_cpus': 4, 'base_dir': '/opt/ml'} | |
Downloading s3://sagemaker-us-east-2-338197679289/sagemaker-mxnet-py2-cpu-2017-12-05-18-38-29-317/source/sourcedir.tar.gz to /tmp/script.tar.gz | |
2017-12-05 18:42:48,773 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTP connection (1): 169.254.170.2 | |
2017-12-05 18:42:48,879 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): s3.amazonaws.com | |
2017-12-05 18:42:48,944 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): s3.us-east-2.amazonaws.com | |
[18:42:49] /tmp/mxnet/dmlc-core/include/dmlc/./logging.h:308: [18:42:49] src/postoffice.cc:16: Check notnull: Environment::Get()->find("DMLC_NUM_WORKER") | |
Stack trace returned 10 entries: | |
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f2f78b0f31c] | |
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps10PostofficeC1Ev+0x1dfe) [0x7f2f7b0a0cce] | |
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8CustomerC1EiRKSt8functionIFvRKNS_7MessageEEE+0x8f3) [0x7f2f7b099473] | |
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet7KVStore6CreateEPKc+0x5d0) [0x7f2f7b02cce0] | |
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(MXKVStoreCreate+0x9) [0x7f2f7afd16a9] | |
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f2f88960e40] | |
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7f2f889608ab] | |
[bt] (7) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(_ctypes_callproc+0x48f) [0x7f2f88b703df] | |
[bt] (8) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(+0x11d82) [0x7f2f88b74d82] | |
[bt] (9) python2(PyObject_Call+0x43) [0x4b0c93] | |
2017-12-05 18:42:49,449 ERROR - root - uncaught exception: [18:42:49] src/postoffice.cc:16: Check notnull: Environment::Get()->find("DMLC_NUM_WORKER") | |
Stack trace returned 10 entries: | |
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f2f78b0f31c] | |
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps10PostofficeC1Ev+0x1dfe) [0x7f2f7b0a0cce] | |
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8CustomerC1EiRKSt8functionIFvRKNS_7MessageEEE+0x8f3) [0x7f2f7b099473] | |
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet7KVStore6CreateEPKc+0x5d0) [0x7f2f7b02cce0] | |
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(MXKVStoreCreate+0x9) [0x7f2f7afd16a9] | |
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f2f88960e40] | |
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7f2f889608ab] | |
[bt] (7) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(_ctypes_callproc+0x48f) [0x7f2f88b703df] | |
[bt] (8) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(+0x11d82) [0x7f2f88b74d82] | |
[bt] (9) python2(PyObject_Call+0x43) [0x4b0c93] | |
Traceback (most recent call last): | |
File "/opt/amazon/bin/entry.py", line 32, in <module> | |
modes[mode]() | |
File "/opt/amazon/lib/python2.7/site-packages/container_support/training.py", line 21, in start | |
raise e | |
MXNetError: [18:42:49] src/postoffice.cc:16: Check notnull: Environment::Get()->find("DMLC_NUM_WORKER") | |
Stack trace returned 10 entries: | |
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f2f78b0f31c] | |
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps10PostofficeC1Ev+0x1dfe) [0x7f2f7b0a0cce] | |
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8CustomerC1EiRKSt8functionIFvRKNS_7MessageEEE+0x8f3) [0x7f2f7b099473] | |
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet7KVStore6CreateEPKc+0x5d0) [0x7f2f7b02cce0] | |
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.1-py2.7.egg/mxnet/libmxnet.so(MXKVStoreCreate+0x9) [0x7f2f7afd16a9] | |
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f2f88960e40] | |
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7f2f889608ab] | |
[bt] (7) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(_ctypes_callproc+0x48f) [0x7f2f88b703df] | |
[bt] (8) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(+0x11d82) [0x7f2f88b74d82] | |
[bt] (9) python2(PyObject_Call+0x43) [0x4b0c93] | |
ValueError Traceback (most recent call last) | |
<ipython-input-6-0cbbd7aabf3f> in <module>() | |
----> 1 m.fit(inputs) | |
/home/ec2-user/anaconda3/envs/mxnet_p27/lib/python2.7/site-packages/sagemaker/estimator.pyc in fit(self, inputs, wait, logs, job_name) | |
515 self._hyperparameters[JOB_NAME_PARAM_NAME] = self._current_job_name | |
516 self._hyperparameters[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_session.region_name | |
--> 517 super(Framework, self).fit(inputs, wait, logs, self._current_job_name) | |
518 | |
519 def hyperparameters(self): | |
/home/ec2-user/anaconda3/envs/mxnet_p27/lib/python2.7/site-packages/sagemaker/estimator.pyc in fit(self, inputs, wait, logs, job_name) | |
152 self.latest_training_job = _TrainingJob.start_new(self, inputs) | |
153 if wait: | |
--> 154 self.latest_training_job.wait(logs=logs) | |
155 else: | |
156 raise NotImplemented('Asynchronous fit not available') | |
/home/ec2-user/anaconda3/envs/mxnet_p27/lib/python2.7/site-packages/sagemaker/estimator.pyc in wait(self, logs) | |
321 def wait(self, logs=True): | |
322 if logs: | |
--> 323 self.sagemaker_session.logs_for_job(self.job_name, wait=True) | |
324 else: | |
325 self.sagemaker_session.wait_for_job(self.job_name) | |
/home/ec2-user/anaconda3/envs/mxnet_p27/lib/python2.7/site-packages/sagemaker/session.pyc in logs_for_job(self, job_name, wait, poll) | |
670 | |
671 if wait: | |
--> 672 self._check_job_status(job_name, description) | |
673 if dot: | |
674 print() | |
/home/ec2-user/anaconda3/envs/mxnet_p27/lib/python2.7/site-packages/sagemaker/session.pyc in _check_job_status(self, job, desc) | |
415 if status != 'Completed': | |
416 reason = desc.get('FailureReason', '(No reason provided)') | |
--> 417 raise ValueError('Error training {}: {} Reason: {}'.format(job, status, reason)) | |
418 | |
419 def wait_for_endpoint(self, endpoint, poll=5): | |
ValueError: Error training sagemaker-mxnet-py2-cpu-{some date and time}: Failed Reason: AlgorithmError: uncaught exception during training: [18:42:49] src/postoffice.cc:16: Check notnull: Environment::Get()->find("DMLC_NUM_WORKER") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment