Last active
August 16, 2022 21:08
-
-
Save bveeramani/dfd0f2f4bbe8b2c916d6ba86ef5bc21d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch.nn as nn | |
from ray.air import session | |
from ray.air.config import ScalingConfig | |
from ray.train.torch import TorchCheckpoint, TorchTrainer | |
class Identity(nn.Module):
    """Minimal module whose forward pass returns its input unchanged.

    It defines no layers of its own; in this script it only serves as a
    model object to build checkpoints from.
    """

    def forward(self, x):
        """Return ``x`` exactly as received."""
        return x
def train_loop_per_worker(config):
    """Per-worker training function: report an empty result twice.

    No training is performed. Each of the two iterations builds a
    ``TorchCheckpoint`` from the same ``Identity`` model and passes it to
    ``session.report`` together with an empty metrics dict.

    Args:
        config: Accepted to match the trainer's calling convention; not
            read by this function.
    """
    model = Identity()
    for _ in range(2):
        # A new checkpoint object is created from the model on every
        # iteration, then reported with no metrics.
        checkpoint = TorchCheckpoint.from_model(model)
        session.report({}, checkpoint=checkpoint)
# Run the per-worker loop on two workers. The console output captured
# later in this file shows this call failing with
# "Some workers returned results while others didn't."
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(base) ray@ip-172-31-93-157:~/workspace-project-balajis-workspace$ python repro.py | |
2022-08-16 14:05:57,580 INFO worker.py:1202 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS | |
2022-08-16 14:05:57,979 INFO worker.py:1312 -- Connecting to existing Ray cluster at address: 172.31.93.157:9031... | |
2022-08-16 14:05:57,986 INFO worker.py:1481 -- Connected to Ray cluster. View the dashboard at https://session-ehpxszc4jtf9t3jbjcxd9put.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAKvVUXyTB-9h-4T2Pf376H468xJNwDRu8oq5CcD0jjsQAiA97ydzxmqKgBTmf3d9VGVxk5J3s3KCp6E5g_b3Dtfs7xJmEiB4D9Nm3eXnE8PlndI8IkL_2DcpzWctEH1WpYXv5s4y4BgCIgNuL2E6DAiPzqniBhDAyL79AkIMCI_LkZcGEMDIvv0C-gEeChxzZXNfRWhwWHN6YzRKdGY5VDNKYkpDWEQ5UFVU&redirect_to=dashboard. | |
2022-08-16 14:05:57,988 INFO packaging.py:349 -- Pushing file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip' (0.07MiB) to Ray cluster... | |
2022-08-16 14:05:57,989 INFO packaging.py:362 -- Successfully pushed file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip'. | |
== Status == | |
Current time: 2022-08-16 14:06:02 (running for 00:00:04.38) | |
Memory usage on this node: 7.9/30.9 GiB | |
Using FIFO scheduling algorithm. | |
Resources requested: 3.0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects | |
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58 | |
Number of trials: 1/1 (1 RUNNING) | |
+--------------------------+----------+---------------------+ | |
| Trial name | status | loc | | |
|--------------------------+----------+---------------------| | |
| TorchTrainer_3987f_00000 | RUNNING | 172.31.93.157:15919 | | |
+--------------------------+----------+---------------------+ | |
(RayTrainWorker pid=15963) 2022-08-16 14:06:05,245 INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2] | |
2022-08-16 14:06:05,367 ERROR trial_runner.py:980 -- Trial TorchTrainer_3987f_00000: Error processing event. | |
ray.exceptions.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train | |
result = self.step() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step | |
self._report_thread_runner_error(block=True) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error | |
raise e | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run | |
self._entrypoint() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint | |
return self._trainable_func( | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func | |
super()._trainable_func(self._merged_config, reporter, checkpoint_dir) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func | |
output = fn() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func | |
trainer.training_loop() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop | |
for results in training_iterator: | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__ | |
next_results = self._run_with_error_handling(self._fetch_next_result) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling | |
return func() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result | |
results = self._backend_executor.get_next_results() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results | |
raise RuntimeError( | |
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers. | |
Result for TorchTrainer_3987f_00000: | |
date: 2022-08-16_14-06-02 | |
experiment_id: c583eca1c39f4701b93769977d59afe5 | |
hostname: ip-172-31-93-157 | |
node_ip: 172.31.93.157 | |
pid: 15919 | |
timestamp: 1660683962 | |
trial_id: 3987f_00000 | |
== Status == | |
Current time: 2022-08-16 14:06:05 (running for 00:00:07.19) | |
Memory usage on this node: 8.5/30.9 GiB | |
Using FIFO scheduling algorithm. | |
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects | |
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58 | |
Number of trials: 1/1 (1 ERROR) | |
+--------------------------+----------+---------------------+ | |
| Trial name | status | loc | | |
|--------------------------+----------+---------------------| | |
| TorchTrainer_3987f_00000 | ERROR | 172.31.93.157:15919 | | |
+--------------------------+----------+---------------------+ | |
Number of errored trials: 1 | |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+ | |
| Trial name | # failures | error file | | |
|--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------| | |
| TorchTrainer_3987f_00000 | 1 | /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58/TorchTrainer_3987f_00000_0_2022-08-16_14-05-59/error.txt | | |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+ | |
== Status == | |
Current time: 2022-08-16 14:06:05 (running for 00:00:07.19) | |
Memory usage on this node: 8.5/30.9 GiB | |
Using FIFO scheduling algorithm. | |
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects | |
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58 | |
Number of trials: 1/1 (1 ERROR) | |
+--------------------------+----------+---------------------+ | |
| Trial name | status | loc | | |
|--------------------------+----------+---------------------| | |
| TorchTrainer_3987f_00000 | ERROR | 172.31.93.157:15919 | | |
+--------------------------+----------+---------------------+ | |
Number of errored trials: 1 | |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+ | |
| Trial name | # failures | error file | | |
|--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------| | |
| TorchTrainer_3987f_00000 | 1 | /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58/TorchTrainer_3987f_00000_0_2022-08-16_14-05-59/error.txt | | |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+ | |
(TorchTrainer pid=15919) 2022-08-16 14:06:05,300 ERROR function_trainable.py:298 -- Runner Thread raised error. | |
(TorchTrainer pid=15919) Traceback (most recent call last): | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run | |
(TorchTrainer pid=15919) self._entrypoint() | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint | |
(TorchTrainer pid=15919) return self._trainable_func( | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 466, in _resume_span | |
(TorchTrainer pid=15919) return method(self, *_args, **_kwargs) | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func | |
(TorchTrainer pid=15919) super()._trainable_func(self._merged_config, reporter, checkpoint_dir) | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func | |
(TorchTrainer pid=15919) output = fn() | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func | |
(TorchTrainer pid=15919) trainer.training_loop() | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop | |
(TorchTrainer pid=15919) for results in training_iterator: | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__ | |
(TorchTrainer pid=15919) next_results = self._run_with_error_handling(self._fetch_next_result) | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling | |
(TorchTrainer pid=15919) return func() | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result | |
(TorchTrainer pid=15919) results = self._backend_executor.get_next_results() | |
(TorchTrainer pid=15919) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results | |
(TorchTrainer pid=15919) raise RuntimeError( | |
(TorchTrainer pid=15919) RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers. | |
2022-08-16 14:06:05,479 ERROR tune.py:754 -- Trials did not complete: [TorchTrainer_3987f_00000] | |
2022-08-16 14:06:05,479 INFO tune.py:758 -- Total run time: 7.47 seconds (7.19 seconds for the tuning loop). | |
Traceback (most recent call last): | |
File "repro.py", line 24, in <module> | |
trainer.fit() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 349, in fit | |
raise result.error | |
types.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train | |
result = self.step() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step | |
self._report_thread_runner_error(block=True) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error | |
raise e | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run | |
self._entrypoint() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint | |
return self._trainable_func( | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func | |
super()._trainable_func(self._merged_config, reporter, checkpoint_dir) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func | |
output = fn() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func | |
trainer.training_loop() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop | |
for results in training_iterator: | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__ | |
next_results = self._run_with_error_handling(self._fetch_next_result) | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling | |
return func() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result | |
results = self._backend_executor.get_next_results() | |
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results | |
raise RuntimeError( | |
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment