import torch.nn as nn

from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer


class Identity(nn.Module):
    def forward(self, x):
        return x


def train_loop_per_worker(config):
    model = Identity()
    for _ in range(2):
        checkpoint = TorchCheckpoint.from_model(model)
        session.report({}, checkpoint=checkpoint)


trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()
(base) ray@ip-172-31-93-157:~/workspace-project-balajis-workspace$ python repro.py
2022-08-16 14:05:57,580 INFO worker.py:1202 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS
2022-08-16 14:05:57,979 INFO worker.py:1312 -- Connecting to existing Ray cluster at address: 172.31.93.157:9031...
2022-08-16 14:05:57,986 INFO worker.py:1481 -- Connected to Ray cluster. View the dashboard at https://session-ehpxszc4jtf9t3jbjcxd9put.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAKvVUXyTB-9h-4T2Pf376H468xJNwDRu8oq5CcD0jjsQAiA97ydzxmqKgBTmf3d9VGVxk5J3s3KCp6E5g_b3Dtfs7xJmEiB4D9Nm3eXnE8PlndI8IkL_2DcpzWctEH1WpYXv5s4y4BgCIgNuL2E6DAiPzqniBhDAyL79AkIMCI_LkZcGEMDIvv0C-gEeChxzZXNfRWhwWHN6YzRKdGY5VDNKYkpDWEQ5UFVU&redirect_to=dashboard.
2022-08-16 14:05:57,988 INFO packaging.py:349 -- Pushing file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip' (0.07MiB) to Ray cluster...
2022-08-16 14:05:57,989 INFO packaging.py:362 -- Successfully pushed file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip'.
== Status ==
Current time: 2022-08-16 14:06:02 (running for 00:00:04.38)
Memory usage on this node: 7.9/30.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 3.0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58
Number of trials: 1/1 (1 RUNNING)
+--------------------------+----------+---------------------+
| Trial name               | status   | loc                 |
|--------------------------+----------+---------------------|
| TorchTrainer_3987f_00000 | RUNNING  | 172.31.93.157:15919 |
+--------------------------+----------+---------------------+
(RayTrainWorker pid=15963) 2022-08-16 14:06:05,245 INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2]
2022-08-16 14:06:05,367 ERROR trial_runner.py:980 -- Trial TorchTrainer_3987f_00000: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
result = self.step()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
self._report_thread_runner_error(block=True)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
raise e
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
self._entrypoint()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
return self._trainable_func(
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
output = fn()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
trainer.training_loop()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
for results in training_iterator:
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
next_results = self._run_with_error_handling(self._fetch_next_result)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
return func()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
results = self._backend_executor.get_next_results()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
raise RuntimeError(
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
Result for TorchTrainer_3987f_00000:
  date: 2022-08-16_14-06-02
  experiment_id: c583eca1c39f4701b93769977d59afe5
  hostname: ip-172-31-93-157
  node_ip: 172.31.93.157
  pid: 15919
  timestamp: 1660683962
  trial_id: 3987f_00000
== Status ==
Current time: 2022-08-16 14:06:05 (running for 00:00:07.19)
Memory usage on this node: 8.5/30.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58
Number of trials: 1/1 (1 ERROR)
+--------------------------+----------+---------------------+
| Trial name               | status   | loc                 |
|--------------------------+----------+---------------------|
| TorchTrainer_3987f_00000 | ERROR    | 172.31.93.157:15919 |
+--------------------------+----------+---------------------+
Number of errored trials: 1
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------|
| TorchTrainer_3987f_00000 | 1 | /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58/TorchTrainer_3987f_00000_0_2022-08-16_14-05-59/error.txt |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+
(TorchTrainer pid=15919) 2022-08-16 14:06:05,300 ERROR function_trainable.py:298 -- Runner Thread raised error.
(TorchTrainer pid=15919) Traceback (most recent call last):
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
(TorchTrainer pid=15919)     self._entrypoint()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
(TorchTrainer pid=15919)     return self._trainable_func(
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 466, in _resume_span
(TorchTrainer pid=15919)     return method(self, *_args, **_kwargs)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
(TorchTrainer pid=15919)     super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
(TorchTrainer pid=15919)     output = fn()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
(TorchTrainer pid=15919)     trainer.training_loop()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
(TorchTrainer pid=15919)     for results in training_iterator:
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
(TorchTrainer pid=15919)     next_results = self._run_with_error_handling(self._fetch_next_result)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
(TorchTrainer pid=15919)     return func()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
(TorchTrainer pid=15919)     results = self._backend_executor.get_next_results()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
(TorchTrainer pid=15919)     raise RuntimeError(
(TorchTrainer pid=15919) RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
2022-08-16 14:06:05,479 ERROR tune.py:754 -- Trials did not complete: [TorchTrainer_3987f_00000]
2022-08-16 14:06:05,479 INFO tune.py:758 -- Total run time: 7.47 seconds (7.19 seconds for the tuning loop).
Traceback (most recent call last):
File "repro.py", line 24, in <module>
trainer.fit()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 349, in fit
raise result.error
types.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
result = self.step()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
self._report_thread_runner_error(block=True)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
raise e
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
self._entrypoint()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
return self._trainable_func(
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
output = fn()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
trainer.training_loop()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
for results in training_iterator:
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
next_results = self._run_with_error_handling(self._fetch_next_result)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
return func()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
results = self._backend_executor.get_next_results()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
raise RuntimeError(
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
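
For reference, below is a minimal sketch of the same training loop with the checkpoint left out, so each worker reports only a metrics dict. Both workers still call session.report() exactly twice, just as in repro.py; the assumption (not verified in the log above) is that this checkpoint-free variant does not trigger the "Some workers returned results while others didn't" error, which would point at checkpoint reporting rather than an unequal number of report() calls.

from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    for step in range(2):
        # Same number of session.report() calls on every worker as in
        # repro.py, but with no checkpoint attached (assumption: this
        # variant runs to completion without the mismatch error).
        session.report({"step": step})


trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()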