import torch.nn as nn

from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer


class Identity(nn.Module):
    def forward(self, x):
        return x


def train_loop_per_worker(config):
    model = Identity()
    for _ in range(2):
        checkpoint = TorchCheckpoint.from_model(model)
        session.report({}, checkpoint=checkpoint)


trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()
(base) ray@ip-172-31-93-157:~/workspace-project-balajis-workspace$ python repro.py
2022-08-16 14:05:57,580 INFO worker.py:1202 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS
2022-08-16 14:05:57,979 INFO worker.py:1312 -- Connecting to existing Ray cluster at address: 172.31.93.157:9031...
2022-08-16 14:05:57,986 INFO worker.py:1481 -- Connected to Ray cluster. View the dashboard at https://session-ehpxszc4jtf9t3jbjcxd9put.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAKvVUXyTB-9h-4T2Pf376H468xJNwDRu8oq5CcD0jjsQAiA97ydzxmqKgBTmf3d9VGVxk5J3s3KCp6E5g_b3Dtfs7xJmEiB4D9Nm3eXnE8PlndI8IkL_2DcpzWctEH1WpYXv5s4y4BgCIgNuL2E6DAiPzqniBhDAyL79AkIMCI_LkZcGEMDIvv0C-gEeChxzZXNfRWhwWHN6YzRKdGY5VDNKYkpDWEQ5UFVU&redirect_to=dashboard.
2022-08-16 14:05:57,988 INFO packaging.py:349 -- Pushing file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip' (0.07MiB) to Ray cluster...
2022-08-16 14:05:57,989 INFO packaging.py:362 -- Successfully pushed file package 'gcs://_ray_pkg_5ab000279cd853ef3d4df1ef2b77abbc.zip'.
== Status ==
Current time: 2022-08-16 14:06:02 (running for 00:00:04.38)
Memory usage on this node: 7.9/30.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 3.0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58
Number of trials: 1/1 (1 RUNNING)
+--------------------------+----------+---------------------+
| Trial name               | status   | loc                 |
|--------------------------+----------+---------------------|
| TorchTrainer_3987f_00000 | RUNNING  | 172.31.93.157:15919 |
+--------------------------+----------+---------------------+
(RayTrainWorker pid=15963) 2022-08-16 14:06:05,245 INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2]
2022-08-16 14:06:05,367 ERROR trial_runner.py:980 -- Trial TorchTrainer_3987f_00000: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
result = self.step()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
self._report_thread_runner_error(block=True)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
raise e
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
self._entrypoint()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
return self._trainable_func(
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
output = fn()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
trainer.training_loop()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
for results in training_iterator:
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
next_results = self._run_with_error_handling(self._fetch_next_result)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
return func()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
results = self._backend_executor.get_next_results()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
raise RuntimeError(
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
Result for TorchTrainer_3987f_00000:
  date: 2022-08-16_14-06-02
  experiment_id: c583eca1c39f4701b93769977d59afe5
  hostname: ip-172-31-93-157
  node_ip: 172.31.93.157
  pid: 15919
  timestamp: 1660683962
  trial_id: 3987f_00000
== Status ==
Current time: 2022-08-16 14:06:05 (running for 00:00:07.19)
Memory usage on this node: 8.5/30.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/18.2 GiB heap, 0.0/9.1 GiB objects
Result logdir: /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58
Number of trials: 1/1 (1 ERROR)
+--------------------------+----------+---------------------+
| Trial name               | status   | loc                 |
|--------------------------+----------+---------------------|
| TorchTrainer_3987f_00000 | ERROR    | 172.31.93.157:15919 |
+--------------------------+----------+---------------------+
Number of errored trials: 1
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------|
| TorchTrainer_3987f_00000 | 1 | /home/ray/ray_results/TorchTrainer_2022-08-16_14-05-58/TorchTrainer_3987f_00000_0_2022-08-16_14-05-59/error.txt |
+--------------------------+--------------+-----------------------------------------------------------------------------------------------------------------+
(TorchTrainer pid=15919) 2022-08-16 14:06:05,300 ERROR function_trainable.py:298 -- Runner Thread raised error.
(TorchTrainer pid=15919) Traceback (most recent call last):
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
(TorchTrainer pid=15919)     self._entrypoint()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
(TorchTrainer pid=15919)     return self._trainable_func(
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 466, in _resume_span
(TorchTrainer pid=15919)     return method(self, *_args, **_kwargs)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
(TorchTrainer pid=15919)     super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
(TorchTrainer pid=15919)     output = fn()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
(TorchTrainer pid=15919)     trainer.training_loop()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
(TorchTrainer pid=15919)     for results in training_iterator:
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
(TorchTrainer pid=15919)     next_results = self._run_with_error_handling(self._fetch_next_result)
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
(TorchTrainer pid=15919)     return func()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
(TorchTrainer pid=15919)     results = self._backend_executor.get_next_results()
(TorchTrainer pid=15919)   File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
(TorchTrainer pid=15919)     raise RuntimeError(
(TorchTrainer pid=15919) RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
2022-08-16 14:06:05,479 ERROR tune.py:754 -- Trials did not complete: [TorchTrainer_3987f_00000]
2022-08-16 14:06:05,479 INFO tune.py:758 -- Total run time: 7.47 seconds (7.19 seconds for the tuning loop).
Traceback (most recent call last):
File "repro.py", line 24, in <module>
trainer.fit()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 349, in fit
raise result.error
types.RayTaskError(RuntimeError): ray::TrainTrainable.train() (pid=15919, ip=172.31.93.157, repr=TorchTrainer)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
result = self.step()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
self._report_thread_runner_error(block=True)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
raise e
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
self._entrypoint()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
return self._trainable_func(
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
output = fn()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
trainer.training_loop()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py", line 358, in training_loop
for results in training_iterator:
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 758, in __next__
next_results = self._run_with_error_handling(self._fetch_next_result)
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 722, in _run_with_error_handling
return func()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py", line 784, in _fetch_next_result
results = self._backend_executor.get_next_results()
File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py", line 395, in get_next_results
raise RuntimeError(
RuntimeError: Some workers returned results while others didn't. Make sure that `session.report()` (legacy API:`train.report()` and `train.save_checkpoint()`) are called the same number of times on all workers.
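
For reference, below is a minimal sketch of the same training loop with the checkpoint left out, so each worker reports only a metrics dict. Both workers still call session.report() exactly twice, just as in repro.py; the assumption (not verified in the log above) is that this checkpoint-free variant does not trigger the "Some workers returned results while others didn't" error, which would point at checkpoint reporting rather than an unequal number of report() calls.

from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    for step in range(2):
        # Same number of session.report() calls on every worker as in
        # repro.py, but with no checkpoint attached (assumption: this
        # variant runs to completion without the mismatch error).
        session.report({"step": step})


trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()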