A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff78790823e72098e6057528dc01000000 Worker ID: 8683ec6d11263c22dd0d9b2e020e9dc0a21892e1351e41c864d93940 Node ID: 0c1a86314b69f902942ec5c0678a43307476ec8862da5d23e9e634eb Worker IP address: 172.20.90.62 Worker port: 40229 Worker PID: 1032972 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
(RayTrainWorker pid=1032972) Loading cached queryfile '/tmp/.cache/reve-dataloader/training-ready-laion5B_datacompxl_1024px_aesthetics_with_geminiv2.db'. [repeated 7x across cluster]
(TorchTrainer pid=1027792) Worker 5 has failed.
2024-08-26 11:36:58,674 ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_0c933_00000
Traceback (most recent call last):
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_private/worker.py", line 2659, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ActorDiedError): ray::_Inner.train() (pid=1027792, ip=172.20.90.62, actor_id=f39e7730803f3be97ffb934e01000000, repr=TorchTrainer)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 331, in train
    raise skipped from exception_cause(skipped)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/air/_internal/util.py", line 98, in run
    self._ret = self._target(*self._args, **self._kwargs)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 45, in <lambda>
    training_func=lambda: self._trainable_func(self.config),
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/base_trainer.py", line 799, in _trainable_func
    super()._trainable_func(self._merged_config)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 250, in _trainable_func
    output = fn()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/base_trainer.py", line 107, in _train_coordinator_fn
    trainer.training_loop()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/data_parallel_trainer.py", line 471, in training_loop
    self._run_training(training_iterator)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/data_parallel_trainer.py", line 370, in _run_training
    for training_results in training_iterator:
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 124, in __next__
    next_results = self._run_with_error_handling(self._fetch_next_result)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 89, in _run_with_error_handling
    return func()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 156, in _fetch_next_result
    results = self._backend_executor.get_next_results()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 600, in get_next_results
    results = self.get_with_failure_handling(futures)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 700, in get_with_failure_handling
    self._increment_failures()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 762, in _increment_failures
    raise failure
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/utils.py", line 53, in check_for_failure
    ray.get(object_ref)
ray.exceptions.ActorDiedError: The actor died unexpectedly before finishing this task.
    class_name: RayTrainWorker
    actor_id: 78790823e72098e6057528dc01000000
    pid: 1032972
    namespace: f15d5eed-a121-49df-bf65-b47a100b7aed
    ip: 172.20.90.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
Training errored after 0 iterations at 2024-08-26 11:36:58. Total running time: 36min 42s
Error file: /tmp/ray/session_2024-08-26_11-00-13_263422_1021734/artifacts/2024-08-26_11-00-16/20240826_110015_gpt_debug_run/driver_artifacts/TorchTrainer_0c933_00000_0_2024-08-26_11-00-16/error.txt
2024-08-26 11:36:58,683 INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/tmp/runs/20240826_110015_gpt_debug_run' in 0.0026s.
2024-08-26 11:36:58,683 ERROR tune.py:1037 -- Trials did not complete: [TorchTrainer_0c933_00000]
Error executing job with overrides: []
ray.exceptions.RayTaskError(ActorDiedError): ray::_Inner.train() (pid=1027792, ip=172.20.90.62, actor_id=f39e7730803f3be97ffb934e01000000, repr=TorchTrainer)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 331, in train
    raise skipped from exception_cause(skipped)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/air/_internal/util.py", line 98, in run
    self._ret = self._target(*self._args, **self._kwargs)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 45, in <lambda>
    training_func=lambda: self._trainable_func(self.config),
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/base_trainer.py", line 799, in _trainable_func
    super()._trainable_func(self._merged_config)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 250, in _trainable_func
    output = fn()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/base_trainer.py", line 107, in _train_coordinator_fn
    trainer.training_loop()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/data_parallel_trainer.py", line 471, in training_loop
    self._run_training(training_iterator)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/data_parallel_trainer.py", line 370, in _run_training
    for training_results in training_iterator:
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 124, in __next__
    next_results = self._run_with_error_handling(self._fetch_next_result)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 89, in _run_with_error_handling
    return func()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/trainer.py", line 156, in _fetch_next_result
    results = self._backend_executor.get_next_results()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 600, in get_next_results
    results = self.get_with_failure_handling(futures)
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 700, in get_with_failure_handling
    self._increment_failures()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/backend_executor.py", line 762, in _increment_failures
    raise failure
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/_internal/utils.py", line 53, in check_for_failure
    ray.get(object_ref)
ray.exceptions.ActorDiedError: The actor died unexpectedly before finishing this task.
    class_name: RayTrainWorker
    actor_id: 78790823e72098e6057528dc01000000
    pid: 1032972
    namespace: f15d5eed-a121-49df-bf65-b47a100b7aed
    ip: 172.20.90.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/mnt/home/project/reve-ml/train.py", line 136, in main
    trainer.fit()
  File "/mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/train/base_trainer.py", line 638, in fit
    raise TrainingFailedError(
ray.train.base_trainer.TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("/tmp/runs/20240826_110015_gpt_debug_run")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
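
For reference, a minimal sketch of the two recovery paths the TrainingFailedError message above suggests, assuming Ray 2.x and the run directory quoted in the log; the training function, run name, worker count, and retry count are hypothetical placeholders, not taken from this run:

from ray.train import FailureConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    # Hypothetical stand-in for the real per-worker training loop used in this run.
    ...

# Option 1: resume the failed run from its experiment directory (path from the log above).
restored = TorchTrainer.restore(
    "/tmp/runs/20240826_110015_gpt_debug_run",
    train_loop_per_worker=train_loop_per_worker,
)
result = restored.fit()

# Option 2: start a fresh run that retries automatically when a worker dies.
trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=8, use_gpu=True),  # assumed 8 GPU workers
    run_config=RunConfig(
        name="gpt_debug_run_retry",                    # hypothetical run name
        failure_config=FailureConfig(max_failures=3),  # or -1 for unlimited retries
    ),
)
result = trainer.fit()

Note that the run above errored after 0 reported iterations, so restoring will in practice re-run training from the start unless checkpoints were saved outside of ray.train.report.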
(RayTrainWorker pid=1032971) [rank4]:[E826 11:36:56.552462838 ProcessGroupNCCL.cpp:607] [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14339, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=1800000) ran for 1800044 milliseconds before timing out. [repeated 6x across cluster]
(RayTrainWorker pid=1032971) [rank4]:[E826 11:36:56.552683587 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 4] Exception (either an error or timeout) detected by watchdog at work: 14339, last enqueued NCCL work: 14339, last completed NCCL work: 14338. [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [rank0]:[E826 11:36:57.547795353 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 0] Timeout at NCCL work: 14339, last enqueued NCCL work: 14340, last completed NCCL work: 14338. [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [rank0]:[E826 11:36:57.547841010 ProcessGroupNCCL.cpp:621] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [rank0]:[E826 11:36:57.547849006 ProcessGroupNCCL.cpp:627] [Rank 0] To avoid data inconsistency, we are taking the entire process down. [repeated 7x across cluster]
(RayTrainWorker pid=1032968) [rank1]:[E826 11:36:57.281886122 ProcessGroupNCCL.cpp:1515] [PG 0 (default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14339, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=1800000) ran for 1800027 milliseconds before timing out. [repeated 6x across cluster]
(RayTrainWorker pid=1032967) Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:609 (most recent call first): [repeated 14x across cluster]
(RayTrainWorker pid=1032967) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f4eec546f86 in /mnt/home/.conda/envs/remote/lib/python3.10/site-packages/torch/lib/libc10.so) [repeated 21x across cluster]
(RayTrainWorker pid=1032967) frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f4dbfd7b8d2 in /mnt/home/.conda/envs/remote/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) [repeated 14x across cluster]
(RayTrainWorker pid=1032967) frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4dbfd846fc in /mnt/home/.conda/envs/remote/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) [repeated 28x across cluster]
(RayTrainWorker pid=1032967) frame #3: <unknown function> + 0x8609 (0x7f7e77f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) [repeated 49x across cluster]
(RayTrainWorker pid=1032967) frame #4: clone + 0x43 (0x7f7e77d3e353 in /lib/x86_64-linux-gnu/libc.so.6) [repeated 21x across cluster]
(RayTrainWorker pid=1032968) [2024-08-26 11:36:57,525 E 1032968 1033879] logging.cc:108: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG 0 (default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14339, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=1800000) ran for 1800027 milliseconds before timing out. [repeated 6x across cluster]
(RayTrainWorker pid=1032967) Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1521 (most recent call first): [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [2024-08-26 11:36:57,804 E 1032967 1033904] logging.cc:115: Stack trace: [repeated 7x across cluster]
(RayTrainWorker pid=1032967) /mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_raylet.so(+0x10b96aa) [0x7f7e76fae6aa] ray::operator<<() [repeated 7x across cluster]
(RayTrainWorker pid=1032967) /mnt/home/.conda/envs/remote/lib/python3.10/site-packages/ray/_raylet.so(+0x10bc932) [0x7f7e76fb1932] ray::TerminateHandler() [repeated 7x across cluster]
(RayTrainWorker pid=1032967) /mnt/home/.conda/envs/remote/bin/../lib/libstdc++.so.6(+0xdbbf4) [0x7f7e75da7bf4] execute_native_thread_routine [repeated 7x across cluster]
(RayTrainWorker pid=1032967) /lib/x86_64-linux-gnu/libpthread.so.0(+0x8609) [0x7f7e77f73609] start_thread [repeated 7x across cluster]
(RayTrainWorker pid=1032967) /lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7f7e77d3e353] __clone [repeated 7x across cluster]
(RayTrainWorker pid=1032967) *** SIGABRT received at time=1724697417 on cpu 91 *** [repeated 7x across cluster]
(RayTrainWorker pid=1032967) PC: @ 0x7f7e77c6200b (unknown) raise [repeated 7x across cluster]
(RayTrainWorker pid=1032967) @ 0x7f7e75d7d35a (unknown) __cxxabiv1::__terminate() [repeated 14x across cluster]
(RayTrainWorker pid=1032967) @ 0x7f7e75d7d070 (unknown) (unknown) [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [2024-08-26 11:36:57,805 E 1032967 1033904] logging.cc:440: *** SIGABRT received at time=1724697417 on cpu 91 *** [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [2024-08-26 11:36:57,805 E 1032967 1033904] logging.cc:440: PC: @ 0x7f7e77c6200b (unknown) raise [repeated 7x across cluster]
(RayTrainWorker pid=1032967) [2024-08-26 11:36:57,805 E 1032967 1033904] logging.cc:440: @ 0x7f7e75d7d35a (unknown) __cxxabiv1::__terminate() [repeated 14x across cluster]
(RayTrainWorker pid=1032967) [2024-08-26 11:36:57,805 E 1032967 1033904] logging.cc:440: @ 0x7f7e75d7d070 (unknown) (unknown) [repeated 7x across cluster]
(RayTrainWorker pid=1032967) Fatal Python error: Aborted [repeated 7x across cluster]
(RayTrainWorker pid=1032967) Extension modules: msgpack._cmsgpack, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, setproctitle, yaml._yaml, charset_normalizer.md, requests.packages.charset_normalizer.md, requests.packages.chardet.md, uvloop.loop, ray._raylet, lz4._version, lz4.frame._frame, numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pyarrow.lib, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, pyarrow._fs, pyarrow._azurefs, pyarrow._hdfs, pyarrow._gcsfs, pyarrow._s3fs, pyarrow._parquet, pyarrow._json, torch._C, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, grpc._cython.cygrpc, google_crc32c._crc32c, PIL._imaging, PIL._imagingft, multidict._multidict, yarl._quoting_c, aiohttp._helpers, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket, frozenlist._frozenlist, xxhash._xxhash, sentencepiece._sentencepiece, regex._regex, scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg.cython_blas, scipy.linalg._matfuncs_expm, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering (total: 123) [repeated 7x across cluster]
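
The NCCL watchdog output above shows the immediate trigger: an ALLREDUCE (SeqNum=14339) exceeded the default 30-minute collective timeout (Timeout(ms)=1800000), the watchdog aborted the process with SIGABRT, and Ray then reported the worker's SYSTEM_ERROR exit. If the slow collective is expected rather than a genuine hang (for example, one rank stalled on I/O), one possible mitigation is raising the process-group timeout through Ray Train's TorchConfig; a hedged sketch, with the 2-hour value and worker count chosen only for illustration:

from ray.train import ScalingConfig
from ray.train.torch import TorchConfig, TorchTrainer

def train_loop_per_worker(config):
    # Hypothetical per-worker training loop, as in the sketch above.
    ...

# Raise the NCCL process-group timeout from the default 1800 s (the 1800000 ms
# seen in the watchdog message) to 2 hours; the value here is illustrative.
torch_config = TorchConfig(backend="nccl", timeout_s=7200)

trainer = TorchTrainer(
    train_loop_per_worker,
    torch_config=torch_config,
    scaling_config=ScalingConfig(num_workers=8, use_gpu=True),
)
result = trainer.fit()

If the timeout instead masks a dead peer (e.g. one rank OOM-killed, as the exit message itself lists among the possible root causes), a longer timeout only delays the failure; the dead worker's own logs and the node's kernel log are the place to confirm which rank died first.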