megatron fails 1
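Context for the log below: every failing rank follows the same path, pretrain() -> initialize_megatron() -> _compile_dependencies() -> torch.distributed.barrier(); the barrier's NCCL communicator setup then dies because store->get('0') against the TCPStore on nid006803:25678 returns 0 bytes, which per the log itself "may indicate a possible application crash on rank 0 or a network set up issue." A minimal sketch like the one below, launched with the same srun/torchrun setup as pretrain_gpt.py, exercises only this rendezvous + barrier path and can help separate a cluster/rendezvous problem from a Megatron problem. The script name, timeout, and prints are illustrative, not taken from the log.

```python
# barrier_probe.py (illustrative name): minimal repro sketch for the failure below.
# Launch with the same srun/torchrun command used for pretrain_gpt.py so that
# MASTER_ADDR/MASTER_PORT point at the same TCPStore host (nid006803:25678 here).
import datetime
import os

import torch
import torch.distributed as dist


def main() -> None:
    local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
    torch.cuda.set_device(local_rank)

    # env:// rendezvous: torchrun provides RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT.
    # The timeout value is illustrative, not Megatron's default.
    dist.init_process_group(backend="nccl", timeout=datetime.timedelta(minutes=5))

    rank = dist.get_rank()
    print(f"[rank {rank}] process group initialized, entering barrier", flush=True)

    # This is the call that raises DistBackendError in _compile_dependencies():
    # the first collective lazily creates the default NCCL communicator and fetches
    # ncclUniqueId from rank 0 via the c10d TCPStore, which is where store->get('0')
    # fails in the traces below.
    dist.barrier()

    print(f"[rank {rank}] barrier passed", flush=True)
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

If this sketch fails with the same "failed to recv, got 0 bytes" error, the problem is in the rendezvous/TCPStore setup (rank 0's process or the nid006803 host) rather than in Megatron's _compile_dependencies().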
1: [default0]:[rank4]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/training.py", line 682, in pretrain | |
1: [default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400093543ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
3: [default0]:[rank12]: frame #17: <unknown function> + 0x5a4eff4 (0x40003fd7eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #17: <unknown function> + 0x5a4eff4 (0x40000c19eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:frame #5: c10d::ProcessGroupNCCL::heartbeatMonitor() + 0x298 (0x40003cc2fad8 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: ^^^^^^^^^^^^^^^^^^^^^ | |
3: [default3]:[rank15]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40001863335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default1]:[rank9]: frame #31: <unknown function> + 0x284c4 (0x4000390a84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default0]:[rank4]: initialize_megatron( | |
3: [default1]:[rank13]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
2: [default3]:[rank11]: frame #18: <unknown function> + 0x5a4fc94 (0x40000c19fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 160, in initialize_megatron | |
3: [default0]:[rank12]: frame #18: <unknown function> + 0x5a4fc94 (0x40003fd7fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4657, in barrier | |
3: [default2]:[rank14]: work = group.barrier(opts=opts) | |
2: [default0]:[rank8]: torch.distributed.barrier() | |
1: [default2]:frame #1: <unknown function> + 0x5a96a40 (0x400042096a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: _compile_dependencies() | |
1: [default0]:[rank4]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 221, in _compile_dependencies | |
1: [default1]:frame #6: <unknown function> + 0xe1ae0 (0x4000a4041ae0 in /usr/lib/aarch64-linux-gnu/libstdc++.so.6) | |
3: [default0]:[rank12]: frame #19: <unknown function> + 0xf5ac84 (0x40003a1dac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: [default0]:[rank8]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
2: [default3]:[rank11]: frame #19: <unknown function> + 0xf5ac84 (0x4000065fac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default0]:[rank4]: torch.distributed.barrier() | |
3: [default3]:[rank15]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40001b75c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: ^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: [default1]:[rank9]: frame #32: __libc_start_main + 0x98 (0x4000390a8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default1]:frame #7: <unknown function> + 0x8595c (0x40003250595c in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:frame #2: <unknown function> + 0x5a9ac24 (0x40004209ac24 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:frame #8: <unknown function> + 0xeba4c (0x40003256ba4c in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default2]:[rank14]: torch.distributed.DistBackendError: [14] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: failed to recv, got 0 bytes | |
2: [default0]:[rank8]: return func(*args, **kwargs) | |
1: [default0]:[rank4]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
3: [default3]:[rank15]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x40001b7789ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
2: [default3]:[rank11]: frame #20: <unknown function> + 0x595360 (0x400005c35360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default1]: | |
3: [default2]:[rank14]: Exception raised from recvBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): | |
3: [default0]:[rank12]: frame #20: <unknown function> + 0x595360 (0x400039815360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: [default1]:[rank9]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default2]:frame #3: <unknown function> + 0x5a9bbe4 (0x40004209bbe4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: return func(*args, **kwargs) | |
1: [default1]:[rank5]:[W522 12:08:21.400270632 ProcessGroupNCCL.cpp:1660] [PG ID 0 PG GUID 0(default_pg) Rank 5] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: failed to recv, got 0 bytes | |
1: [default0]:[rank4]: ^^^^^^^^^^^^^^^^^^^^^ | |
3: [default3]:[rank15]: frame #11: <unknown function> + 0x10bd9d0 (0x40001b77d9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default3]:[rank15]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x40001b77e284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default1]:[rank13]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
3: [default0]:[rank12]: frame #21: /usr/bin/python() [0x5036b4] | |
2: [default1]:[rank9]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default2]:frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x1d4 (0x40004209ccc4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default0]:[rank12]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
2: [default0]:[rank8]: ^^^^^^^^^^^^^^^^^^^^^ | |
2: [default0]:[rank8]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4657, in barrier | |
2: [default0]:[rank8]: work = group.barrier(opts=opts) | |
2: [default0]:[rank8]: ^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: [default1]:[rank5]: Traceback (most recent call last): | |
3: [default1]:[rank13]: frame #25: /usr/bin/python() [0x59c3c4] | |
2: [default3]:[rank11]: frame #21: /usr/bin/python() [0x5036b4] | |
1: [default0]:[rank4]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4657, in barrier | |
1: [default2]:frame #5: c10d::ProcessGroupNCCL::heartbeatMonitor() + 0x298 (0x40004517fad8 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default3]:[rank15]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x40001b78c534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default0]:[rank12]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
3: [default1]:[rank13]: frame #26: /usr/bin/python() [0x680e94] | |
2: [default0]:[rank8]: torch.distributed.DistBackendError: [8] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: failed to recv, got 0 bytes | |
2: [default0]:[rank8]: Exception raised from recvBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): | |
1: [default2]:frame #6: <unknown function> + 0xe1ae0 (0x4000ac591ae0 in /usr/lib/aarch64-linux-gnu/libstdc++.so.6) | |
1: [default0]:[rank4]: work = group.barrier(opts=opts) | |
3: [default3]:[rank15]: frame #14: <unknown function> + 0x5a339a4 (0x4000186239a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
1: [default0]:[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: [default2]:[rank14]: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400060633ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
3: [default1]:[rank13]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400091643ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
1: [default1]:[rank5]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/pretrain_gpt.py", line 337, in <module> | |
3: [default3]:[rank15]: frame #15: <unknown function> + 0x5a40674 (0x400018630674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
1: [default2]:frame #7: <unknown function> + 0x8595c (0x40003aa5595c in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default0]:[rank4]: torch.distributed.DistBackendError: [4] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: failed to recv, got 0 bytes | |
3: [default0]:[rank12]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
3: [default0]:[rank12]: frame #25: /usr/bin/python() [0x59c3c4] | |
2: [default0]:[rank8]: frame #1: <unknown function> + 0x5a96a40 (0x400040196a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: pretrain( | |
1: [default2]:frame #8: <unknown function> + 0xeba4c (0x40003aabba4c in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default1]:[rank13]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
2: [default3]:[rank11]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: [default0]:[rank4]: Exception raised from recvBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): | |
3: [default0]:[rank12]: frame #26: /usr/bin/python() [0x680e94] | |
2: [default0]:[rank8]: frame #2: <unknown function> + 0x5a9ae14 (0x40004019ae14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/training.py", line 682, in pretrain | |
1: [default1]:[rank5]: initialize_megatron( | |
1: [default2]: | |
3: [default0]:[rank12]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
2: [default3]:[rank11]: frame #25: /usr/bin/python() [0x59c3c4] | |
2: [default0]:[rank8]: frame #3: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x26c (0x40004019d38c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #26: /usr/bin/python() [0x680e94] | |
2: [default0]:[rank8]: frame #4: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004019d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: [default2]:[rank6]:[W522 12:08:21.400116300 ProcessGroupNCCL.cpp:1660] [PG ID 0 PG GUID 0(default_pg) Rank 6] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: failed to recv, got 0 bytes | |
1: [default0]:[rank4]: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400085c73ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
1: [default1]:[rank5]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 160, in initialize_megatron | |
1: [default1]:[rank5]: _compile_dependencies() | |
1: [default2]:[rank6]: Traceback (most recent call last): | |
1: [default1]:[rank5]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 221, in _compile_dependencies | |
3: [default2]:[rank14]: frame #1: <unknown function> + 0x5a96a40 (0x40000f186a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default1]:[rank13]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
2: [default0]:[rank8]: frame #5: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004019eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #1: <unknown function> + 0x5a96a40 (0x4000347c6a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/pretrain_gpt.py", line 337, in <module> | |
3: [default1]:[rank13]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: [default3]:[rank11]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: [default1]:[rank5]: torch.distributed.barrier() | |
1: [default2]:[rank6]: pretrain( | |
1: [default0]:[rank4]: frame #2: <unknown function> + 0x5a9ae14 (0x4000347cae14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #16: <unknown function> + 0x51b3408 (0x400017da3408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #2: <unknown function> + 0x5a9ae14 (0x40000f18ae14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default1]:[rank13]: frame #31: <unknown function> + 0x284c4 (0x40002ec984c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: [default0]:[rank8]: frame #6: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004014335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
1: [default0]:[rank4]: frame #3: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x26c (0x4000347cd38c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #17: <unknown function> + 0x5a4eff4 (0x40001863eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
2: [default0]:[rank8]: frame #7: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004014335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004014335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #31: <unknown function> + 0x284c4 (0x400004b484c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default1]:[rank5]: return func(*args, **kwargs) | |
3: [default2]:[rank14]: frame #3: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x26c (0x40000f18d38c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default1]:[rank13]: frame #32: __libc_start_main + 0x98 (0x40002ec98598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default1]:[rank13]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
3: [default0]:[rank12]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40004326c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
2: [default0]:[rank8]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x4000432889ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default2]:[rank6]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/training.py", line 682, in pretrain | |
3: [default0]:[rank12]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
2: [default0]:[rank8]: frame #11: <unknown function> + 0x10bd9d0 (0x40004328d9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
2: [default3]:[rank11]: frame #32: __libc_start_main + 0x98 (0x400004b48598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default0]:[rank4]: frame #4: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x4000347cd7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: initialize_megatron( | |
1: [default1]:[rank5]: ^^^^^^^^^^^^^^^^^^^^^ | |
3: [default0]:[rank12]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x40004328e284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #5: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x4000347ceedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #18: <unknown function> + 0x5a4fc94 (0x40001863fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default3]:[rank11]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default2]:[rank6]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 160, in initialize_megatron | |
1: [default1]:[rank5]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4657, in barrier | |
1: [default2]:[rank6]: _compile_dependencies() | |
3: [default3]:[rank15]: frame #19: <unknown function> + 0xf5ac84 (0x400012a9ac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: [default1]:[rank13]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
2: [default3]:[rank11]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
2: [default0]:[rank8]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x40004329c534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: work = group.barrier(opts=opts) | |
3: [default3]:[rank15]: frame #20: <unknown function> + 0x595360 (0x4000120d5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: [default3]:[rank15]: frame #21: /usr/bin/python() [0x5036b4] | |
2: [default0]:[rank8]: frame #14: <unknown function> + 0x5a339a4 (0x4000401339a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default0]:[rank8]: frame #15: <unknown function> + 0x5a40674 (0x400040140674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default0]:[rank8]: frame #16: <unknown function> + 0x51b3408 (0x40003f8b3408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: File "/iopsstor/scratch/cscs/haojunzhao/code/Megatron-LM/megatron/training/initialize.py", line 221, in _compile_dependencies | |
3: [default2]:[rank14]: frame #4: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40000f18d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default0]:[rank12]: frame #31: <unknown function> + 0x284c4 (0x4000387284c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: [default0]:[rank8]: frame #17: <unknown function> + 0x5a4eff4 (0x40004014eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default0]:[rank8]: frame #18: <unknown function> + 0x5a4fc94 (0x40004014fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #6: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40003477335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #7: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40003477335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: [default2]:[rank6]: torch.distributed.barrier() | |
3: [default3]:[rank15]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #19: <unknown function> + 0xf5ac84 (0x40003a5aac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: [default0]:[rank8]: frame #20: <unknown function> + 0x595360 (0x400039be5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: [default0]:[rank8]: frame #21: /usr/bin/python() [0x5036b4] | |
2: [default0]:[rank8]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #25: /usr/bin/python() [0x59c3c4] | |
2: [default0]:[rank8]: frame #26: /usr/bin/python() [0x680e94] | |
2: [default0]:[rank8]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: [default1]:[rank5]: torch.distributed.DistBackendError: [5] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: failed to recv, got 0 bytes | |
1: [default0]:[rank4]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40003477335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #5: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40000f18eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: [default0]:[rank8]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #31: <unknown function> + 0x284c4 (0x400038af84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default1]:[rank5]: Exception raised from recvBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): | |
1: [default2]:[rank6]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
3: [default3]:[rank15]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
3: [default3]:[rank15]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
2: [default0]:[rank8]: frame #32: __libc_start_main + 0x98 (0x400038af8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: [default0]:[rank8]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
2: [default0]:[rank8]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default0]:[rank4]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40003789c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default0]:[rank12]: frame #32: __libc_start_main + 0x98 (0x400038728598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default0]:[rank4]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x4000378b89ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default0]:[rank12]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
3: [default3]:[rank15]: frame #25: /usr/bin/python() [0x59c3c4] | |
1: [default1]:[rank5]: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x40008aff3ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
1: [default0]:[rank4]: frame #11: <unknown function> + 0x10bd9d0 (0x4000378bd9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #1: <unknown function> + 0x5a96a40 (0x400039b46a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default0]:[rank12]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
3: [default2]:[rank14]: frame #6: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40000f13335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #2: <unknown function> + 0x5a9ae14 (0x400039b4ae14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: return func(*args, **kwargs) | |
1: [default0]:[rank4]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x4000378be284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: frame #7: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40000f13335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #3: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x26c (0x400039b4d38c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #26: /usr/bin/python() [0x680e94] | |
1: [default2]:[rank6]: ^^^^^^^^^^^^^^^^^^^^^ | |
3: [default2]:[rank14]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40000f13335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x4000378cc534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #4: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x400039b4d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4657, in barrier | |
1: [default0]:[rank4]: frame #14: <unknown function> + 0x5a339a4 (0x4000347639a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40001225c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x4000122789ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default3]:[rank15]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #5: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x400039b4eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #15: <unknown function> + 0x5a40674 (0x400034770674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #11: <unknown function> + 0x10bd9d0 (0x40001227d9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x40001227e284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #6: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x400039af335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: work = group.barrier(opts=opts) | |
3: [default2]:[rank14]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x40001228c534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
3: [default2]:[rank14]: frame #14: <unknown function> + 0x5a339a4 (0x40000f1239a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #16: <unknown function> + 0x51b3408 (0x400033ee3408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #7: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x400039af335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #15: <unknown function> + 0x5a40674 (0x40000f130674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: [default2]:[rank14]: frame #16: <unknown function> + 0x51b3408 (0x40000e8a3408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x400039af335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: [default0]:[rank4]: frame #17: <unknown function> + 0x5a4eff4 (0x40003477eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: torch.distributed.DistBackendError: [6] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: failed to recv, got 0 bytes | |
1: [default0]:[rank4]: frame #18: <unknown function> + 0x5a4fc94 (0x40003477fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40003cc1c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #19: <unknown function> + 0xf5ac84 (0x40002ebdac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: [default2]:[rank14]: frame #17: <unknown function> + 0x5a4eff4 (0x40000f13eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #18: <unknown function> + 0x5a4fc94 (0x40000f13fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: frame #31: <unknown function> + 0x284c4 (0x400010fe84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: Exception raised from recvBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): | |
3: [default2]:[rank14]: frame #19: <unknown function> + 0xf5ac84 (0x40000959ac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default1]:[rank5]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x40003cc389ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #20: <unknown function> + 0x595360 (0x40002e215360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: [default2]:[rank14]: frame #20: <unknown function> + 0x595360 (0x400008bd5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default1]:[rank5]: frame #11: <unknown function> + 0x10bd9d0 (0x40003cc3d9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #21: /usr/bin/python() [0x5036b4] | |
3: [default3]:[rank15]: frame #32: __libc_start_main + 0x98 (0x400010fe8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default2]:[rank14]: frame #21: /usr/bin/python() [0x5036b4] | |
1: [default2]:[rank6]: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400093543ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
3: [default3]:[rank15]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x40003cc3e284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #1: <unknown function> + 0x5a96a40 (0x400042096a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x40003cc4c534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default0]:[rank4]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #2: <unknown function> + 0x5a9ae14 (0x40004209ae14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default3]:[rank15]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default0]:[rank4]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #3: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x26c (0x40004209d38c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: [default2]:[rank14]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #25: /usr/bin/python() [0x59c3c4] | |
3: [default2]:[rank14]: frame #26: /usr/bin/python() [0x680e94] | |
3: [default2]:[rank14]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
3: [default2]:[rank14]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
3: [default2]:[rank14]: frame #31: <unknown function> + 0x284c4 (0x400007ae84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default2]:[rank14]: frame #32: __libc_start_main + 0x98 (0x400007ae8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: [default2]:[rank14]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
3: [default2]:[rank14]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default0]:[rank4]: frame #25: /usr/bin/python() [0x59c3c4] | |
1: [default1]:[rank5]: frame #14: <unknown function> + 0x5a339a4 (0x400039ae39a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #4: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004209d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #26: /usr/bin/python() [0x680e94] | |
1: [default1]:[rank5]: frame #15: <unknown function> + 0x5a40674 (0x400039af0674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #5: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004209eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #16: <unknown function> + 0x51b3408 (0x400039263408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #6: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004204335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #7: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004204335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default0]:[rank4]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #8: c10d::PrefixStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3c (0x40004204335c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #17: <unknown function> + 0x5a4eff4 (0x400039afeff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #18: <unknown function> + 0x5a4fc94 (0x400039affc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int) + 0x14c (0x40004516c54c in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default2]:[rank6]: frame #10: c10d::ProcessGroupNCCL::initNCCLComm(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::Device&, c10d::OpType, int, bool) + 0x122c (0x4000451889ac in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #19: <unknown function> + 0xf5ac84 (0x400033f5ac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default0]:[rank4]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: [default2]:[rank6]: frame #11: <unknown function> + 0x10bd9d0 (0x40004518d9d0 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #20: <unknown function> + 0x595360 (0x400033595360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default0]:[rank4]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #12: c10d::ProcessGroupNCCL::allreduce_impl(at::Tensor&, char const*, c10d::AllreduceOptions const&) + 0xf4 (0x40004518e284 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #21: /usr/bin/python() [0x5036b4] | |
1: [default0]:[rank4]: frame #31: <unknown function> + 0x284c4 (0x40002d1284c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: frame #13: c10d::ProcessGroupNCCL::barrier(c10d::BarrierOptions const&) + 0x464 (0x40004519c534 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
1: [default1]:[rank5]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
1: [default0]:[rank4]: frame #32: __libc_start_main + 0x98 (0x40002d128598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default1]:[rank5]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #14: <unknown function> + 0x5a339a4 (0x4000420339a4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: [default0]:[rank4]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #15: <unknown function> + 0x5a40674 (0x400042040674 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #25: /usr/bin/python() [0x59c3c4] | |
1: [default0]:[rank4]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default1]:[rank5]: frame #26: /usr/bin/python() [0x680e94] | |
1: [default2]:[rank6]: frame #16: <unknown function> + 0x51b3408 (0x4000417b3408 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #17: <unknown function> + 0x5a4eff4 (0x40004204eff4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default1]:[rank5]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #18: <unknown function> + 0x5a4fc94 (0x40004204fc94 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: [default2]:[rank6]: frame #19: <unknown function> + 0xf5ac84 (0x40003c4aac84 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default1]:[rank5]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: [default2]:[rank6]: frame #20: <unknown function> + 0x595360 (0x40003bae5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: [default2]:[rank6]: frame #21: /usr/bin/python() [0x5036b4] | |
1: [default2]:[rank6]: frame #22: _PyObject_MakeTpCall + 0x130 (0x4c2d50 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #23: _PyEval_EvalFrameDefault + 0x8a0 (0x563eb4 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #31: <unknown function> + 0x284c4 (0x4000324a84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: frame #24: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: [default1]:[rank5]: frame #32: __libc_start_main + 0x98 (0x4000324a8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: frame #25: /usr/bin/python() [0x59c3c4] | |
1: [default2]:[rank6]: frame #26: /usr/bin/python() [0x680e94] | |
1: [default1]:[rank5]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #27: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: [default1]:[rank5]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
1: [default2]:[rank6]: frame #28: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #29: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: [default2]:[rank6]: frame #30: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
1: [default2]:[rank6]: frame #31: <unknown function> + 0x284c4 (0x40003a9f84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: frame #32: __libc_start_main + 0x98 (0x40003a9f8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: [default2]:[rank6]: frame #33: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: [default2]:[rank6]: . This may indicate a possible application crash on rank 0 or a network set up issue. | |
2: W0522 12:08:21.603000 126833 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 127193 closing signal SIGTERM | |
1: W0522 12:08:21.604000 167121 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 167476 closing signal SIGTERM | |
1: W0522 12:08:21.605000 167121 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 167477 closing signal SIGTERM | |
1: W0522 12:08:21.607000 167121 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 167478 closing signal SIGTERM | |
2: W0522 12:08:21.607000 126833 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 127194 closing signal SIGTERM | |
2: W0522 12:08:21.608000 126833 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 127195 closing signal SIGTERM | |
2: W0522 12:08:21.611000 126833 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 127196 closing signal SIGTERM | |
1: W0522 12:08:21.611000 167121 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 167479 closing signal SIGTERM | |
1: [W522 12:08:22.089305155 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[nid006825]:37524, remote=[nid006803]:25678): Broken pipe | |
1: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
1: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400091843ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
1: frame #1: <unknown function> + 0x5a96a40 (0x400040396a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #2: <unknown function> + 0x5a99acc (0x400040399acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #3: <unknown function> + 0x5a9bfb4 (0x40004039bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40004039d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004039d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004039eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #7: <unknown function> + 0xf5ec14 (0x40003a7aec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: frame #8: <unknown function> + 0x595360 (0x400039de5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: frame #9: /usr/bin/python() [0x5036b4] | |
1: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
1: frame #11: /usr/bin/python() [0x4c6f6c] | |
1: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
1: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
1: frame #14: /usr/bin/python() [0x528ba0] | |
1: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
1: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
1: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: frame #18: /usr/bin/python() [0x59c3c4] | |
1: frame #19: /usr/bin/python() [0x680e94] | |
1: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
1: frame #24: <unknown function> + 0x284c4 (0x400038bf84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: frame #25: __libc_start_main + 0x98 (0x400038bf8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: | |
2: [W522 12:08:22.369348807 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[nid006826-hsn2]:58966, remote=[nid006803]:25678): Broken pipe | |
2: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
2: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400091d03ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
2: frame #1: <unknown function> + 0x5a96a40 (0x400040856a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #2: <unknown function> + 0x5a99acc (0x400040859acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #3: <unknown function> + 0x5a9bfb4 (0x40004085bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40004085d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004085d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004085eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #7: <unknown function> + 0xf5ec14 (0x40003ac6ec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: frame #8: <unknown function> + 0x595360 (0x40003a2a5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: frame #9: /usr/bin/python() [0x5036b4] | |
2: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
2: frame #11: /usr/bin/python() [0x4c6f6c] | |
2: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
2: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
2: frame #14: /usr/bin/python() [0x528ba0] | |
2: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
2: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
2: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
2: frame #18: /usr/bin/python() [0x59c3c4] | |
2: frame #19: /usr/bin/python() [0x680e94] | |
2: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
2: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
2: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
2: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: frame #24: <unknown function> + 0x284c4 (0x4000390b84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: frame #25: __libc_start_main + 0x98 (0x4000390b8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
2: | |
1: W0522 12:08:22.092000 167121 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006825_167121_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
2: W0522 12:08:22.093000 126833 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006826_126833_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
1: [W522 12:08:22.100731295 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[nid006825]:37524, remote=[nid006803]:25678): Broken pipe | |
1: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
1: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400091843ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
1: frame #1: <unknown function> + 0x5a96a40 (0x400040396a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #2: <unknown function> + 0x5a99acc (0x400040399acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #3: <unknown function> + 0x5a9bfb4 (0x40004039bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40004039d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004039d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004039eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
1: frame #7: <unknown function> + 0xf5ec14 (0x40003a7aec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: frame #8: <unknown function> + 0x595360 (0x400039de5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
1: frame #9: /usr/bin/python() [0x5036b4] | |
1: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
1: frame #11: /usr/bin/python() [0x4c6f6c] | |
1: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
1: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
1: frame #14: /usr/bin/python() [0x528ba0] | |
1: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
1: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
1: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
1: frame #18: /usr/bin/python() [0x59c3c4] | |
1: frame #19: /usr/bin/python() [0x680e94] | |
1: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
1: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
1: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
1: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
1: frame #24: <unknown function> + 0x284c4 (0x400038bf84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: frame #25: __libc_start_main + 0x98 (0x400038bf8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
1: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
1: | |
1: W0522 12:08:22.103000 167121 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006825_167121_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
1: Traceback (most recent call last): | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 117, in _call_store | |
1: return getattr(self._store, store_op)(*args, **kwargs) | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: torch.distributed.DistNetworkError: failed to recv, got 0 bytes | |
1: | |
1: The above exception was the direct cause of the following exception: | |
1: | |
1: Traceback (most recent call last): | |
1: File "/usr/local/bin/torchrun", line 33, in <module> | |
1: sys.exit(load_entry_point('torch==2.7.0a0+79aa17489c.nv25.4', 'console_scripts', 'torchrun')()) | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper | |
2: [W522 12:08:22.381328179 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[nid006826-hsn2]:58966, remote=[nid006803]:25678): Broken pipe | |
2: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
2: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400091d03ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
2: frame #1: <unknown function> + 0x5a96a40 (0x400040856a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #2: <unknown function> + 0x5a99acc (0x400040859acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #3: <unknown function> + 0x5a9bfb4 (0x40004085bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40004085d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40004085d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40004085eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
2: frame #7: <unknown function> + 0xf5ec14 (0x40003ac6ec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: frame #8: <unknown function> + 0x595360 (0x40003a2a5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
2: frame #9: /usr/bin/python() [0x5036b4] | |
2: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
2: frame #11: /usr/bin/python() [0x4c6f6c] | |
2: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
2: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
2: frame #14: /usr/bin/python() [0x528ba0] | |
2: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
2: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
2: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
2: frame #18: /usr/bin/python() [0x59c3c4] | |
2: frame #19: /usr/bin/python() [0x680e94] | |
2: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
2: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
2: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
2: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
2: frame #24: <unknown function> + 0x284c4 (0x4000390b84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: frame #25: __libc_start_main + 0x98 (0x4000390b8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
2: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
2: | |
1: return f(*args, **kwargs) | |
1: ^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 892, in main | |
1: run(args) | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 883, in run | |
1: elastic_launch( | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__ | |
2: W0522 12:08:22.104000 126833 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006826_126833_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
2: Traceback (most recent call last): | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 117, in _call_store | |
1: return launch_agent(self._config, self._entrypoint, list(args)) | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent | |
2: return getattr(self._store, store_op)(*args, **kwargs) | |
1: result = agent.run() | |
1: ^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: torch.distributed.DistNetworkError: failed to recv, got 0 bytes | |
2: | |
2: The above exception was the direct cause of the following exception: | |
2: | |
2: Traceback (most recent call last): | |
2: File "/usr/local/bin/torchrun", line 33, in <module> | |
1: result = f(*args, **kwargs) | |
1: ^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run | |
2: sys.exit(load_entry_point('torch==2.7.0a0+79aa17489c.nv25.4', 'console_scripts', 'torchrun')()) | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper | |
1: result = self._invoke_run(role) | |
1: ^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _invoke_run | |
2: return f(*args, **kwargs) | |
2: ^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 892, in main | |
1: num_nodes_waiting = rdzv_handler.num_nodes_waiting() | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1263, in num_nodes_waiting | |
1: self._state_holder.sync() | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 437, in sync | |
2: run(args) | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 883, in run | |
1: get_response = self._backend.get_state() | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 75, in get_state | |
1: base64_state: bytes = self._call_store("get", self._key) | |
2: elastic_launch( | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__ | |
1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
1: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 119, in _call_store | |
1: raise RendezvousConnectionError( | |
1: torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. | |
2: return launch_agent(self._config, self._entrypoint, list(args)) | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent | |
2: result = agent.run() | |
2: ^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper | |
2: result = f(*args, **kwargs) | |
2: ^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run | |
2: result = self._invoke_run(role) | |
2: ^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _invoke_run | |
2: num_nodes_waiting = rdzv_handler.num_nodes_waiting() | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1263, in num_nodes_waiting | |
2: self._state_holder.sync() | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 437, in sync | |
2: get_response = self._backend.get_state() | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 75, in get_state | |
2: base64_state: bytes = self._call_store("get", self._key) | |
2: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
2: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 119, in _call_store | |
2: raise RendezvousConnectionError( | |
2: torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. | |
3: W0522 12:08:22.304000 221982 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 222328 closing signal SIGTERM | |
3: W0522 12:08:22.305000 221982 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 222329 closing signal SIGTERM | |
3: W0522 12:08:22.308000 221982 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 222330 closing signal SIGTERM | |
3: W0522 12:08:22.310000 221982 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 222331 closing signal SIGTERM | |
srun: error: nid006803: task 0: Exited with exit code 1 | |
srun: Terminating StepId=457295.1 | |
3: E0522 12:08:22.628000 221982 torch/distributed/elastic/multiprocessing/tail_log.py:144] error in log tailor for default0. SignalException: Process 221982 got signal: 15 | |
3: [W522 12:08:22.620662239 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=4, addr=[nid006827-hsn3]:56472, remote=[nid006803]:25678): Broken pipe | |
3: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
3: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400060f13ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
3: frame #1: <unknown function> + 0x5a96a40 (0x40000fa66a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #2: <unknown function> + 0x5a99acc (0x40000fa69acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #3: <unknown function> + 0x5a9bfb4 (0x40000fa6bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40000fa6d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40000fa6d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40000fa6eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #7: <unknown function> + 0xf5ec14 (0x400009e7ec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: frame #8: <unknown function> + 0x595360 (0x4000094b5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: frame #9: /usr/bin/python() [0x5036b4] | |
3: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
3: frame #11: /usr/bin/python() [0x4c6f6c] | |
3: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
3: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
3: frame #14: /usr/bin/python() [0x528ba0] | |
3: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
3: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
3: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
3: frame #18: /usr/bin/python() [0x59c3c4] | |
3: frame #19: /usr/bin/python() [0x680e94] | |
3: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
3: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
3: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
3: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
3: frame #24: <unknown function> + 0x284c4 (0x4000082c84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: frame #25: __libc_start_main + 0x98 (0x4000082c8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
3: | |
3: W0522 12:08:22.694000 221982 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006827_221982_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
3: [W522 12:08:22.631234366 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=4, addr=[nid006827-hsn3]:56472, remote=[nid006803]:25678): Broken pipe | |
3: Exception raised from sendBytes at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): | |
3: frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xd4 (0x400060f13ea4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
3: frame #1: <unknown function> + 0x5a96a40 (0x40000fa66a40 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #2: <unknown function> + 0x5a99acc (0x40000fa69acc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #3: <unknown function> + 0x5a9bfb4 (0x40000fa6bfb4 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #4: c10d::TCPStore::doWait(c10::ArrayRef<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x148 (0x40000fa6d268 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x2c (0x40000fa6d7fc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #6: c10d::TCPStore::get(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x9c (0x40000fa6eedc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) | |
3: frame #7: <unknown function> + 0xf5ec14 (0x400009e7ec14 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: frame #8: <unknown function> + 0x595360 (0x4000094b5360 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
3: frame #9: /usr/bin/python() [0x5036b4] | |
3: frame #10: _PyObject_MakeTpCall + 0x78 (0x4c2c98 in /usr/bin/python) | |
3: frame #11: /usr/bin/python() [0x4c6f6c] | |
3: frame #12: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
3: frame #13: _PyObject_Call_Prepend + 0xc4 (0x4c4774 in /usr/bin/python) | |
3: frame #14: /usr/bin/python() [0x528ba0] | |
3: frame #15: PyObject_Call + 0x6c (0x4c517c in /usr/bin/python) | |
3: frame #16: _PyEval_EvalFrameDefault + 0x3cf4 (0x567308 in /usr/bin/python) | |
3: frame #17: PyEval_EvalCode + 0x130 (0x562204 in /usr/bin/python) | |
3: frame #18: /usr/bin/python() [0x59c3c4] | |
3: frame #19: /usr/bin/python() [0x680e94] | |
3: frame #20: _PyRun_SimpleFileObject + 0x194 (0x680a68 in /usr/bin/python) | |
3: frame #21: _PyRun_AnyFileObject + 0x54 (0x680834 in /usr/bin/python) | |
3: frame #22: Py_RunMain + 0x2dc (0x68b83c in /usr/bin/python) | |
3: frame #23: Py_BytesMain + 0x28 (0x68b3f8 in /usr/bin/python) | |
3: frame #24: <unknown function> + 0x284c4 (0x4000082c84c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: frame #25: __libc_start_main + 0x98 (0x4000082c8598 in /usr/lib/aarch64-linux-gnu/libc.so.6) | |
3: frame #26: _start + 0x30 (0x5f6df0 in /usr/bin/python) | |
3: | |
3: W0522 12:08:22.703000 221982 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'nid006827_221982_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. | |
3: Traceback (most recent call last): | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 117, in _call_store | |
3: return getattr(self._store, store_op)(*args, **kwargs) | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: torch.distributed.DistNetworkError: failed to recv, got 0 bytes | |
3: | |
3: The above exception was the direct cause of the following exception: | |
3: | |
3: Traceback (most recent call last): | |
3: File "/usr/local/bin/torchrun", line 33, in <module> | |
3: sys.exit(load_entry_point('torch==2.7.0a0+79aa17489c.nv25.4', 'console_scripts', 'torchrun')()) | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper | |
3: return f(*args, **kwargs) | |
3: ^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 892, in main | |
3: run(args) | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 883, in run | |
3: elastic_launch( | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 139, in __call__ | |
3: return launch_agent(self._config, self._entrypoint, list(args)) | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent | |
3: result = agent.run() | |
3: ^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper | |
3: result = f(*args, **kwargs) | |
3: ^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run | |
3: result = self._invoke_run(role) | |
3: ^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _invoke_run | |
3: num_nodes_waiting = rdzv_handler.num_nodes_waiting() | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1263, in num_nodes_waiting | |
3: self._state_holder.sync() | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 437, in sync | |
3: get_response = self._backend.get_state() | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 75, in get_state | |
3: base64_state: bytes = self._call_store("get", self._key) | |
3: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
3: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 119, in _call_store | |
3: raise RendezvousConnectionError( | |
3: torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. | |
srun: error: nid006825: task 1: Exited with exit code 1 | |
srun: error: nid006826: task 2: Exited with exit code 1 | |
srun: error: nid006827: task 3: Exited with exit code 1 | |
++ date | |
+ echo 'END TIME: Thu May 22 12:08:23 CEST 2025' | |
END TIME: Thu May 22 12:08:23 CEST 2025 |
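
What the tracebacks above boil down to: task 0 on nid006803, which hosts the C10d TCPStore at port 25678, exits first; every surviving torchrun agent then fails its store calls with "Broken pipe" / "failed to recv, got 0 bytes", and the c10d rendezvous backend re-raises that as RendezvousConnectionError. Below is a minimal client-side sketch of that call path, assuming the store server has already gone away; the host and port are taken from the "remote=[nid006803]:25678" lines, while the key name and timeout are illustrative guesses, not anything the log confirms.

import datetime
import torch.distributed as dist

# Client-side TCPStore pointed at the rendezvous host seen in the log.
# If the server process on that host has exited, the get() below fails with a
# network error, which torchrun's rendezvous backend wraps in RendezvousConnectionError.
store = dist.TCPStore(
    host_name="nid006803",                      # rendezvous host from the log
    port=25678,                                 # port from the sendBytes failures
    is_master=False,                            # client only; no server is started here
    timeout=datetime.timedelta(seconds=30),     # illustrative timeout, not from the log
)
store.get("torchelastic/rdzv/none")             # hypothetical key; fails once the server is gone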