Created
December 31, 2019 21:56
-
-
Save yngtodd/315e2e56da9d513593a1f57d9c6eb573 to your computer and use it in GitHub Desktop.
mpirun -np 2 python init.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Traceback (most recent call last): | |
File "init.py", line 17, in <module> | |
main() | |
File "init.py", line 12, in main | |
rpc.init_rpc(name, rank=rank, world_size=size) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 84, in init_rpc | |
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 155, in _init_rpc_backend | |
rpc_backend_options=rpc_backend_options, | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 75, in init_backend | |
return backend.value.init_backend_handler(*args, **kwargs) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 131, in _process_group_init_backend_handler | |
raise ex | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 127, in _process_group_init_backend_handler | |
rpc_backend_options.rpc_timeout, | |
RuntimeError: RpcAgent name dgx-dl04 is not unique. (collectNames at ../torch/csrc/distributed/rpc/process_group_agent.cpp:60) | |
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x6a (0x7fdf6bad701a in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libc10.so) | |
frame #1: torch::distributed::rpc::ProcessGroupAgent::collectNames() + 0x99c (0x7fdf84acab0c in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #2: torch::distributed::rpc::ProcessGroupAgent::ProcessGroupAgent(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::shared_ptr<c10d::ProcessGroup>, int, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x462 (0x7fdf84accdb2 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #3: <unknown function> + 0xac145d (0x7fdf84ac445d in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #4: <unknown function> + 0x2e2791 (0x7fdf842e5791 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #34: __libc_start_main + 0xf0 (0x7fdf94fc0830 in /lib/x86_64-linux-gnu/libc.so.6) | |
Traceback (most recent call last): | |
File "init.py", line 17, in <module> | |
main() | |
File "init.py", line 12, in main | |
rpc.init_rpc(name, rank=rank, world_size=size) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 84, in init_rpc | |
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 155, in _init_rpc_backend | |
rpc_backend_options=rpc_backend_options, | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 75, in init_backend | |
return backend.value.init_backend_handler(*args, **kwargs) | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 131, in _process_group_init_backend_handler | |
raise ex | |
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 127, in _process_group_init_backend_handler | |
rpc_backend_options.rpc_timeout, | |
RuntimeError: RpcAgent name dgx-dl04 is not unique. (collectNames at ../torch/csrc/distributed/rpc/process_group_agent.cpp:60) | |
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x6a (0x7f079db8401a in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libc10.so) | |
frame #1: torch::distributed::rpc::ProcessGroupAgent::collectNames() + 0x99c (0x7f07b6b77b0c in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #2: torch::distributed::rpc::ProcessGroupAgent::ProcessGroupAgent(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::shared_ptr<c10d::ProcessGroup>, int, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x462 (0x7f07b6b79db2 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #3: <unknown function> + 0xac145d (0x7f07b6b7145d in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
frame #4: <unknown function> + 0x2e2791 (0x7f07b6392791 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so) | |
<omitting python frames> | |
frame #34: __libc_start_main + 0xf0 (0x7f07c706d830 in /lib/x86_64-linux-gnu/libc.so.6) | |
------------------------------------------------------- | |
Primary job terminated normally, but 1 process returned | |
a non-zero exit code.. Per user-direction, the job has been aborted. | |
------------------------------------------------------- | |
-------------------------------------------------------------------------- | |
mpirun detected that one or more processes exited with non-zero status, thus causing | |
the job to be terminated. The first process to do so was: | |
Process name: [[21818,1],0] | |
Exit code: 1 | |
-------------------------------------------------------------------------- |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.distributed.rpc as rpc | |
from mpi4py import MPI | |
def main():
    """Initialize the torch.distributed RPC framework for this MPI process.

    World size and rank are taken from ``MPI.COMM_WORLD``. The RPC worker
    name is built from the processor (host) name plus the MPI rank.
    """
    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    # The processor name alone is shared by every rank launched on the same
    # node, and init_rpc rejects duplicate names ("RpcAgent name ... is not
    # unique"). Appending the rank keeps the host visible while making the
    # name unique per process.
    name = f'{MPI.Get_processor_name()}-{rank}'
    rpc.init_rpc(name, rank=rank, world_size=size)
    print('initialized!')
# Run main() only when this file is executed as a script (e.g. via mpirun),
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment