Skip to content

Instantly share code, notes, and snippets.

@yngtodd
Created December 31, 2019 21:56
Show Gist options
  • Save yngtodd/315e2e56da9d513593a1f57d9c6eb573 to your computer and use it in GitHub Desktop.
Save yngtodd/315e2e56da9d513593a1f57d9c6eb573 to your computer and use it in GitHub Desktop.
mpirun -np 2 python init.py
Traceback (most recent call last):
File "init.py", line 17, in <module>
main()
File "init.py", line 12, in main
rpc.init_rpc(name, rank=rank, world_size=size)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 84, in init_rpc
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 155, in _init_rpc_backend
rpc_backend_options=rpc_backend_options,
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 75, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 131, in _process_group_init_backend_handler
raise ex
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 127, in _process_group_init_backend_handler
rpc_backend_options.rpc_timeout,
RuntimeError: RpcAgent name dgx-dl04 is not unique. (collectNames at ../torch/csrc/distributed/rpc/process_group_agent.cpp:60)
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x6a (0x7fdf6bad701a in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: torch::distributed::rpc::ProcessGroupAgent::collectNames() + 0x99c (0x7fdf84acab0c in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #2: torch::distributed::rpc::ProcessGroupAgent::ProcessGroupAgent(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::shared_ptr<c10d::ProcessGroup>, int, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x462 (0x7fdf84accdb2 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #3: <unknown function> + 0xac145d (0x7fdf84ac445d in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #4: <unknown function> + 0x2e2791 (0x7fdf842e5791 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
<omitting python frames>
frame #34: __libc_start_main + 0xf0 (0x7fdf94fc0830 in /lib/x86_64-linux-gnu/libc.so.6)
Traceback (most recent call last):
File "init.py", line 17, in <module>
main()
File "init.py", line 12, in main
rpc.init_rpc(name, rank=rank, world_size=size)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 84, in init_rpc
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 155, in _init_rpc_backend
rpc_backend_options=rpc_backend_options,
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 75, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 131, in _process_group_init_backend_handler
raise ex
File "/home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 127, in _process_group_init_backend_handler
rpc_backend_options.rpc_timeout,
RuntimeError: RpcAgent name dgx-dl04 is not unique. (collectNames at ../torch/csrc/distributed/rpc/process_group_agent.cpp:60)
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x6a (0x7f079db8401a in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: torch::distributed::rpc::ProcessGroupAgent::collectNames() + 0x99c (0x7f07b6b77b0c in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #2: torch::distributed::rpc::ProcessGroupAgent::ProcessGroupAgent(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::shared_ptr<c10d::ProcessGroup>, int, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x462 (0x7f07b6b79db2 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #3: <unknown function> + 0xac145d (0x7f07b6b7145d in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #4: <unknown function> + 0x2e2791 (0x7f07b6392791 in /home/ygx/lib/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
<omitting python frames>
frame #34: __libc_start_main + 0xf0 (0x7f07c706d830 in /lib/x86_64-linux-gnu/libc.so.6)
-------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code.. Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[21818,1],0]
Exit code: 1
--------------------------------------------------------------------------
import torch
import torch.distributed.rpc as rpc
from mpi4py import MPI
def main():
    """Initialize torch RPC with one uniquely named agent per MPI rank.

    The world size and rank are taken from ``MPI.COMM_WORLD`` so the script
    can be launched with ``mpirun -np N python init.py``.

    Raises:
        RuntimeError: propagated from ``rpc.init_rpc`` if the RPC backend
            cannot be initialized.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    # MPI.Get_processor_name() is the host name, so every rank running on
    # the same node would get the same value. RPC agent names must be
    # unique across the whole world ("RpcAgent name ... is not unique"),
    # so qualify the host name with the rank.
    name = f"{MPI.Get_processor_name()}-{rank}"
    rpc.init_rpc(name, rank=rank, world_size=size)
    print('initialized!')
# Script entry point: only run the RPC initialization when executed
# directly (e.g. under mpirun), not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment