Created
February 23, 2022 21:22
-
-
Save jamesr66a/e1894d79c4d7a4276a203d1090509ef8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARNING:torch.distributed.run: | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:39643 is not yet listening (errno: 111 - Connection refused), will retry. | |
REPLICATE config: False -> MultiUseParameterConfig.TRANSMIT | |
GraphModule( | |
(submod_0): GraphModule() | |
(submod_1): GraphModule() | |
(submod_2): GraphModule() | |
(_loss): MSELoss() | |
) | |
def forward(self, x, target): | |
submod_0 = self.submod_0(x) | |
getitem_2 = submod_0[2] | |
getitem = submod_0[0] | |
getitem_1 = submod_0[1] | |
submod_1 = self.submod_1(getitem, getitem_2) | |
getitem_4 = submod_1[1] | |
getitem_3 = submod_1[0] | |
submod_2 = self.submod_2(getitem_3, getitem_1, getitem_4) | |
_loss = self._loss(submod_2, target) | |
stage_backward = pippy_IR_stage_backward(stage_output = _loss, output_grads = None, input_values = [submod_2, target]); target = None | |
getitem_5 = stage_backward[0] | |
getitem_6 = stage_backward[1]; stage_backward = None | |
stage_backward_1 = pippy_IR_stage_backward(stage_output = submod_2, output_grads = getitem_5, input_values = [getitem_3, getitem_1, getitem_4]); submod_2 = getitem_5 = getitem_3 = getitem_1 = getitem_4 = None | |
getitem_7 = stage_backward_1[0] | |
getitem_8 = stage_backward_1[1] | |
getitem_9 = stage_backward_1[2]; stage_backward_1 = None | |
stage_backward_2 = pippy_IR_stage_backward(stage_output = submod_1, output_grads = [getitem_7, getitem_9], input_values = [getitem, getitem_2]); submod_1 = getitem_7 = getitem_9 = getitem = getitem_2 = None | |
getitem_10 = stage_backward_2[0] | |
getitem_11 = stage_backward_2[1]; stage_backward_2 = None | |
stage_backward_3 = pippy_IR_stage_backward(stage_output = submod_0, output_grads = [getitem_10, getitem_8, getitem_11], input_values = [x]); submod_0 = getitem_10 = getitem_8 = getitem_11 = x = None | |
getitem_12 = stage_backward_3[0]; stage_backward_3 = None | |
return _loss | |
/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py:394: UserWarning: Running pipeline with 3 stages on world_size of 10. Remaining ranks will be idle. | |
warnings.warn(f'Running pipeline with {len(executor_descriptors)} stages on world_size of {self.world_size}. ' | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x56537d81cf60 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x56537d81cf60 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x56537d81cf60 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x56537d857210 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x56537d81cf60 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x56537d8556d0 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x56537d85e360 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x56537d8556d0 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x56537d85e360 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x56537d857210 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f5a3c006750 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f5a3c006750 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f5a3c006750 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f5a3c006750 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f5a2c0068a0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f5a38008210 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f5a38008210 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f5a2c0068a0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f5a38008650 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f5a38008650 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f5a2c0068a0 | |
(49856) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f85b0007870 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f85a8006ac0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7f85a4008000 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f85a8006ac0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f85b00091f0 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f85b00091f0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f85a400a9b0 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f85a400a9b0 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x56537d89b4d0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f85a400ae00 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f85a400ae00 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=85) with future 0x56537d89dfe0 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x56537d89b4d0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x56449ccab120 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x56449ccab120 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x56449ccab120 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30)(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f85a8006ac0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7f85940097d0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f85a40092e0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f85a40092e0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f85a40092e0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f85a40092e0 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x56537d81cf60 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f3ce000eb50 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f3ce000eb50 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f3c38006a20 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f3c38006a20 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f3ce000eb50 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f3c38009470 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f3c38009470 | |
(49854) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f3c38008e40 | |
(49854) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f3c38008e40 | |
(49854) ^^^^ Destructing OwnerRRef (created_on=0, local_id=86) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x56537d814240 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f3c38009470 | |
(49854) ^^^^ Destructing OwnerRRef (created_on=0, local_id=92) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f3c38006a20 | |
(49854) ^^^^ Destructing OwnerRRef (created_on=0, local_id=89) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f3c38008e40 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7f5a2c006c70 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7f5a2c006c70 | |
(49854) ^^^^ Destructing OwnerRRef (created_on=0, local_id=96) | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7f5a38007840 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7f5a38007840 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f5a38008210 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f5a38008650 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=0)(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x56537d8556d0 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=3) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7f859c006970 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x56537d857210 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7f859c006970 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f85b00091f0 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=0) | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f5a2c0068a0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f5a30008280 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f5a30008280 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f85a400a9b0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f5a2c007c80 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f5a2c007c80 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=3) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7f859400c600 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7f859400c600 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f8630015290 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f5a3c006750 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=13) with future 0x7f85a8006d60 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=13) with future 0x7f85a8006d60 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f5a14006cd0 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f5a14006cd0 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7f85a4008000 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f85b0007870 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x7f8594009d30 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x7f8594009d30 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f5a44006550 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f5a44006550 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=19) with future 0x7f85c8008500 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=19) with future 0x7f85c8008500 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f5a44009de0 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f5a44009de0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f85a400ae00 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f5a14006cd0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=17) with future 0x7f5a4400a8e0 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=17) with future 0x7f5a4400a8e0 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f8594009090 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f8594009090 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f5a44006550 | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7f85c80092e0 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7f85c80092e0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7f859400c600 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=8) | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f85c800a370 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f85c800a370 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f5a2c0068a0 | |
(49855) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f5a4400b3f0 | |
(49855) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f5a4400b3f0 | |
(49856) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f859400d840 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=6) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=19) with future 0x7f85c8008500 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f5a44009de0 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=11) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7f86300195c0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f85b0007870 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=13) with future 0x7f85a8006d60 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=19) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7f86300195c0 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=17) with future 0x7f5a4400a8e0 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=14) | |
(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7f85c800afd0 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7f85c800afd0 | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f5a4400b3f0 | |
(49856) ^^^^ Scenario 2 (created_on=2, local_id=30) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x7f8594009d30 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=11) | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=22) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f5a30008280 | |
(49855) ^^^^ Destructing OwnerRRef (created_on=1, local_id=17) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7f85940097d0 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x56537d85e360(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7f85c80092e0 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=17) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x56537d857210(49856) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7f859400f8c0 | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x56537d8556d0 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f8594009090 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7f859400f8c0 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=13) | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=26) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f5a30008280 | |
(49856) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f5a2c007c80 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f85c800a370 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=24) | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f85b0007870 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=30) | |
(49854) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x56537d85e360 | |
(49856) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7f85c800afd0 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=2, local_id=36) | |
(49855) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f5a2c007c80 | |
(49856) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f859400d840 | |
(49856) ^^^^ Destructing OwnerRRef (created_on=0, local_id=30) | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py", line 105, in <module> | |
out = pipe_driver.run(input, target, chunks=CHUNKS, _debug_mask_minibatches = DEBUG_MASK_MINIBATCHES) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 584, in run | |
return self._retrieve_output_values(microbatch_interpreters, last_nodes, _debug_mask_minibatches, splits_per_arg) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 594, in _retrieve_output_values | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 594, in <listcomp> | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 45, in to_here | |
return a.to_here() | |
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error | |
terminate called without an active exception | |
[W tensorpipe_agent.cpp:682] RPC agent for worker7 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker6 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker1 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker2 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker8 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker5 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker3 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker9 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker4 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49855 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49856 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49857 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49858 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49859 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49860 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49861 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49862 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 49863 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 49854) of binary: /fsx/users/jamesreed/conda/bin/python | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/conda/bin/torchrun", line 33, in <module> | |
sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')()) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 724, in main | |
run(args) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 715, in run | |
elastic_launch( | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 245, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-02-23_21:20:16 | |
host : ip-10-200-31-5.ec2.internal | |
rank : 0 (local_rank: 0) | |
exitcode : -6 (pid: 49854) | |
error_file: <N/A> | |
traceback : Signal 6 (SIGABRT) received by PID 49854 | |
============================================================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment