Created
February 23, 2022 20:44
-
-
Save jamesr66a/4bb7702e94c0831ed60576e7cddcc97f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARNING:torch.distributed.run: | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:59755 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:59755 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:59755 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:59755 is not yet listening (errno: 111 - Connection refused), will retry. | |
REPLICATE config: False -> MultiUseParameterConfig.TRANSMIT | |
GraphModule( | |
(submod_0): GraphModule() | |
(submod_1): GraphModule() | |
(submod_2): GraphModule() | |
(_loss): MSELoss() | |
) | |
def forward(self, x, target): | |
submod_0 = self.submod_0(x) | |
getitem_2 = submod_0[2] | |
getitem = submod_0[0] | |
getitem_1 = submod_0[1] | |
submod_1 = self.submod_1(getitem, getitem_2) | |
getitem_4 = submod_1[1] | |
getitem_3 = submod_1[0] | |
submod_2 = self.submod_2(getitem_3, getitem_1, getitem_4) | |
_loss = self._loss(submod_2, target) | |
stage_backward = pippy_IR_stage_backward(stage_output = _loss, output_grads = None, input_values = [submod_2, target]); target = None | |
getitem_5 = stage_backward[0] | |
getitem_6 = stage_backward[1]; stage_backward = None | |
stage_backward_1 = pippy_IR_stage_backward(stage_output = submod_2, output_grads = getitem_5, input_values = [getitem_3, getitem_1, getitem_4]); submod_2 = getitem_5 = getitem_3 = getitem_1 = getitem_4 = None | |
getitem_7 = stage_backward_1[0] | |
getitem_8 = stage_backward_1[1] | |
getitem_9 = stage_backward_1[2]; stage_backward_1 = None | |
stage_backward_2 = pippy_IR_stage_backward(stage_output = submod_1, output_grads = [getitem_7, getitem_9], input_values = [getitem, getitem_2]); submod_1 = getitem_7 = getitem_9 = getitem = getitem_2 = None | |
getitem_10 = stage_backward_2[0] | |
getitem_11 = stage_backward_2[1]; stage_backward_2 = None | |
stage_backward_3 = pippy_IR_stage_backward(stage_output = submod_0, output_grads = [getitem_10, getitem_8, getitem_11], input_values = [x]); submod_0 = getitem_10 = getitem_8 = getitem_11 = x = None | |
getitem_12 = stage_backward_3[0]; stage_backward_3 = None | |
return _loss | |
/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py:394: UserWarning: Running pipeline with 3 stages on world_size of 10. Remaining ranks will be idle. | |
warnings.warn(f'Running pipeline with {len(executor_descriptors)} stages on world_size of {self.world_size}. ' | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x563fac438740 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x563fac438740 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x563fac438740 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x563fac3c1cc0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x563fac438740 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x563fac3c31c0 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x563fac43af80 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x563fac3c1cc0 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x563fac43af80 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x563fac3c31c0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fdc38006750 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fdc38006750 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fdc38006750 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fdc28006ac0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fdc38006750 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fdc28006ac0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fdc28006ac0 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fdc340087a0 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fdc340087a0 | |
^^^^ Scenario 1 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f804c007a00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fdc34008b10 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fdc34008b10 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f8044006ac0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f804c008dc0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f804c008dc0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7f8050007f10 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f8044006ac0 | |
^^^^ Scenario 2 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7f8044006ac0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x563fac4ffed0 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=85) with future 0x563fac50b710 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f805000a3b0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f805000a3b0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x563fac4ffed0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f805000a800 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f805000a800 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fdc10006ae0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fdc10006ae0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f8034006ac0 | |
^^^^ Scenario 2 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7f8040008bb0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f8034006ac0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f8034006ac0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fdc10006ae0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7f8034006ac0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x563fac438740 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f81340064a0 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f81340064a0 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f813400e5a0 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f813400e5a0 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f813400f490 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f813400f490 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7f81340064a0 | |
(32500) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f8134010000 | |
(32500) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f8134010000 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x563fac45a800 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f813400e5a0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f813400f490 | |
^^^^ Scenario 2 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7f805000a3b0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7f804c008dc0 | |
^^^^ Scenario 2 | |
^^^^ Scenario 2 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7f8040007660 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7f805000a800 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f804c007a00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7f8050007f10 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7f8040007660 | |
^^^^ Scenario 2 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f804c007a00 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7fdc2c006cb0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7fdc2c006cb0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7f8134010000 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7f8044006d60 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7f8044006d60 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fdc38006750 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7f80d8015290 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7fdc200061a0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=15) with future 0x7f8034006e90 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=15) with future 0x7f8034006e90 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=6) with future 0x7fdc40006410 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=6) with future 0x7fdc40006410 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fdc34008b10 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fdc340087a0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7fdc200061a0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x55877209b7f0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x55877209b7f0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=21) with future 0x7f80700085c0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=21) with future 0x7f80700085c0 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fdc40008f00 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fdc40008f00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7f8044006d60 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f805000dee0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f805000dee0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x563fac3c31c0 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fdc40009ee0 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fdc40009ee0 | |
^^^^ Scenario 2 | |
^^^^ Scenario 1(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x563fac3c1cc0 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f8070009670 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=17) with future 0x55877209b7f0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=6) with future 0x7fdc40006410 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fdc40008f00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=21) with future 0x7f80700085c0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x563fac43af80 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fdc4000bb00 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fdc4000bb00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=15) with future 0x7f8034006e90 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f805000e9e0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f805000e9e0 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fdc38009030 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fdc28006ac0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7f805000dee0 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fdc4000c150 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fdc4000c150 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fdc38009030 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7f80d800c430 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fdc40009ee0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7f80d800c430 | |
(32501) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fdc380092a0^^^^ Scenario 2 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=35) with future 0x7f805000f530 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=35) with future 0x7f805000f530 | |
^^^^ Scenario 2 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7f8070007860 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f804c007a00 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7f8040008bb0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7f8070007860 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fdc380092a0 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=30) with future 0x7f805000e9e0 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fdc4000bb00 | |
(32502) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=38) with future 0x7f805000fb70 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=38) with future 0x7f805000fb70 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fdc4000c150 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=35) with future 0x7f805000f530 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x563fac3c31c0 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fdc28006ac0 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fdc380092a0 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x563fac3c1cc0 | |
(32501) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fdc38009030 | |
(32502) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=38) with future 0x7f805000fb70 | |
(32500) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x563fac43af80 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fdc38009030 | |
(32501) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fdc380092a0 | |
(32502) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7f8070009670 | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py", line 105, in <module> | |
out = pipe_driver.run(input, target, chunks=CHUNKS, _debug_mask_minibatches = DEBUG_MASK_MINIBATCHES) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 586, in run | |
return self._retrieve_output_values(microbatch_interpreters, last_nodes, _debug_mask_minibatches, splits_per_arg) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 596, in _retrieve_output_values | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 596, in <listcomp> | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 45, in to_here | |
return a.to_here() | |
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error | |
terminate called without an active exception | |
[W tensorpipe_agent.cpp:682] RPC agent for worker1 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker4 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker9 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker2 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker7 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker5 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker6 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker3 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker8 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32501 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32502 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32503 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32504 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32505 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32506 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32507 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32508 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 32509 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 32500) of binary: /fsx/users/jamesreed/conda/bin/python | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/conda/bin/torchrun", line 33, in <module> | |
sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')()) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 724, in main | |
run(args) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 715, in run | |
elastic_launch( | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 245, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-02-23_20:44:10 | |
host : ip-10-200-31-5.ec2.internal | |
rank : 0 (local_rank: 0) | |
exitcode : -6 (pid: 32500) | |
error_file: <N/A> | |
traceback : Signal 6 (SIGABRT) received by PID 32500 | |
============================================================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment