Created
February 23, 2022 21:00
-
-
Save jamesr66a/e61af6a3dab6caa466230314efed0c78 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARNING:torch.distributed.run: | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:56439 is not yet listening (errno: 111 - Connection refused), will retry. | |
REPLICATE config: False -> MultiUseParameterConfig.TRANSMIT | |
GraphModule( | |
(submod_0): GraphModule() | |
(submod_1): GraphModule() | |
(submod_2): GraphModule() | |
(_loss): MSELoss() | |
) | |
def forward(self, x, target): | |
submod_0 = self.submod_0(x) | |
getitem_2 = submod_0[2] | |
getitem = submod_0[0] | |
getitem_1 = submod_0[1] | |
submod_1 = self.submod_1(getitem, getitem_2) | |
getitem_4 = submod_1[1] | |
getitem_3 = submod_1[0] | |
submod_2 = self.submod_2(getitem_3, getitem_1, getitem_4) | |
_loss = self._loss(submod_2, target) | |
stage_backward = pippy_IR_stage_backward(stage_output = _loss, output_grads = None, input_values = [submod_2, target]); target = None | |
getitem_5 = stage_backward[0] | |
getitem_6 = stage_backward[1]; stage_backward = None | |
stage_backward_1 = pippy_IR_stage_backward(stage_output = submod_2, output_grads = getitem_5, input_values = [getitem_3, getitem_1, getitem_4]); submod_2 = getitem_5 = getitem_3 = getitem_1 = getitem_4 = None | |
getitem_7 = stage_backward_1[0] | |
getitem_8 = stage_backward_1[1] | |
getitem_9 = stage_backward_1[2]; stage_backward_1 = None | |
stage_backward_2 = pippy_IR_stage_backward(stage_output = submod_1, output_grads = [getitem_7, getitem_9], input_values = [getitem, getitem_2]); submod_1 = getitem_7 = getitem_9 = getitem = getitem_2 = None | |
getitem_10 = stage_backward_2[0] | |
getitem_11 = stage_backward_2[1]; stage_backward_2 = None | |
stage_backward_3 = pippy_IR_stage_backward(stage_output = submod_0, output_grads = [getitem_10, getitem_8, getitem_11], input_values = [x]); submod_0 = getitem_10 = getitem_8 = getitem_11 = x = None | |
getitem_12 = stage_backward_3[0]; stage_backward_3 = None | |
return _loss | |
/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py:394: UserWarning: Running pipeline with 3 stages on world_size of 10. Remaining ranks will be idle. | |
warnings.warn(f'Running pipeline with {len(executor_descriptors)} stages on world_size of {self.world_size}. ' | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x559ea1076e90 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x559ea1076e90 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x559ea1076e90 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x559ea10bf350 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x559ea1076e90 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x559ea1081bf0 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x559ea0ff7c90 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x559ea10bf350 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x559ea0ff7c90 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x559ea1081bf0 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fb940006530 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fb940006530 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fb940006530 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fb9300068a0(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fb940006530 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fb92c008230 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fb92c008230 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fb9300068a0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fb9300068a0 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fb92c008670 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fb92c008670 | |
(40082) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda38007a90 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7fda380090a0 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7fda380090a0 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7fda30008770 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7fda3000a730 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7fda3000a730 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7fda3000ab80 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7fda3000ab80(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x559ea10c1250 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7fda3000ce20 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=85) with future 0x559ea10ce950 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x559ea10c1250 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fb918006ae0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fb918006ae0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7fb918006ae0 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7fda3000ce20(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7fda3000ce20 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7fda4c009c20 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7fdab400c430 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7fdab400c430 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7fdab400c430 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x7fdab400c430 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x559ea1076e90 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7faf30006210 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7faf30006210 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7faf3000e830 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7faf3000e830 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x7faf30006210 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7faf3000f3f0 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7faf3000f3f0 | |
(40080) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7faf30010010 | |
(40080) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7faf30010010 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x559ea10bd6a0 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7faf3000e830 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=96) with future 0x7faf30010010 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7fb92c008230 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7fb9100064a0 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7fb914008a10 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7fb9100064a0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7fb92c008670 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x559ea1081bf0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7fb914008a10 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x559ea10bf350 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fb9300068a0 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fb930007920 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fb930007920 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fb928008180 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fb928008180 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7faf3000f3f0 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7fda3000a730 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7fda3000ab80 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7fda380090a0 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7fdab4017450 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7fda3800c370 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7fda3800c370 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7fb940006530 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7fdab4015290 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7fb92c00a970 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7fb92c00a970 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7fda30008770 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=14) with future 0x7fda0c0061a0 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=14) with future 0x7fda0c0061a0 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fb9440074d0 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fb9440074d0 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda38007a90 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7fdab4017450 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda38007a90 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fb9440099d0 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fb9440099d0 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7fda3800b410 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7fda3800b410 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7fb92c00a970 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=20) with future 0x7fda0c008b10 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=20) with future 0x7fda0c008b10 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fb94400a660 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fb94400a660 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7fb9440074d0 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7fda3800cd70 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7fda3800cd70 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=14) with future 0x7fda0c0061a0 | |
(40081) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fb944009160 | |
(40081) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fb944009160 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=12) with future 0x7fda3800c370 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7fda0c009db0 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7fda0c009db0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7fb9440099d0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7fb9300068a0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7fb94400a660 | |
(40082) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7fb944009160 | |
(40082) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda3800d940 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7fda2c008e40 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda38007a90 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7fda3800b410 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=20) with future 0x7fda0c008b10 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x559ea1081bf0 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=32) with future 0x7fda0c00aac0 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=32) with future 0x7fda0c00aac0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fb928008180 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x559ea10bf350 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7fda2c008e40 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7fda4c009e70 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x559ea0ff7c90 | |
(40082) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=37) with future 0x7fda0c00b390 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=37) with future 0x7fda0c00b390 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7fda4c009e70 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7fda4c009c20 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7fda3800cd70 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7fda0c009db0 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fb930007920 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=32) with future 0x7fda0c00aac0(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7fb928008180 | |
(40082) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=37) with future 0x7fda0c00b390 | |
(40080) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x559ea0ff7c90 | |
(40081) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7fb930007920 | |
(40082) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7fda3800d940 | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py", line 105, in <module> | |
out = pipe_driver.run(input, target, chunks=CHUNKS, _debug_mask_minibatches = DEBUG_MASK_MINIBATCHES) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 586, in run | |
return self._retrieve_output_values(microbatch_interpreters, last_nodes, _debug_mask_minibatches, splits_per_arg) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 596, in _retrieve_output_values | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 596, in <listcomp> | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 45, in to_here | |
return a.to_here() | |
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error | |
terminate called without an active exception | |
terminate called recursively | |
[W tensorpipe_agent.cpp:682] RPC agent for worker1 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker6 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker2 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker8 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker5 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker9 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker4 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker7 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker3 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40081 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40082 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40083 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40084 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40085 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40086 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40087 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40088 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 40089 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 40080) of binary: /fsx/users/jamesreed/conda/bin/python | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/conda/bin/torchrun", line 33, in <module> | |
sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')()) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 724, in main | |
run(args) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 715, in run | |
elastic_launch( | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 245, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-02-23_20:59:16 | |
host : ip-10-200-31-5.ec2.internal | |
rank : 0 (local_rank: 0) | |
exitcode : -6 (pid: 40080) | |
error_file: <N/A> | |
traceback : Signal 6 (SIGABRT) received by PID 40080 | |
============================================================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment