Created
February 23, 2022 21:26
-
-
Save jamesr66a/d142e450c9b6a09199790f6c7ac98525 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARNING:torch.distributed.run: | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:46879 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:46879 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:46879 is not yet listening (errno: 111 - Connection refused), will retry. | |
[W socket.cpp:701] The server socket on [ip-10-200-31-5.ec2.internal]:46879 is not yet listening (errno: 111 - Connection refused), will retry. | |
REPLICATE config: False -> MultiUseParameterConfig.TRANSMIT | |
GraphModule( | |
(submod_0): GraphModule() | |
(submod_1): GraphModule() | |
(submod_2): GraphModule() | |
(_loss): MSELoss() | |
) | |
def forward(self, x, target): | |
submod_0 = self.submod_0(x) | |
getitem_2 = submod_0[2] | |
getitem = submod_0[0] | |
getitem_1 = submod_0[1] | |
submod_1 = self.submod_1(getitem, getitem_2) | |
getitem_4 = submod_1[1] | |
getitem_3 = submod_1[0] | |
submod_2 = self.submod_2(getitem_3, getitem_1, getitem_4) | |
_loss = self._loss(submod_2, target) | |
stage_backward = pippy_IR_stage_backward(stage_output = _loss, output_grads = None, input_values = [submod_2, target]); target = None | |
getitem_5 = stage_backward[0] | |
getitem_6 = stage_backward[1]; stage_backward = None | |
stage_backward_1 = pippy_IR_stage_backward(stage_output = submod_2, output_grads = getitem_5, input_values = [getitem_3, getitem_1, getitem_4]); submod_2 = getitem_5 = getitem_3 = getitem_1 = getitem_4 = None | |
getitem_7 = stage_backward_1[0] | |
getitem_8 = stage_backward_1[1] | |
getitem_9 = stage_backward_1[2]; stage_backward_1 = None | |
stage_backward_2 = pippy_IR_stage_backward(stage_output = submod_1, output_grads = [getitem_7, getitem_9], input_values = [getitem, getitem_2]); submod_1 = getitem_7 = getitem_9 = getitem = getitem_2 = None | |
getitem_10 = stage_backward_2[0] | |
getitem_11 = stage_backward_2[1]; stage_backward_2 = None | |
stage_backward_3 = pippy_IR_stage_backward(stage_output = submod_0, output_grads = [getitem_10, getitem_8, getitem_11], input_values = [x]); submod_0 = getitem_10 = getitem_8 = getitem_11 = x = None | |
getitem_12 = stage_backward_3[0]; stage_backward_3 = None | |
return _loss | |
/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py:394: UserWarning: Running pipeline with 3 stages on world_size of 10. Remaining ranks will be idle. | |
warnings.warn(f'Running pipeline with {len(executor_descriptors)} stages on world_size of {self.world_size}. ' | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x555c076a07a0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x555c076a07a0 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x555c076a07a0 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x555c07692550 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x555c0762bc20 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x555c076a07a0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x555c0757fda0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x555c0757fda0 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x555c07692550 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x555c0762bc20 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f152c006530 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f152c006530 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f152c006530 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f151c006ac0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f151c006ac0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f152c006530 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f151c006ac0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f15280085c0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f15280085c0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f15280089c0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f15280089c0 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1140078d0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7ff114008ce0 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7ff114008ce0 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7ff10c0068a0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7ff10c0068a0 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7ff118008b20 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7ff11800a360 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7ff11800a360 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7ff11800a760 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7ff11800a760 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x555c07771540 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=85) with future 0x555c0778c580 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7f1510006ae0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=83) with future 0x555c07771540 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7f1510006ae0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=39) with future 0x7ff10c0068a0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=70) with future 0x7f1510006ae0 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7ff108008ef0 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x55654ef779d0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x55654ef779d0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x55654ef779d0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=53) with future 0x55654ef779d0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=0) with future 0x555c076a07a0 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x555c076e5880 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x555c076e5880 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f6c54009090 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f6c54009090 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f6c54009bb0 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f6c54009bb0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=86) with future 0x555c076e5880 | |
(55827) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=95) with future 0x7f6c54008950 | |
(55827) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=95) with future 0x7f6c54008950 | |
(55827) ^^^^ Destructing OwnerRRef (created_on=0, local_id=86) | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=6) with future 0x555c07531120 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=0) with future 0x7f15280085c0 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=0) | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=89) with future 0x7f6c54009090 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=3) with future 0x7f15280089c0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=95) with future 0x7f6c54008950 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=3) | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x555c0762bc20 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7f1510006eb0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=73) with future 0x7f1510006eb0 | |
(55827) ^^^^ Destructing OwnerRRef (created_on=0, local_id=89) | |
(55827) ^^^^ Destructing OwnerRRef (created_on=0, local_id=95) | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7f1504006ab0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x555c07692550 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=76) with future 0x7f1504006ab0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f151c006ac0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f151c0080d0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f151c0080d0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f15200078c0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f15200078c0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=92) with future 0x7f6c54009bb0 | |
(55827) ^^^^ Destructing OwnerRRef (created_on=0, local_id=92) | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=1) with future 0x7f152c006530 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7ff11c021d30 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f152800c010 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f152800c010 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=3) with future 0x7ff124006750 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f1534006550 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f1534006550 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=9) with future 0x7ff124006b40 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=9) with future 0x7ff124006b40 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=8) with future 0x7f152800c010 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=59) with future 0x7ff11c021d30 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=0) with future 0x7ff114008ce0(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f1534009610 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f1534009610 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7ff118008ef0 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7ff118008ef0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7f153400a4e0 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7f153400a4e0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=11) with future 0x7f1534006550 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=3) with future 0x7ff11800a360 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=8) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=16) with future 0x7ff12c006410 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=16) with future 0x7ff12c006410 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=17) with future 0x7f151c006ac0 | |
(55828) Instantiating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f1534009c20 | |
(55828) Populating OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f1534009c20 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7ff10800a9e0 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7ff10800a9e0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=14) with future 0x7f1534009610 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=11) | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7ff10c006b20 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=18) with future 0x7f153400a4e0(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=6) with future 0x7ff11800a760 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=0) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1140078d0 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=14) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7ff11400b9e0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=1, local_id=22) with future 0x7f1534009c20 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=10) with future 0x555c0762bc20(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7ff10800d060 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7ff10800d060 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=22) | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=6) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=62) with future 0x7ff10c006b20 | |
(55828) ^^^^ Destructing OwnerRRef (created_on=1, local_id=18) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7ff12c0094a0 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7ff12c0094a0 | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=8) with future 0x555c07692550 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=3) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=9) with future 0x7ff124006b40 | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=56) with future 0x7ff11400b9e0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=16) with future 0x7ff12c006410 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=11) with future 0x7ff118008ef0 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=31) with future 0x7ff12c00a340 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=31) with future 0x7ff12c00a340 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f15200078c0 | |
(55829) ^^^^ Scenario 1 (created_on=0, local_id=30) | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1080096b0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=9) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=34) with future 0x7ff118008b20 | |
(55829) Instantiating OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7ff12c00ad60 | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7ff12c00ad60 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1140078d0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=11) | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=16) | |
(55829) ^^^^ Scenario 2 (created_on=0, local_id=30) | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=18) with future 0x7ff10800a9e0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1140078d0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=26) with future 0x7ff12c0094a0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=42) with future 0x7ff108008ef0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=18) | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=26) | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x555c0757fda0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=24) with future 0x7ff10800d060 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=31) with future 0x7ff12c00a340 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=23) with future 0x7f15200078c0 | |
(55829) Waiting on OwnerRRef GloballyUniqueId(created_on=2, local_id=36) with future 0x7ff12c00ad60 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=31) | |
(55827) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=12) with future 0x555c0757fda0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=24) | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f151c0080d0 | |
(55828) Waiting on OwnerRRef GloballyUniqueId(created_on=0, local_id=20) with future 0x7f151c0080d0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=2, local_id=36) | |
(55829) Populating OwnerRRef GloballyUniqueId(created_on=0, local_id=30) with future 0x7ff1080096b0 | |
(55829) ^^^^ Destructing OwnerRRef (created_on=0, local_id=30) | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py", line 105, in <module> | |
out = pipe_driver.run(input, target, chunks=CHUNKS, _debug_mask_minibatches = DEBUG_MASK_MINIBATCHES) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 584, in run | |
return self._retrieve_output_values(microbatch_interpreters, last_nodes, _debug_mask_minibatches, splits_per_arg) | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 594, in _retrieve_output_values | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 594, in <listcomp> | |
local_results = [to_here(result) for result in output_vals] | |
File "/fsx/users/jamesreed/pipeline_for_real/pippy/PipelineDriver.py", line 45, in to_here | |
return a.to_here() | |
RuntimeError: RPCErr:1:RPC ran for more than set timeout (60000 ms) and will now be marked with an error | |
terminate called without an active exception | |
terminate called recursively | |
[W tensorpipe_agent.cpp:682] RPC agent for worker7 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker9 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker3 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker1 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker2 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker4 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker6 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker5 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
[W tensorpipe_agent.cpp:682] RPC agent for worker8 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55828 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55829 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55830 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55831 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55832 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55833 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55834 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55835 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 55836 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 55827) of binary: /fsx/users/jamesreed/conda/bin/python | |
Traceback (most recent call last): | |
File "/fsx/users/jamesreed/conda/bin/torchrun", line 33, in <module> | |
sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')()) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 724, in main | |
run(args) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/run.py", line 715, in run | |
elastic_launch( | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/fsx/users/jamesreed/pytorch/torch/distributed/launcher/api.py", line 245, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/fsx/users/jamesreed/pipeline_for_real/test/local_test_forward_backward.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-02-23_21:25:30 | |
host : ip-10-200-31-5.ec2.internal | |
rank : 0 (local_rank: 0) | |
exitcode : -6 (pid: 55827) | |
error_file: <N/A> | |
traceback : Signal 6 (SIGABRT) received by PID 55827 | |
============================================================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment