October 21, 2025 06:07 · April 16, 2024 23:43 · October 5, 2023 16:32 · May 16, 2022 18:00 · May 14, 2022 02:57 · May 13, 2022 22:24
 [RESOLVED] Increased Error Rates and Latencies

 [03:53 PM PDT] Between 11:49 PM PDT on October 19 and 2:24 AM PDT on October 20, we experienced increased error rates and latencies for AWS Services in the US-EAST-1 Region. Additionally, services or features that rely on US-EAST-1 endpoints such as IAM and DynamoDB Global Tables also experienced issues during this time. At 12:26 AM on October 20, we identified the trigger of the event as DNS resolution issues for the regional DynamoDB service endpoints. After resolving the DynamoDB DNS issue at 2:24 AM, services began recovering but we had a subsequent impairment in the internal subsystem of EC2 that is responsible for launching EC2 instances due to its dependency on DynamoDB. As we continued to work through EC2 instance launch impairments, Network Load Balancer health checks also became impaired, resulting in network connectivity issues in multiple services such as Lambda, DynamoDB, and CloudWatch. We recovered the Network Load Balancer health checks at 9:38 A
 ================================================================================
 Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/fireworks/util/multiprocessing.py", line 70, in _wrap
    fn(i, *args)
  File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/coordinator.py", line 32, in _worker_fn
    worker.init(args)
  File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/worker.py", line 121, in init
    self.model_manager.warmup_models()
  File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/model_manager.py", line 333, in warmup_models
    pipeline.text_to_image(
 ERROR:    Error responding to query
 Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/fastapi_poe/base.py", line 174, in handle_query
    async for event in self.get_response(query):
  File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/poe_text_completion_server/fw_poe_server_bot.py", line 114, in get_response
    async for response in ChatCompletion.acreate(
  File "/usr/local/lib/python3.10/dist-packages/fireworks/client/base_completion.py", line 145, in _acreate_streaming
    async for event in _parse_sse_async(response, cls.stream_response_class):
  File "/usr/local/lib/python3.10/dist-packages/fireworks/client/base_completion.py", line 16, in _parse_sse_async
    async for line in lines:
 import torch
 import torch.fx

 def foo(x):
    with torch.autograd.profiler.record_function('fooo'):
        return torch.relu(x)


 class RecordFunctionTracer(torch.fx.Tracer):
    def trace(self, root, concrete_args=None):
  0%|          | 0/25 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `Pipe.forward` and have been ignored: input_ids, decoder_input_ids, labels. If input_ids, decoder_input_ids, labels are not expected by `Pipe.forward`,  you can safely ignore this message.
 Traceback (most recent call last):
  File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 302, in <module>
    run_worker(args.rank, args.world_size, args)
  File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 250, in run_worker
    main(args, pp_ranks_per_dp_group[rank])
  File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 207, in main
    run_training(args, model, datasets, data_collator, pippy_driver=pipeline_driver)
  File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 125, in run_training
    trainer.train()
 ###### test.py #####
 import torch
 from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5Config,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    set_seed,
 diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
 index e2f033d72a..7b3a97991d 100644
 --- a/torch/fx/graph_module.py
 +++ b/torch/fx/graph_module.py
 @@ -222,6 +222,56 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str):
     else:
         setattr(to_module, field, from_obj)
 
 +class _WrappedCall:
 +    def __init__(self, cls, cls_call):
 import argparse, socket, os

 import torch
 import torch.fx
 import torch.distributed.rpc as rpc
 import torch.multiprocessing as mp

 def run_main(args):
    class MyCode(torch.nn.Module):
        def forward(self, x):
 diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
 index 0270295dd..acf3deaf4 100644
 --- a/src/transformers/utils/fx.py
 +++ b/src/transformers/utils/fx.py
 @@ -485,18 +485,18 @@ class HFTracer(Tracer):
 
         _reset_tensor_methods(self.original_methods)
 
 -        # TODO: keep this until necessary.
 -        # This is necessary because concrete args are added as input to the traced module since
 diff --git a/test/t5_test.py b/test/t5_test.py
 index e6b36b8..09c2f13 100644
 --- a/test/t5_test.py
 +++ b/test/t5_test.py
 @@ -33,5 +33,5 @@ print(t5_pipe.split_gm.graph)
 t5_input = torch.zeros(bs, seq_length, dtype=torch.long).random_(t5.config.vocab_size)
 decoder_input_ids = torch.zeros(bs, seq_length, dtype=torch.long).random_(t5.config.vocab_size)
 
 -t5_output = t5(input_ids=t5_input, decoder_attention_mask=None, decoder_input_ids=decoder_input_ids)
 -t5_pipe_output = t5_pipe(input_ids=t5_input, decoder_attention_mask=None, decoder_input_ids=decoder_input_ids)
	[RESOLVED] Increased Error Rates and Latencies

	[03:53 PM PDT] Between 11:49 PM PDT on October 19 and 2:24 AM PDT on October 20, we experienced increased error rates and latencies for AWS Services in the US-EAST-1 Region. Additionally, services or features that rely on US-EAST-1 endpoints such as IAM and DynamoDB Global Tables also experienced issues during this time. At 12:26 AM on October 20, we identified the trigger of the event as DNS resolution issues for the regional DynamoDB service endpoints. After resolving the DynamoDB DNS issue at 2:24 AM, services began recovering but we had a subsequent impairment in the internal subsystem of EC2 that is responsible for launching EC2 instances due to its dependency on DynamoDB. As we continued to work through EC2 instance launch impairments, Network Load Balancer health checks also became impaired, resulting in network connectivity issues in multiple services such as Lambda, DynamoDB, and CloudWatch. We recovered the Network Load Balancer health checks at 9:38 A
	================================================================================
	Traceback (most recent call last):
	File "/usr/local/lib/python3.10/dist-packages/fireworks/util/multiprocessing.py", line 70, in _wrap
	fn(i, *args)
	File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/coordinator.py", line 32, in _worker_fn
	worker.init(args)
	File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/worker.py", line 121, in init
	self.model_manager.warmup_models()
	File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/image_generation/model_manager.py", line 333, in warmup_models
	pipeline.text_to_image(
	ERROR: Error responding to query
	Traceback (most recent call last):
	File "/usr/local/lib/python3.10/dist-packages/fastapi_poe/base.py", line 174, in handle_query
	async for event in self.get_response(query):
	File "/usr/local/lib/python3.10/dist-packages/fireworks/serving/poe_text_completion_server/fw_poe_server_bot.py", line 114, in get_response
	async for response in ChatCompletion.acreate(
	File "/usr/local/lib/python3.10/dist-packages/fireworks/client/base_completion.py", line 145, in _acreate_streaming
	async for event in _parse_sse_async(response, cls.stream_response_class):
	File "/usr/local/lib/python3.10/dist-packages/fireworks/client/base_completion.py", line 16, in _parse_sse_async
	async for line in lines:
	import torch
	import torch.fx

	def foo(x):
	with torch.autograd.profiler.record_function('fooo'):
	return torch.relu(x)


	class RecordFunctionTracer(torch.fx.Tracer):
	def trace(self, root, concrete_args=None):
	0%\| \| 0/25 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `Pipe.forward` and have been ignored: input_ids, decoder_input_ids, labels. If input_ids, decoder_input_ids, labels are not expected by `Pipe.forward`, you can safely ignore this message.
	Traceback (most recent call last):
	File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 302, in <module>
	run_worker(args.rank, args.world_size, args)
	File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 250, in run_worker
	main(args, pp_ranks_per_dp_group[rank])
	File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 207, in main
	run_training(args, model, datasets, data_collator, pippy_driver=pipeline_driver)
	File "/fsx/users/jamesreed/hf_t5_gcp_megagpu_pippy/train.py", line 125, in run_training
	trainer.train()
	###### test.py #####
	import torch
	from transformers import (
	Seq2SeqTrainer,
	Seq2SeqTrainingArguments,
	T5Config,
	T5ForConditionalGeneration,
	DataCollatorForSeq2Seq,
	AutoTokenizer,
	set_seed,
	diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
	index e2f033d72a..7b3a97991d 100644
	--- a/torch/fx/graph_module.py
	+++ b/torch/fx/graph_module.py
	@@ -222,6 +222,56 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str):
	else:
	setattr(to_module, field, from_obj)

	+class _WrappedCall:
	+ def __init__(self, cls, cls_call):
	import argparse, socket, os

	import torch
	import torch.fx
	import torch.distributed.rpc as rpc
	import torch.multiprocessing as mp

	def run_main(args):
	class MyCode(torch.nn.Module):
	def forward(self, x):
	diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
	index 0270295dd..acf3deaf4 100644
	--- a/src/transformers/utils/fx.py
	+++ b/src/transformers/utils/fx.py
	@@ -485,18 +485,18 @@ class HFTracer(Tracer):

	_reset_tensor_methods(self.original_methods)

	- # TODO: keep this until necessary.
	- # This is necessary because concrete args are added as input to the traced module since
	diff --git a/test/t5_test.py b/test/t5_test.py
	index e6b36b8..09c2f13 100644
	--- a/test/t5_test.py
	+++ b/test/t5_test.py
	@@ -33,5 +33,5 @@ print(t5_pipe.split_gm.graph)
	t5_input = torch.zeros(bs, seq_length, dtype=torch.long).random_(t5.config.vocab_size)
	decoder_input_ids = torch.zeros(bs, seq_length, dtype=torch.long).random_(t5.config.vocab_size)

	-t5_output = t5(input_ids=t5_input, decoder_attention_mask=None, decoder_input_ids=decoder_input_ids)
	-t5_pipe_output = t5_pipe(input_ids=t5_input, decoder_attention_mask=None, decoder_input_ids=decoder_input_ids)