antiagainst · August 9, 2021 21:46
diff --git a/MobileBertSquad-2021-08-09.mlir b/MobileBertSquad-2021-08-09.mlir
 // -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
 builtin.module  {
  // Variables whose symbol names fall under `bert/embeddings/`. Each one is
  // declared private (`sym_visibility = "private"`) with a statically shaped
  // f32 tensor type: five rank-1/rank-2 tensors with a 512-wide dimension, the
  // 2x512 token-type table, and the 30522x128 word-embedding table.
  // NOTE(review): every initializer is `opaque<"_", "0xDEADBEEF">`, which looks
  // like the IR printer's placeholder for elided constant data rather than real
  // values -- confirm against the flags used to produce this dump.
  flow.variable @"__iree_flow_bert/embeddings/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/embedding_transformation/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/embedding_transformation/kernel" opaque<"_", "0xDEADBEEF"> : tensor<384x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/position_embeddings" opaque<"_", "0xDEADBEEF"> : tensor<512x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/token_type_embeddings" opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/embeddings/word_embeddings" opaque<"_", "0xDEADBEEF"> : tensor<30522x128xf32> attributes {sym_visibility = "private"}
  // Variables named under `bert/encoder/layer_0/` (46 declarations), all
  // private f32 tensors. Kernel shapes as declared below:
  //   attention/output/dense and attention/self/{key,query}: 128x128
  //   attention/self/value and bottleneck/{attention,input}/dense: 512x128
  //   ffn_layer_{0,1,2}/intermediate/dense and intermediate/dense: 128x512
  //   ffn_layer_{0,1,2}/output/dense and output/dense: 512x128
  //   output/bottleneck/dense: 128x512
  // Biases and FakeLayerNorm beta/gamma are rank-1 with 128 or 512 elements.
  // NOTE(review): the `opaque<"_", "0xDEADBEEF">` initializers look like elided
  // constant data, not real values -- confirm.
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // Variables named under `bert/encoder/layer_1/` (46 declarations), all
  // private f32 tensors. The sub-names cover attention (output, self
  // key/query/value), bottleneck (attention, input), ffn_layer_0..2,
  // intermediate, and output (including output/bottleneck). Kernels are rank-2
  // (128x128, 512x128 or 128x512); biases and FakeLayerNorm beta/gamma are
  // rank-1 with 128 or 512 elements.
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // Variables named under `bert/encoder/layer_10/` (46 declarations), all
  // private f32 tensors. Note that this group directly follows the `layer_1/`
  // declarations: the symbols are listed in string order of their names
  // (layer_1, layer_10, layer_11, ...), not in numeric layer order.
  // Kernels are rank-2 (128x128, 512x128 or 128x512); biases and FakeLayerNorm
  // beta/gamma are rank-1 with 128 or 512 elements.
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_10/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // Variables named under `bert/encoder/layer_11/` (46 declarations), all
  // private f32 tensors. The sub-names cover attention (output, self
  // key/query/value), bottleneck (attention, input), ffn_layer_0..2,
  // intermediate, and output (including output/bottleneck). Kernels are rank-2
  // (128x128, 512x128 or 128x512); biases and FakeLayerNorm beta/gamma are
  // rank-1 with 128 or 512 elements.
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_11/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_12: 46 private flow.variable declarations ----
  // Each line declares one f32 tensor of the stated shape, keyed by its path-like symbol name.
  // NOTE(review): every initializer is the same opaque<"_", "0xDEADBEEF"> attribute —
  // presumably a stand-in for elided weight data; confirm how this dump was printed.
  // attention/output: FakeLayerNorm beta/gamma plus a 128x128 dense kernel and bias.
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this asymmetry is in the dump itself.
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma and a 512x128 dense kernel with a 128-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0 .. ffn_layer_2: three groups with the same six variables each
  // (intermediate dense 128x512, output FakeLayerNorm, output dense 512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense: 128x512 kernel with a 512-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_12/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  // output: FakeLayerNorm (128), output/bottleneck (512-wide), and output/dense (512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_12/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_13: 46 private flow.variable declarations ----
  // Each line declares one f32 tensor of the stated shape, keyed by its path-like symbol name.
  // NOTE(review): every initializer is the same opaque<"_", "0xDEADBEEF"> attribute —
  // presumably a stand-in for elided weight data; confirm how this dump was printed.
  // attention/output: FakeLayerNorm beta/gamma plus a 128x128 dense kernel and bias.
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this asymmetry is in the dump itself.
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma and a 512x128 dense kernel with a 128-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0 .. ffn_layer_2: three groups with the same six variables each
  // (intermediate dense 128x512, output FakeLayerNorm, output dense 512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense: 128x512 kernel with a 512-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_13/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  // output: FakeLayerNorm (128), output/bottleneck (512-wide), and output/dense (512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_13/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_14: 46 private flow.variable declarations ----
  // Each line declares one f32 tensor of the stated shape, keyed by its path-like symbol name.
  // NOTE(review): every initializer is the same opaque<"_", "0xDEADBEEF"> attribute —
  // presumably a stand-in for elided weight data; confirm how this dump was printed.
  // attention/output: FakeLayerNorm beta/gamma plus a 128x128 dense kernel and bias.
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this asymmetry is in the dump itself.
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma and a 512x128 dense kernel with a 128-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0 .. ffn_layer_2: three groups with the same six variables each
  // (intermediate dense 128x512, output FakeLayerNorm, output dense 512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense: 128x512 kernel with a 512-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_14/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  // output: FakeLayerNorm (128), output/bottleneck (512-wide), and output/dense (512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_14/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_15: 46 private flow.variable declarations ----
  // Each line declares one f32 tensor of the stated shape, keyed by its path-like symbol name.
  // NOTE(review): every initializer is the same opaque<"_", "0xDEADBEEF"> attribute —
  // presumably a stand-in for elided weight data; confirm how this dump was printed.
  // attention/output: FakeLayerNorm beta/gamma plus a 128x128 dense kernel and bias.
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this asymmetry is in the dump itself.
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma and a 512x128 dense kernel with a 128-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0 .. ffn_layer_2: three groups with the same six variables each
  // (intermediate dense 128x512, output FakeLayerNorm, output dense 512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense: 128x512 kernel with a 512-element bias.
  flow.variable @"__iree_flow_bert/encoder/layer_15/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  // output: FakeLayerNorm (128), output/bottleneck (512-wide), and output/dense (512x128).
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_15/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_16: 46 flow.variable symbols, all f32 tensors with sym_visibility = "private". ----
  // Each one carries an opaque<"_", "0xDEADBEEF"> attribute instead of literal tensor data.
  // NOTE(review): presumably a stand-in for elided constant weights in this dump; confirm before relying on it.
  // attention/output: FakeLayerNorm beta/gamma plus dense bias (128) and kernel (128x128).
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value bias/kernel pairs.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this is as dumped, not a typo in this file.
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma, a 128 dense bias and a 512x128 dense kernel.
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0, ffn_layer_1, ffn_layer_2: each has an intermediate dense (512 bias, 128x512 kernel)
  // followed by output FakeLayerNorm beta/gamma (128) and an output dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense (512 bias, 128x512 kernel), then the output group:
  // FakeLayerNorm beta/gamma (128), bottleneck FakeLayerNorm + dense (512 vectors, 128x512 kernel), dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_16/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_16/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_17: 46 flow.variable symbols, all f32 tensors with sym_visibility = "private". ----
  // Each one carries an opaque<"_", "0xDEADBEEF"> attribute instead of literal tensor data.
  // NOTE(review): presumably a stand-in for elided constant weights in this dump; confirm before relying on it.
  // attention/output: FakeLayerNorm beta/gamma plus dense bias (128) and kernel (128x128).
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value bias/kernel pairs.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this is as dumped, not a typo in this file.
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma, a 128 dense bias and a 512x128 dense kernel.
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0, ffn_layer_1, ffn_layer_2: each has an intermediate dense (512 bias, 128x512 kernel)
  // followed by output FakeLayerNorm beta/gamma (128) and an output dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense (512 bias, 128x512 kernel), then the output group:
  // FakeLayerNorm beta/gamma (128), bottleneck FakeLayerNorm + dense (512 vectors, 128x512 kernel), dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_17/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_17/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_18: 46 flow.variable symbols, all f32 tensors with sym_visibility = "private". ----
  // Each one carries an opaque<"_", "0xDEADBEEF"> attribute instead of literal tensor data.
  // NOTE(review): presumably a stand-in for elided constant weights in this dump; confirm before relying on it.
  // attention/output: FakeLayerNorm beta/gamma plus dense bias (128) and kernel (128x128).
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value bias/kernel pairs.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this is as dumped, not a typo in this file.
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma, a 128 dense bias and a 512x128 dense kernel.
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0, ffn_layer_1, ffn_layer_2: each has an intermediate dense (512 bias, 128x512 kernel)
  // followed by output FakeLayerNorm beta/gamma (128) and an output dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense (512 bias, 128x512 kernel), then the output group:
  // FakeLayerNorm beta/gamma (128), bottleneck FakeLayerNorm + dense (512 vectors, 128x512 kernel), dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_18/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_18/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_19: 46 flow.variable symbols, all f32 tensors with sym_visibility = "private". ----
  // Each one carries an opaque<"_", "0xDEADBEEF"> attribute instead of literal tensor data.
  // NOTE(review): presumably a stand-in for elided constant weights in this dump; confirm before relying on it.
  // attention/output: FakeLayerNorm beta/gamma plus dense bias (128) and kernel (128x128).
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value bias/kernel pairs.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this is as dumped, not a typo in this file.
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention and bottleneck/input: each has FakeLayerNorm beta/gamma, a 128 dense bias and a 512x128 dense kernel.
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ffn_layer_0, ffn_layer_1, ffn_layer_2: each has an intermediate dense (512 bias, 128x512 kernel)
  // followed by output FakeLayerNorm beta/gamma (128) and an output dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // intermediate/dense (512 bias, 128x512 kernel), then the output group:
  // FakeLayerNorm beta/gamma (128), bottleneck FakeLayerNorm + dense (512 vectors, 128x512 kernel), dense (128 bias, 512x128 kernel).
  flow.variable @"__iree_flow_bert/encoder/layer_19/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_19/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // ---- encoder/layer_2: flow.variable symbols, all f32 tensors with sym_visibility = "private". ----
  // Symbol names are listed in lexicographic string order, which is why layer_2 follows layer_19 here.
  // Each one carries an opaque<"_", "0xDEADBEEF"> attribute instead of literal tensor data.
  // NOTE(review): presumably a stand-in for elided constant weights in this dump; confirm before relying on it.
  // attention/output: FakeLayerNorm beta/gamma plus dense bias (128) and kernel (128x128).
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  // attention/self: key, query and value bias/kernel pairs.
  // The value kernel is 512x128 while the key and query kernels are 128x128; this is as dumped, not a typo in this file.
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/attention: FakeLayerNorm beta/gamma, a 128 dense bias and a 512x128 dense kernel.
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // bottleneck/input: FakeLayerNorm beta/gamma and dense bias, all 128 elements.
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_20/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_21/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_22/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_23/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_3/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_4/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_5/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  // encoder/layer_6 parameter declarations, from attention/output/dense/kernel
  // through output/dense/kernel. Every symbol here is private and f32.
  // NOTE(review): the opaque<"_", "0xDEADBEEF"> initializers look like the MLIR
  // printer's placeholder for elided large constants -- confirm before treating
  // this dump as something that can be re-parsed with real weights.

  // -- attention: output dense kernel plus key/query/value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128.
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- bottleneck/attention and bottleneck/input: 512x128 kernels with 128-element
  // bias and FakeLayerNorm beta/gamma vectors.
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- ffn_layer_0..2: each has a 128x512 intermediate kernel (512 bias) and a
  // 512x128 output kernel (128 bias) with 128-element FakeLayerNorm beta/gamma.
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- intermediate / output: the output/bottleneck tensors are the only
  // 512-element FakeLayerNorm parameters in this layer's list.
  flow.variable @"__iree_flow_bert/encoder/layer_6/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_6/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // encoder/layer_7 parameter declarations (46 private f32 symbols).
  // NOTE(review): the opaque<"_", "0xDEADBEEF"> initializers look like the MLIR
  // printer's placeholder for elided large constants -- confirm before treating
  // this dump as something that can be re-parsed with real weights.

  // -- attention: output FakeLayerNorm/dense plus key/query/value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128.
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- bottleneck/attention and bottleneck/input: 512x128 kernels with 128-element
  // bias and FakeLayerNorm beta/gamma vectors.
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- ffn_layer_0..2: each has a 128x512 intermediate kernel (512 bias) and a
  // 512x128 output kernel (128 bias) with 128-element FakeLayerNorm beta/gamma.
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- intermediate / output: the output/bottleneck tensors are the only
  // 512-element FakeLayerNorm parameters in this layer.
  flow.variable @"__iree_flow_bert/encoder/layer_7/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_7/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // encoder/layer_8 parameter declarations (46 private f32 symbols).
  // NOTE(review): the opaque<"_", "0xDEADBEEF"> initializers look like the MLIR
  // printer's placeholder for elided large constants -- confirm before treating
  // this dump as something that can be re-parsed with real weights.

  // -- attention: output FakeLayerNorm/dense plus key/query/value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128.
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- bottleneck/attention and bottleneck/input: 512x128 kernels with 128-element
  // bias and FakeLayerNorm beta/gamma vectors.
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- ffn_layer_0..2: each has a 128x512 intermediate kernel (512 bias) and a
  // 512x128 output kernel (128 bias) with 128-element FakeLayerNorm beta/gamma.
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- intermediate / output: the output/bottleneck tensors are the only
  // 512-element FakeLayerNorm parameters in this layer.
  flow.variable @"__iree_flow_bert/encoder/layer_8/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_8/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // encoder/layer_9 parameter declarations (46 private f32 symbols).
  // NOTE(review): the opaque<"_", "0xDEADBEEF"> initializers look like the MLIR
  // printer's placeholder for elided large constants -- confirm before treating
  // this dump as something that can be re-parsed with real weights.

  // -- attention: output FakeLayerNorm/dense plus key/query/value projections.
  // The value kernel is 512x128 while the key and query kernels are 128x128.
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/key/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/key/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/query/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/query/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/value/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/attention/self/value/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- bottleneck/attention and bottleneck/input: 512x128 kernels with 128-element
  // bias and FakeLayerNorm beta/gamma vectors.
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/attention/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/attention/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/attention/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/attention/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/input/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/input/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/input/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/bottleneck/input/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- ffn_layer_0..2: each has a 128x512 intermediate kernel (512 bias) and a
  // 512x128 output kernel (128 bias) with 128-element FakeLayerNorm beta/gamma.
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_0/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_1/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/ffn_layer_2/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // -- intermediate / output: the output/bottleneck tensors are the only
  // 512-element FakeLayerNorm parameters in this layer.
  flow.variable @"__iree_flow_bert/encoder/layer_9/intermediate/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/intermediate/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/bottleneck/FakeLayerNorm/beta" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/bottleneck/FakeLayerNorm/gamma" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/bottleneck/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/bottleneck/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/dense/bias" opaque<"_", "0xDEADBEEF"> : tensor<128xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_bert/encoder/layer_9/output/dense/kernel" opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32> attributes {sym_visibility = "private"}
  // cls/squad parameters. output_bias is a 2-element f32 vector whose values are
  // printed inline as a dense attribute; output_weights is a 2x512 f32 tensor
  // with an opaque<"_", "0xDEADBEEF"> initializer.
  // NOTE(review): the opaque initializer looks like the MLIR printer's placeholder
  // for an elided large constant -- confirm.
  flow.variable @"__iree_flow_cls/squad/output_bias" dense<[0.0287729427, 0.0297581609]> : tensor<2xf32> attributes {sym_visibility = "private"}
  flow.variable @"__iree_flow_cls/squad/output_weights" opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32> attributes {sym_visibility = "private"}
  // Public entry point, tagged iree.abi.stub. It takes three !hal.buffer_view
  // arguments, forwards them unchanged to the private @_serving_default, and
  // returns that callee's two !hal.buffer_view results in the same order.
  // The iree.reflection attribute carries the same iree.abi JSON string that is
  // attached to @_serving_default.
  builtin.func @serving_default(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi = "{\22a\22:[[\22named\22,\22segment_ids\22,[\22ndarray\22,\22i32\22,2,1,384]],[\22named\22,\22input_mask\22,[\22ndarray\22,\22i32\22,2,1,384]],[\22named\22,\22input_ids\22,[\22ndarray\22,\22i32\22,2,1,384]]],\22r\22:[[\22sdict\22,[\22end_logits\22,[\22ndarray\22,\22f32\22,2,1,384]],[\22start_logits\22,[\22ndarray\22,\22f32\22,2,1,384]]]],\22v\22:1}"}} {
    // Pure pass-through: no casts or other computation happen in this wrapper.
    %results:2 = call @_serving_default(%arg0, %arg1, %arg2) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view)
    return %results#0, %results#1 : !hal.buffer_view, !hal.buffer_view
  }
  // Private shim between the buffer_view-based signature and the tensor-typed
  // @serving_default__ireesm. It casts the three !hal.buffer_view arguments to
  // tensor<1x384xi32>, calls @serving_default__ireesm, and casts the two
  // tensor<1x384xf32> results back to !hal.buffer_view, preserving order.
  builtin.func private @_serving_default(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi = "{\22a\22:[[\22named\22,\22segment_ids\22,[\22ndarray\22,\22i32\22,2,1,384]],[\22named\22,\22input_mask\22,[\22ndarray\22,\22i32\22,2,1,384]],[\22named\22,\22input_ids\22,[\22ndarray\22,\22i32\22,2,1,384]]],\22r\22:[[\22sdict\22,[\22end_logits\22,[\22ndarray\22,\22f32\22,2,1,384]],[\22start_logits\22,[\22ndarray\22,\22f32\22,2,1,384]]]],\22v\22:1}"} {
    // Inbound: buffer views -> statically shaped i32 tensors, one per argument.
    %in0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x384xi32>
    %in1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<1x384xi32>
    %in2 = hal.tensor.cast %arg2 : !hal.buffer_view -> tensor<1x384xi32>
    // Call the tensor-typed function; operand order matches the argument order.
    %outs:2 = call @serving_default__ireesm(%in0, %in1, %in2) : (tensor<1x384xi32>, tensor<1x384xi32>, tensor<1x384xi32>) -> (tensor<1x384xf32>, tensor<1x384xf32>)
    // Outbound: f32 tensors -> buffer views, result #0 first, then result #1.
    %out0 = hal.tensor.cast %outs#0 : tensor<1x384xf32> -> !hal.buffer_view
    %out1 = hal.tensor.cast %outs#1 : tensor<1x384xf32> -> !hal.buffer_view
    return %out0, %out1 : !hal.buffer_view, !hal.buffer_view
  }
  builtin.func private @serving_default__ireesm(%arg0: tensor<1x384xi32>, %arg1: tensor<1x384xi32>, %arg2: tensor<1x384xi32>) -> (tensor<1x384xf32>, tensor<1x384xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "segment_ids:0,input_mask:0,input_ids:0", outputs = "end_logits:0,start_logits:0"}} {
    %cst = constant opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32>
    %cst_0 = constant dense<[0.0287729427, 0.0297581609]> : tensor<2xf32>
    %cst_1 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_2 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_3 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_4 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_5 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_6 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_7 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_8 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_9 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_10 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_11 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_12 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_13 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_14 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_15 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_16 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_17 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_18 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_19 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_20 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_21 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_22 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_23 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_24 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_25 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_26 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_27 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_28 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_29 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_30 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_31 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_32 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_33 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_34 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_35 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_36 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_37 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_38 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_39 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_40 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_41 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_42 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_43 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_44 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_45 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_46 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_47 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_48 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_49 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_50 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_51 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_52 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_53 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_54 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_55 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_56 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_57 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_58 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_59 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_60 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_61 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_62 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_63 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_64 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_65 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_66 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_67 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_68 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_69 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_70 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_71 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_72 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_73 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_74 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_75 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_76 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_77 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_78 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_79 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_80 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_81 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_82 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_83 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_84 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_85 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_86 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_87 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_88 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_89 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_90 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_91 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_92 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_93 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_94 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_95 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_96 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_97 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_98 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_99 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_100 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_101 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_102 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_103 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_104 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_105 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_106 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_107 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_108 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_109 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_110 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_111 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_112 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_113 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_114 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_115 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_116 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_117 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_118 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_119 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_120 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_121 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_122 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_123 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_124 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_125 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_126 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_127 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_128 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_129 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_130 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_131 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_132 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_133 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_134 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_135 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_136 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_137 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_138 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_139 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_140 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_141 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_142 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_143 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_144 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_145 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_146 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_147 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_148 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_149 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_150 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_151 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_152 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_153 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_154 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_155 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_156 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_157 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_158 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_159 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_160 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_161 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_162 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_163 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_164 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_165 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_166 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_167 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_168 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_169 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_170 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_171 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_172 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_173 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_174 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_175 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_176 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_177 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_178 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_179 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_180 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_181 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_182 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_183 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_184 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_185 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_186 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_187 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_188 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_189 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_190 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_191 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_192 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_193 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_194 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_195 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_196 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_197 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_198 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_199 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_200 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_201 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_202 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_203 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_204 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_205 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_206 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_207 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_208 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_209 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_210 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_211 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_212 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_213 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_214 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_215 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_216 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_217 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_218 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_219 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_220 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_221 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_222 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_223 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_224 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_225 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_226 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_227 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_228 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_229 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_230 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_231 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_232 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_233 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_234 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_235 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_236 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_237 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_238 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_239 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_240 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_241 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_242 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_243 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_244 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_245 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_246 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_247 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_248 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_249 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_250 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_251 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_252 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_253 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_254 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_255 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_256 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_257 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_258 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_259 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_260 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_261 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_262 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_263 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_264 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_265 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_266 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_267 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_268 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_269 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_270 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_271 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_272 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_273 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_274 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_275 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_276 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_277 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_278 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_279 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_280 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_281 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_282 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_283 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_284 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_285 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_286 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_287 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_288 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_289 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_290 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_291 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_292 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_293 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_294 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_295 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_296 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_297 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_298 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_299 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_300 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_301 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_302 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_303 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_304 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_305 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_306 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_307 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_308 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_309 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_310 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_311 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_312 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_313 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_314 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_315 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_316 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_317 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_318 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_319 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_320 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_321 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_322 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_323 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_324 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_325 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_326 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_327 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_328 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_329 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_330 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_331 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_332 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_333 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_334 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_335 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_336 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_337 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_338 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_339 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_340 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_341 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_342 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_343 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_344 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_345 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_346 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_347 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_348 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_349 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_350 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_351 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_352 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_353 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_354 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_355 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_356 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_357 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_358 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_359 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_360 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_361 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_362 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_363 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_364 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_365 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_366 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_367 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_368 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_369 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_370 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_371 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_372 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_373 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_374 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_375 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_376 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_377 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_378 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_379 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_380 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_381 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_382 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_383 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_384 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_385 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_386 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_387 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_388 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_389 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_390 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_391 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_392 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_393 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_394 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_395 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_396 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_397 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_398 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_399 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_400 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_401 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_402 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_403 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_404 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_405 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_406 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_407 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_408 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_409 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_410 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_411 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_412 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_413 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_414 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_415 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_416 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_417 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_418 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_419 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_420 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_421 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_422 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_423 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_424 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_425 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_426 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_427 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_428 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_429 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_430 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_431 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_432 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_433 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_434 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_435 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_436 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_437 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_438 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_439 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_440 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_441 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_442 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_443 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_444 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_445 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_446 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_447 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_448 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_449 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_450 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_451 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_452 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_453 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_454 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_455 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_456 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_457 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_458 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_459 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_460 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_461 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_462 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_463 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_464 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_465 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_466 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_467 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_468 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_469 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_470 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_471 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_472 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_473 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_474 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_475 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_476 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_477 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_478 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_479 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_480 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_481 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_482 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_483 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_484 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_485 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_486 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_487 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_488 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_489 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_490 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_491 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_492 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_493 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_494 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_495 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_496 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_497 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_498 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_499 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_500 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_501 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_502 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_503 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_504 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_505 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_506 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_507 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_508 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_509 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_510 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_511 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_512 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_513 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_514 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_515 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_516 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_517 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_518 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_519 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_520 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_521 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_522 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_523 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_524 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_525 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_526 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_527 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_528 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_529 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_530 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_531 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_532 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_533 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_534 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_535 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_536 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_537 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_538 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_539 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_540 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_541 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_542 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_543 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_544 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_545 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_546 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_547 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_548 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_549 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_550 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_551 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_552 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_553 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_554 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_555 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_556 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_557 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_558 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_559 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_560 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_561 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_562 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_563 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_564 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_565 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_566 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_567 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_568 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_569 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_570 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_571 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_572 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_573 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_574 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_575 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_576 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_577 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_578 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_579 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_580 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_581 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_582 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_583 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_584 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_585 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_586 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_587 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_588 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_589 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_590 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_591 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_592 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_593 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_594 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_595 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_596 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_597 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_598 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_599 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_600 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_601 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_602 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_603 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_604 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_605 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_606 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_607 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_608 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_609 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_610 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_611 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_612 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_613 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_614 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_615 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_616 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_617 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_618 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_619 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_620 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_621 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_622 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_623 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_624 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_625 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_626 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_627 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_628 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_629 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_630 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_631 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_632 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_633 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_634 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_635 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_636 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_637 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_638 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_639 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_640 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_641 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_642 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_643 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_644 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_645 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_646 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_647 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_648 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_649 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_650 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_651 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_652 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_653 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_654 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_655 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_656 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_657 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_658 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_659 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_660 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_661 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_662 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_663 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_664 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_665 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_666 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_667 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_668 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_669 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_670 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_671 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_672 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_673 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_674 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_675 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_676 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_677 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_678 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_679 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_680 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_681 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_682 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_683 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_684 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_685 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_686 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_687 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_688 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_689 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_690 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_691 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_692 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_693 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_694 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_695 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_696 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_697 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_698 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_699 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_700 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_701 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_702 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_703 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_704 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_705 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_706 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_707 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_708 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_709 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_710 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_711 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_712 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_713 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_714 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_715 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_716 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_717 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_718 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_719 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_720 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_721 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_722 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_723 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_724 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_725 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_726 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_727 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_728 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_729 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_730 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_731 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_732 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_733 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_734 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_735 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_736 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_737 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_738 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_739 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_740 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_741 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_742 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_743 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_744 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_745 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_746 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_747 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_748 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_749 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_750 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_751 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_752 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_753 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_754 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_755 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_756 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_757 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_758 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_759 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_760 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_761 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_762 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_763 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_764 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_765 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_766 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_767 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_768 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_769 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_770 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_771 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_772 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_773 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_774 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_775 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_776 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_777 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_778 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_779 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_780 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_781 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_782 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_783 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_784 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_785 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_786 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_787 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_788 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_789 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_790 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_791 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_792 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_793 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_794 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_795 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_796 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_797 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_798 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_799 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_800 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_801 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_802 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_803 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_804 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_805 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_806 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_807 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_808 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_809 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_810 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_811 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_812 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_813 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_814 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_815 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_816 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_817 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_818 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_819 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_820 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_821 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_822 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_823 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_824 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_825 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_826 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_827 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_828 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_829 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_830 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_831 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_832 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_833 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_834 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_835 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_836 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_837 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_838 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_839 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_840 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_841 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_842 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_843 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_844 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_845 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_846 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_847 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_848 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_849 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_850 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_851 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_852 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_853 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_854 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_855 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_856 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_857 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_858 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_859 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_860 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_861 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_862 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_863 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_864 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_865 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_866 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_867 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_868 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_869 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_870 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_871 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_872 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_873 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_874 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_875 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_876 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_877 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_878 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_879 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_880 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_881 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_882 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_883 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_884 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_885 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_886 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_887 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_888 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_889 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_890 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_891 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_892 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_893 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_894 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_895 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_896 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_897 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_898 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_899 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_900 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_901 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_902 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_903 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_904 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_905 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_906 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_907 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_908 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_909 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_910 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_911 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_912 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_913 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_914 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_915 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_916 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_917 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_918 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_919 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_920 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_921 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_922 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_923 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_924 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_925 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_926 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_927 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_928 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_929 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_930 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_931 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_932 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_933 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_934 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_935 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_936 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_937 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_938 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_939 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_940 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_941 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_942 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_943 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_944 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_945 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_946 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_947 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_948 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_949 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_950 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_951 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_952 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_953 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_954 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_955 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_956 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_957 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_958 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_959 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_960 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_961 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_962 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_963 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_964 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_965 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_966 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_967 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_968 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_969 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_970 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_971 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_972 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_973 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_974 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_975 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_976 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_977 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_978 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_979 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_980 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_981 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_982 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_983 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_984 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_985 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_986 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_987 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_988 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_989 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_990 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_991 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_992 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_993 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_994 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_995 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_996 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_997 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_998 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_999 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1000 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1001 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1002 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1003 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1004 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1005 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1006 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1007 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1008 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1009 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1010 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1011 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1012 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1013 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1014 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1015 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1016 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1017 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1018 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1019 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1020 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1021 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1022 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1023 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1024 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1025 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1026 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1027 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1028 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1029 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1030 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1031 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1032 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1033 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1034 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1035 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1036 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1037 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1038 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1039 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1040 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1041 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1042 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1043 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1044 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1045 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1046 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1047 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1048 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1049 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1050 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1051 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1052 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1053 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1054 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1055 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1056 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1057 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1058 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1059 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1060 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1061 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1062 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
    %cst_1063 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1064 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1065 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1066 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1067 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1068 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1069 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1070 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1071 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
    %cst_1072 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1073 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1074 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1075 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1076 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1077 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
    %cst_1078 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1079 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1080 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
    %cst_1081 = constant opaque<"_", "0xDEADBEEF"> : tensor<30522x128xf32>
    %cst_1082 = constant opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32>
    %0 = mhlo.constant opaque<"_", "0xDEADBEEF"> : tensor<1x384x512xf32>
    %cst_1083 = constant opaque<"_", "0xDEADBEEF"> : tensor<384x512xf32>
    %cst_1084 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1085 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %cst_1086 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
    %1 = mhlo.constant dense<0.000000e+00> : tensor<f32>
    %2 = mhlo.constant dense<0xFF800000> : tensor<f32>
    %3 = mhlo.constant dense<-1.000000e+04> : tensor<f32>
    %4 = mhlo.constant dense<0.176776692> : tensor<f32>
    %5 = mhlo.constant dense<1.000000e+04> : tensor<f32>
    %6 = mhlo.constant dense<1.000000e+00> : tensor<1x384x1xf32>
    %7 = linalg.tensor_expand_shape %arg2 [[0], [1, 2]] : tensor<1x384xi32> into tensor<1x384x1xi32>
    %8 = "mhlo.torch_index_select"(%cst_1081, %7) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<30522x128xf32>, tensor<1x384x1xi32>) -> tensor<1x384x1x128xf32>
    %9 = "mhlo.reshape"(%8) : (tensor<1x384x1x128xf32>) -> tensor<1x384x128xf32>
    %10 = "mhlo.slice"(%9) {limit_indices = dense<[1, 384, 128]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x384x128xf32>) -> tensor<1x383x128xf32>
    %11 = "mhlo.pad"(%10, %1) {edge_padding_high = dense<[0, 1, 0]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x383x128xf32>, tensor<f32>) -> tensor<1x384x128xf32>
    %12 = "mhlo.slice"(%9) {limit_indices = dense<[1, 383, 128]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x384x128xf32>) -> tensor<1x383x128xf32>
    %13 = "mhlo.pad"(%12, %1) {edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<[0, 1, 0]> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x383x128xf32>, tensor<f32>) -> tensor<1x384x128xf32>
    %14 = "mhlo.concatenate"(%11, %9, %13) {dimension = 2 : i64} : (tensor<1x384x128xf32>, tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x384xf32>
    %15 = "mhlo.reshape"(%14) : (tensor<1x384x384xf32>) -> tensor<384x384xf32>
    %16 = "mhlo.dot"(%15, %cst_1083) : (tensor<384x384xf32>, tensor<384x512xf32>) -> tensor<384x512xf32>
    %17 = chlo.broadcast_add %16, %cst_1084 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %18 = "mhlo.reshape"(%17) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %19 = "mhlo.convert"(%arg1) : (tensor<1x384xi32>) -> tensor<1x384xf32>
    %20 = "mhlo.reshape"(%19) : (tensor<1x384xf32>) -> tensor<1x1x384xf32>
    %21 = chlo.broadcast_multiply %20, %6 : (tensor<1x1x384xf32>, tensor<1x384x1xf32>) -> tensor<1x384x384xf32>
    %22 = linalg.tensor_expand_shape %21 [[0], [1, 2], [3]] : tensor<1x384x384xf32> into tensor<1x1x384x384xf32>
    %23 = chlo.broadcast_multiply %22, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x384x384xf32>, tensor<f32>) -> tensor<1x1x384x384xf32>
    %24 = chlo.broadcast_add %23, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x384x384xf32>, tensor<f32>) -> tensor<1x1x384x384xf32>
    %25 = "mhlo.torch_index_select"(%cst_1082, %arg0) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<2x512xf32>, tensor<1x384xi32>) -> tensor<1x384x512xf32>
    %26 = chlo.broadcast_add %18, %25 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %27 = chlo.broadcast_add %26, %0 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %28 = chlo.broadcast_multiply %27, %cst_1085 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %29 = chlo.broadcast_add %28, %cst_1086 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %30 = "mhlo.reshape"(%29) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %31 = "mhlo.dot"(%30, %cst_1071) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %32 = chlo.broadcast_add %31, %cst_1072 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %33 = "mhlo.reshape"(%32) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %34 = "mhlo.transpose"(%33) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %35 = "mhlo.dot"(%30, %cst_1067) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %36 = "mhlo.reshape"(%35) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %37 = "mhlo.broadcast_in_dim"(%cst_1068) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %38 = mhlo.add %36, %37 : tensor<1x384x128xf32>
    %39 = chlo.broadcast_multiply %38, %cst_1069 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %40 = chlo.broadcast_add %39, %cst_1070 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %41 = "mhlo.reshape"(%40) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %42 = "mhlo.dot"(%41, %cst_1075) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %43 = chlo.broadcast_add %42, %cst_1076 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %44 = "mhlo.reshape"(%43) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %45 = "mhlo.transpose"(%44) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %46 = "mhlo.dot"(%41, %cst_1073) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %47 = chlo.broadcast_add %46, %cst_1074 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %48 = "mhlo.reshape"(%47) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %49 = "mhlo.transpose"(%48) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %50 = "mhlo.dot_general"(%49, %45) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %51 = chlo.broadcast_multiply %50, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %52 = chlo.broadcast_add %51, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %52 along dimension 3: (1x4x384x384) -> (1x4x384).
    // The init operand %2 is dense<0xFF800000>, the f32 bit pattern for -inf,
    // i.e. the identity element for maximum.
    // The result is expanded to 1x4x384x1 (%54) and subtracted from %52 before
    // the exponential (%55, %56).
    %53 = "mhlo.reduce"(%52, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reducer body: elementwise maximum of the two scalar block arguments.
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %54 = linalg.tensor_expand_shape %53 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %55 = chlo.broadcast_subtract %52, %54 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %56 = "mhlo.exponential"(%55) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of the exponentials %56 along dimension 3:
    // (1x4x384x384) -> (1x4x384). The init operand %1 is dense<0.0>, the
    // identity element for addition.
    // The result is expanded to 1x4x384x1 (%58) and used as the divisor of %56
    // in %59, so each length-384 row along dimension 3 is normalized by its sum.
    %57 = "mhlo.reduce"(%56, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reducer body: sum of the two scalar block arguments.
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %58 = linalg.tensor_expand_shape %57 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %59 = chlo.broadcast_divide %56, %58 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %60 = "mhlo.dot_general"(%59, %34) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %61 = "mhlo.transpose"(%60) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %62 = "mhlo.reshape"(%61) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %63 = "mhlo.dot"(%62, %cst_1077) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %64 = chlo.broadcast_add %63, %cst_1078 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %65 = "mhlo.reshape"(%64) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %66 = "mhlo.dot"(%30, %cst_1064) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %67 = chlo.broadcast_add %66, %cst_1065 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %68 = "mhlo.reshape"(%67) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %69 = chlo.broadcast_multiply %68, %cst_1066 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %70 = chlo.broadcast_add %69, %cst_1078 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %71 = chlo.broadcast_add %65, %70 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %72 = chlo.broadcast_multiply %71, %cst_1079 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %73 = chlo.broadcast_add %72, %cst_1080 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %74 = "mhlo.reshape"(%73) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %75 = "mhlo.dot"(%74, %cst_1062) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %76 = chlo.broadcast_add %75, %cst_1063 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %77 = "mhlo.reshape"(%76) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %78 = chlo.broadcast_maximum %77, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %79 = "mhlo.reshape"(%78) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %80 = "mhlo.dot"(%79, %cst_1058) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %81 = chlo.broadcast_add %80, %cst_1059 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %82 = "mhlo.reshape"(%81) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %83 = chlo.broadcast_add %82, %73 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %84 = chlo.broadcast_multiply %83, %cst_1060 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %85 = chlo.broadcast_add %84, %cst_1061 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %86 = "mhlo.reshape"(%85) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %87 = "mhlo.dot"(%86, %cst_1056) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %88 = chlo.broadcast_add %87, %cst_1057 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %89 = "mhlo.reshape"(%88) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %90 = chlo.broadcast_maximum %89, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %91 = "mhlo.reshape"(%90) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %92 = "mhlo.dot"(%91, %cst_1052) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %93 = chlo.broadcast_add %92, %cst_1053 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %94 = "mhlo.reshape"(%93) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %95 = chlo.broadcast_add %94, %85 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %96 = chlo.broadcast_multiply %95, %cst_1054 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %97 = chlo.broadcast_add %96, %cst_1055 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %98 = "mhlo.reshape"(%97) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %99 = "mhlo.dot"(%98, %cst_1050) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %100 = chlo.broadcast_add %99, %cst_1051 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %101 = "mhlo.reshape"(%100) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %102 = chlo.broadcast_maximum %101, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %103 = "mhlo.reshape"(%102) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %104 = "mhlo.dot"(%103, %cst_1046) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %105 = chlo.broadcast_add %104, %cst_1047 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %106 = "mhlo.reshape"(%105) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %107 = chlo.broadcast_add %106, %97 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %108 = chlo.broadcast_multiply %107, %cst_1048 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %109 = chlo.broadcast_add %108, %cst_1049 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %110 = "mhlo.reshape"(%109) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %111 = "mhlo.dot"(%110, %cst_1044) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %112 = chlo.broadcast_add %111, %cst_1045 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %113 = "mhlo.reshape"(%112) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %114 = chlo.broadcast_maximum %113, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %115 = "mhlo.reshape"(%114) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %116 = "mhlo.dot"(%115, %cst_1036) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %117 = chlo.broadcast_add %116, %cst_1037 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %118 = "mhlo.reshape"(%117) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %119 = chlo.broadcast_add %118, %109 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %120 = chlo.broadcast_multiply %119, %cst_1042 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %121 = chlo.broadcast_add %120, %cst_1043 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %122 = "mhlo.reshape"(%121) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %123 = "mhlo.dot"(%122, %cst_1038) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %124 = chlo.broadcast_add %123, %cst_1039 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %125 = "mhlo.reshape"(%124) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %126 = chlo.broadcast_add %125, %29 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %127 = chlo.broadcast_multiply %126, %cst_1040 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %128 = chlo.broadcast_add %127, %cst_1041 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %129 = "mhlo.reshape"(%128) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %130 = "mhlo.dot"(%129, %cst_1026) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %131 = chlo.broadcast_add %130, %cst_1027 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %132 = "mhlo.reshape"(%131) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %133 = "mhlo.transpose"(%132) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %134 = "mhlo.dot"(%129, %cst_1022) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %135 = "mhlo.reshape"(%134) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %136 = "mhlo.broadcast_in_dim"(%cst_1023) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %137 = mhlo.add %135, %136 : tensor<1x384x128xf32>
    %138 = chlo.broadcast_multiply %137, %cst_1024 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %139 = chlo.broadcast_add %138, %cst_1025 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %140 = "mhlo.reshape"(%139) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %141 = "mhlo.dot"(%140, %cst_1030) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %142 = chlo.broadcast_add %141, %cst_1031 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %143 = "mhlo.reshape"(%142) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %144 = "mhlo.transpose"(%143) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %145 = "mhlo.dot"(%140, %cst_1028) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %146 = chlo.broadcast_add %145, %cst_1029 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %147 = "mhlo.reshape"(%146) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %148 = "mhlo.transpose"(%147) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %149 = "mhlo.dot_general"(%148, %144) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %150 = chlo.broadcast_multiply %149, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %151 = chlo.broadcast_add %150, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %151 along dimension 3: (1x4x384x384) -> (1x4x384).
    // The init operand %2 is dense<0xFF800000>, the f32 bit pattern for -inf,
    // i.e. the identity element for maximum.
    // The result is expanded to 1x4x384x1 (%153) and subtracted from %151
    // before the exponential (%154, %155).
    %152 = "mhlo.reduce"(%151, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reducer body: elementwise maximum of the two scalar block arguments.
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %153 = linalg.tensor_expand_shape %152 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %154 = chlo.broadcast_subtract %151, %153 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %155 = "mhlo.exponential"(%154) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of the exponentials %155 along dimension 3:
    // (1x4x384x384) -> (1x4x384). The init operand %1 is dense<0.0>, the
    // identity element for addition.
    // The result is expanded to 1x4x384x1 (%157) and used as the divisor of
    // %155 in %158, so each length-384 row along dimension 3 is normalized by
    // its sum.
    %156 = "mhlo.reduce"(%155, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reducer body: sum of the two scalar block arguments.
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %157 = linalg.tensor_expand_shape %156 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %158 = chlo.broadcast_divide %155, %157 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %159 = "mhlo.dot_general"(%158, %133) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %160 = "mhlo.transpose"(%159) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %161 = "mhlo.reshape"(%160) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %162 = "mhlo.dot"(%161, %cst_1032) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %163 = chlo.broadcast_add %162, %cst_1033 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %164 = "mhlo.reshape"(%163) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %165 = "mhlo.dot"(%129, %cst_1019) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %166 = chlo.broadcast_add %165, %cst_1020 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %167 = "mhlo.reshape"(%166) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %168 = chlo.broadcast_multiply %167, %cst_1021 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %169 = chlo.broadcast_add %168, %cst_1033 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %170 = chlo.broadcast_add %164, %169 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %171 = chlo.broadcast_multiply %170, %cst_1034 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %172 = chlo.broadcast_add %171, %cst_1035 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %173 = "mhlo.reshape"(%172) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %174 = "mhlo.dot"(%173, %cst_1017) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %175 = chlo.broadcast_add %174, %cst_1018 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %176 = "mhlo.reshape"(%175) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %177 = chlo.broadcast_maximum %176, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %178 = "mhlo.reshape"(%177) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %179 = "mhlo.dot"(%178, %cst_1013) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %180 = chlo.broadcast_add %179, %cst_1014 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %181 = "mhlo.reshape"(%180) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %182 = chlo.broadcast_add %181, %172 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %183 = chlo.broadcast_multiply %182, %cst_1015 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %184 = chlo.broadcast_add %183, %cst_1016 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %185 = "mhlo.reshape"(%184) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %186 = "mhlo.dot"(%185, %cst_1011) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %187 = chlo.broadcast_add %186, %cst_1012 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %188 = "mhlo.reshape"(%187) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %189 = chlo.broadcast_maximum %188, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %190 = "mhlo.reshape"(%189) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %191 = "mhlo.dot"(%190, %cst_1007) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %192 = chlo.broadcast_add %191, %cst_1008 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %193 = "mhlo.reshape"(%192) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %194 = chlo.broadcast_add %193, %184 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %195 = chlo.broadcast_multiply %194, %cst_1009 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %196 = chlo.broadcast_add %195, %cst_1010 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %197 = "mhlo.reshape"(%196) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %198 = "mhlo.dot"(%197, %cst_1005) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %199 = chlo.broadcast_add %198, %cst_1006 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %200 = "mhlo.reshape"(%199) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %201 = chlo.broadcast_maximum %200, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %202 = "mhlo.reshape"(%201) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %203 = "mhlo.dot"(%202, %cst_1001) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %204 = chlo.broadcast_add %203, %cst_1002 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %205 = "mhlo.reshape"(%204) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %206 = chlo.broadcast_add %205, %196 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %207 = chlo.broadcast_multiply %206, %cst_1003 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %208 = chlo.broadcast_add %207, %cst_1004 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %209 = "mhlo.reshape"(%208) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %210 = "mhlo.dot"(%209, %cst_999) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %211 = chlo.broadcast_add %210, %cst_1000 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %212 = "mhlo.reshape"(%211) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %213 = chlo.broadcast_maximum %212, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %214 = "mhlo.reshape"(%213) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %215 = "mhlo.dot"(%214, %cst_991) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %216 = chlo.broadcast_add %215, %cst_992 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %217 = "mhlo.reshape"(%216) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %218 = chlo.broadcast_add %217, %208 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %219 = chlo.broadcast_multiply %218, %cst_997 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %220 = chlo.broadcast_add %219, %cst_998 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %221 = "mhlo.reshape"(%220) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %222 = "mhlo.dot"(%221, %cst_993) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %223 = chlo.broadcast_add %222, %cst_994 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %224 = "mhlo.reshape"(%223) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %225 = chlo.broadcast_add %224, %128 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %226 = chlo.broadcast_multiply %225, %cst_995 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %227 = chlo.broadcast_add %226, %cst_996 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %228 = "mhlo.reshape"(%227) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %229 = "mhlo.dot"(%228, %cst_531) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %230 = chlo.broadcast_add %229, %cst_532 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %231 = "mhlo.reshape"(%230) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %232 = "mhlo.transpose"(%231) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %233 = "mhlo.dot"(%228, %cst_527) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %234 = "mhlo.reshape"(%233) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %235 = "mhlo.broadcast_in_dim"(%cst_528) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %236 = mhlo.add %234, %235 : tensor<1x384x128xf32>
    %237 = chlo.broadcast_multiply %236, %cst_529 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %238 = chlo.broadcast_add %237, %cst_530 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %239 = "mhlo.reshape"(%238) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %240 = "mhlo.dot"(%239, %cst_535) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %241 = chlo.broadcast_add %240, %cst_536 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %242 = "mhlo.reshape"(%241) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %243 = "mhlo.transpose"(%242) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %244 = "mhlo.dot"(%239, %cst_533) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %245 = chlo.broadcast_add %244, %cst_534 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %246 = "mhlo.reshape"(%245) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %247 = "mhlo.transpose"(%246) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %248 = "mhlo.dot_general"(%247, %243) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %249 = chlo.broadcast_multiply %248, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %250 = chlo.broadcast_add %249, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %251 = "mhlo.reduce"(%250, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %252 = linalg.tensor_expand_shape %251 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %253 = chlo.broadcast_subtract %250, %252 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %254 = "mhlo.exponential"(%253) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %255 = "mhlo.reduce"(%254, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %256 = linalg.tensor_expand_shape %255 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %257 = chlo.broadcast_divide %254, %256 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %258 = "mhlo.dot_general"(%257, %232) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %259 = "mhlo.transpose"(%258) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %260 = "mhlo.reshape"(%259) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %261 = "mhlo.dot"(%260, %cst_537) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %262 = chlo.broadcast_add %261, %cst_538 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %263 = "mhlo.reshape"(%262) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %264 = "mhlo.dot"(%228, %cst_524) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %265 = chlo.broadcast_add %264, %cst_525 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %266 = "mhlo.reshape"(%265) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %267 = chlo.broadcast_multiply %266, %cst_526 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %268 = chlo.broadcast_add %267, %cst_538 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %269 = chlo.broadcast_add %263, %268 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %270 = chlo.broadcast_multiply %269, %cst_539 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %271 = chlo.broadcast_add %270, %cst_540 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %272 = "mhlo.reshape"(%271) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %273 = "mhlo.dot"(%272, %cst_522) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %274 = chlo.broadcast_add %273, %cst_523 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %275 = "mhlo.reshape"(%274) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %276 = chlo.broadcast_maximum %275, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %277 = "mhlo.reshape"(%276) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %278 = "mhlo.dot"(%277, %cst_518) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %279 = chlo.broadcast_add %278, %cst_519 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %280 = "mhlo.reshape"(%279) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %281 = chlo.broadcast_add %280, %271 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %282 = chlo.broadcast_multiply %281, %cst_520 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %283 = chlo.broadcast_add %282, %cst_521 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %284 = "mhlo.reshape"(%283) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %285 = "mhlo.dot"(%284, %cst_516) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %286 = chlo.broadcast_add %285, %cst_517 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %287 = "mhlo.reshape"(%286) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %288 = chlo.broadcast_maximum %287, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %289 = "mhlo.reshape"(%288) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %290 = "mhlo.dot"(%289, %cst_512) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %291 = chlo.broadcast_add %290, %cst_513 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %292 = "mhlo.reshape"(%291) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %293 = chlo.broadcast_add %292, %283 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %294 = chlo.broadcast_multiply %293, %cst_514 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %295 = chlo.broadcast_add %294, %cst_515 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %296 = "mhlo.reshape"(%295) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %297 = "mhlo.dot"(%296, %cst_510) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %298 = chlo.broadcast_add %297, %cst_511 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %299 = "mhlo.reshape"(%298) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %300 = chlo.broadcast_maximum %299, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %301 = "mhlo.reshape"(%300) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %302 = "mhlo.dot"(%301, %cst_506) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %303 = chlo.broadcast_add %302, %cst_507 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %304 = "mhlo.reshape"(%303) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %305 = chlo.broadcast_add %304, %295 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %306 = chlo.broadcast_multiply %305, %cst_508 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %307 = chlo.broadcast_add %306, %cst_509 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %308 = "mhlo.reshape"(%307) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %309 = "mhlo.dot"(%308, %cst_504) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %310 = chlo.broadcast_add %309, %cst_505 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %311 = "mhlo.reshape"(%310) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %312 = chlo.broadcast_maximum %311, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %313 = "mhlo.reshape"(%312) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %314 = "mhlo.dot"(%313, %cst_496) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %315 = chlo.broadcast_add %314, %cst_497 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %316 = "mhlo.reshape"(%315) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %317 = chlo.broadcast_add %316, %307 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %318 = chlo.broadcast_multiply %317, %cst_502 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %319 = chlo.broadcast_add %318, %cst_503 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %320 = "mhlo.reshape"(%319) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %321 = "mhlo.dot"(%320, %cst_498) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %322 = chlo.broadcast_add %321, %cst_499 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %323 = "mhlo.reshape"(%322) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %324 = chlo.broadcast_add %323, %227 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %325 = chlo.broadcast_multiply %324, %cst_500 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %326 = chlo.broadcast_add %325, %cst_501 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %327 = "mhlo.reshape"(%326) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %328 = "mhlo.dot"(%327, %cst_306) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %329 = chlo.broadcast_add %328, %cst_307 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %330 = "mhlo.reshape"(%329) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %331 = "mhlo.transpose"(%330) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %332 = "mhlo.dot"(%327, %cst_302) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %333 = "mhlo.reshape"(%332) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %334 = "mhlo.broadcast_in_dim"(%cst_303) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %335 = mhlo.add %333, %334 : tensor<1x384x128xf32>
    %336 = chlo.broadcast_multiply %335, %cst_304 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %337 = chlo.broadcast_add %336, %cst_305 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %338 = "mhlo.reshape"(%337) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %339 = "mhlo.dot"(%338, %cst_310) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %340 = chlo.broadcast_add %339, %cst_311 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %341 = "mhlo.reshape"(%340) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %342 = "mhlo.transpose"(%341) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %343 = "mhlo.dot"(%338, %cst_308) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %344 = chlo.broadcast_add %343, %cst_309 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %345 = "mhlo.reshape"(%344) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %346 = "mhlo.transpose"(%345) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %347 = "mhlo.dot_general"(%346, %342) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %348 = chlo.broadcast_multiply %347, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %349 = chlo.broadcast_add %348, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %350 = "mhlo.reduce"(%349, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %351 = linalg.tensor_expand_shape %350 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %352 = chlo.broadcast_subtract %349, %351 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %353 = "mhlo.exponential"(%352) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %354 = "mhlo.reduce"(%353, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %355 = linalg.tensor_expand_shape %354 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %356 = chlo.broadcast_divide %353, %355 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %357 = "mhlo.dot_general"(%356, %331) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %358 = "mhlo.transpose"(%357) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %359 = "mhlo.reshape"(%358) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %360 = "mhlo.dot"(%359, %cst_312) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %361 = chlo.broadcast_add %360, %cst_313 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %362 = "mhlo.reshape"(%361) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %363 = "mhlo.dot"(%327, %cst_299) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %364 = chlo.broadcast_add %363, %cst_300 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %365 = "mhlo.reshape"(%364) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %366 = chlo.broadcast_multiply %365, %cst_301 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %367 = chlo.broadcast_add %366, %cst_313 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %368 = chlo.broadcast_add %362, %367 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %369 = chlo.broadcast_multiply %368, %cst_314 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %370 = chlo.broadcast_add %369, %cst_315 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %371 = "mhlo.reshape"(%370) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %372 = "mhlo.dot"(%371, %cst_297) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %373 = chlo.broadcast_add %372, %cst_298 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %374 = "mhlo.reshape"(%373) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %375 = chlo.broadcast_maximum %374, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %376 = "mhlo.reshape"(%375) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %377 = "mhlo.dot"(%376, %cst_293) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %378 = chlo.broadcast_add %377, %cst_294 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %379 = "mhlo.reshape"(%378) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %380 = chlo.broadcast_add %379, %370 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %381 = chlo.broadcast_multiply %380, %cst_295 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %382 = chlo.broadcast_add %381, %cst_296 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %383 = "mhlo.reshape"(%382) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %384 = "mhlo.dot"(%383, %cst_291) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %385 = chlo.broadcast_add %384, %cst_292 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %386 = "mhlo.reshape"(%385) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %387 = chlo.broadcast_maximum %386, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %388 = "mhlo.reshape"(%387) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %389 = "mhlo.dot"(%388, %cst_287) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %390 = chlo.broadcast_add %389, %cst_288 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %391 = "mhlo.reshape"(%390) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %392 = chlo.broadcast_add %391, %382 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %393 = chlo.broadcast_multiply %392, %cst_289 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %394 = chlo.broadcast_add %393, %cst_290 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %395 = "mhlo.reshape"(%394) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %396 = "mhlo.dot"(%395, %cst_285) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %397 = chlo.broadcast_add %396, %cst_286 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %398 = "mhlo.reshape"(%397) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %399 = chlo.broadcast_maximum %398, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %400 = "mhlo.reshape"(%399) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %401 = "mhlo.dot"(%400, %cst_281) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %402 = chlo.broadcast_add %401, %cst_282 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %403 = "mhlo.reshape"(%402) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %404 = chlo.broadcast_add %403, %394 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %405 = chlo.broadcast_multiply %404, %cst_283 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %406 = chlo.broadcast_add %405, %cst_284 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %407 = "mhlo.reshape"(%406) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %408 = "mhlo.dot"(%407, %cst_279) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %409 = chlo.broadcast_add %408, %cst_280 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %410 = "mhlo.reshape"(%409) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %411 = chlo.broadcast_maximum %410, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %412 = "mhlo.reshape"(%411) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %413 = "mhlo.dot"(%412, %cst_271) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %414 = chlo.broadcast_add %413, %cst_272 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %415 = "mhlo.reshape"(%414) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %416 = chlo.broadcast_add %415, %406 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %417 = chlo.broadcast_multiply %416, %cst_277 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %418 = chlo.broadcast_add %417, %cst_278 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %419 = "mhlo.reshape"(%418) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %420 = "mhlo.dot"(%419, %cst_273) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %421 = chlo.broadcast_add %420, %cst_274 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %422 = "mhlo.reshape"(%421) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %423 = chlo.broadcast_add %422, %326 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %424 = chlo.broadcast_multiply %423, %cst_275 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %425 = chlo.broadcast_add %424, %cst_276 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %426 = "mhlo.reshape"(%425) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %427 = "mhlo.dot"(%426, %cst_261) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %428 = chlo.broadcast_add %427, %cst_262 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %429 = "mhlo.reshape"(%428) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %430 = "mhlo.transpose"(%429) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %431 = "mhlo.dot"(%426, %cst_257) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %432 = "mhlo.reshape"(%431) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %433 = "mhlo.broadcast_in_dim"(%cst_258) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %434 = mhlo.add %432, %433 : tensor<1x384x128xf32>
    %435 = chlo.broadcast_multiply %434, %cst_259 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %436 = chlo.broadcast_add %435, %cst_260 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %437 = "mhlo.reshape"(%436) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %438 = "mhlo.dot"(%437, %cst_265) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %439 = chlo.broadcast_add %438, %cst_266 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %440 = "mhlo.reshape"(%439) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %441 = "mhlo.transpose"(%440) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %442 = "mhlo.dot"(%437, %cst_263) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %443 = chlo.broadcast_add %442, %cst_264 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %444 = "mhlo.reshape"(%443) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %445 = "mhlo.transpose"(%444) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %446 = "mhlo.dot_general"(%445, %441) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %447 = chlo.broadcast_multiply %446, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %448 = chlo.broadcast_add %447, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %448 along dimension 3 (the last axis), with %2 as the
    // init value: tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%450) and subtracted from %448
    // before the exponential (%451, %452), i.e. the max-shift step of a softmax
    // over the last axis.
    // NOTE(review): %2 is defined outside this excerpt; presumably the identity
    // for max (-inf or lowest f32). Confirm at its definition.
    %449 = "mhlo.reduce"(%448, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.maximum.
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %450 = linalg.tensor_expand_shape %449 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %451 = chlo.broadcast_subtract %448, %450 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %452 = "mhlo.exponential"(%451) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of %452 (the exponentials) along dimension 3 (the last
    // axis), with %1 as the init value:
    // tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%454) and used as the divisor of
    // %452 (%455), i.e. the normalization step of a softmax over the last axis.
    // NOTE(review): %1 is defined outside this excerpt; presumably 0.0 (it is
    // also the scalar operand of the chlo.broadcast_maximum ops). Confirm.
    %453 = "mhlo.reduce"(%452, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.add.
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %454 = linalg.tensor_expand_shape %453 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %455 = chlo.broadcast_divide %452, %454 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %456 = "mhlo.dot_general"(%455, %430) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %457 = "mhlo.transpose"(%456) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %458 = "mhlo.reshape"(%457) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %459 = "mhlo.dot"(%458, %cst_267) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %460 = chlo.broadcast_add %459, %cst_268 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %461 = "mhlo.reshape"(%460) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %462 = "mhlo.dot"(%426, %cst_254) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %463 = chlo.broadcast_add %462, %cst_255 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %464 = "mhlo.reshape"(%463) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %465 = chlo.broadcast_multiply %464, %cst_256 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %466 = chlo.broadcast_add %465, %cst_268 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %467 = chlo.broadcast_add %461, %466 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %468 = chlo.broadcast_multiply %467, %cst_269 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %469 = chlo.broadcast_add %468, %cst_270 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %470 = "mhlo.reshape"(%469) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %471 = "mhlo.dot"(%470, %cst_252) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %472 = chlo.broadcast_add %471, %cst_253 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %473 = "mhlo.reshape"(%472) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %474 = chlo.broadcast_maximum %473, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %475 = "mhlo.reshape"(%474) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %476 = "mhlo.dot"(%475, %cst_248) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %477 = chlo.broadcast_add %476, %cst_249 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %478 = "mhlo.reshape"(%477) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %479 = chlo.broadcast_add %478, %469 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %480 = chlo.broadcast_multiply %479, %cst_250 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %481 = chlo.broadcast_add %480, %cst_251 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %482 = "mhlo.reshape"(%481) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %483 = "mhlo.dot"(%482, %cst_246) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %484 = chlo.broadcast_add %483, %cst_247 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %485 = "mhlo.reshape"(%484) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %486 = chlo.broadcast_maximum %485, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %487 = "mhlo.reshape"(%486) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %488 = "mhlo.dot"(%487, %cst_242) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %489 = chlo.broadcast_add %488, %cst_243 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %490 = "mhlo.reshape"(%489) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %491 = chlo.broadcast_add %490, %481 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %492 = chlo.broadcast_multiply %491, %cst_244 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %493 = chlo.broadcast_add %492, %cst_245 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %494 = "mhlo.reshape"(%493) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %495 = "mhlo.dot"(%494, %cst_240) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %496 = chlo.broadcast_add %495, %cst_241 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %497 = "mhlo.reshape"(%496) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %498 = chlo.broadcast_maximum %497, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %499 = "mhlo.reshape"(%498) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %500 = "mhlo.dot"(%499, %cst_236) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %501 = chlo.broadcast_add %500, %cst_237 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %502 = "mhlo.reshape"(%501) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %503 = chlo.broadcast_add %502, %493 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %504 = chlo.broadcast_multiply %503, %cst_238 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %505 = chlo.broadcast_add %504, %cst_239 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %506 = "mhlo.reshape"(%505) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %507 = "mhlo.dot"(%506, %cst_234) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %508 = chlo.broadcast_add %507, %cst_235 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %509 = "mhlo.reshape"(%508) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %510 = chlo.broadcast_maximum %509, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %511 = "mhlo.reshape"(%510) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %512 = "mhlo.dot"(%511, %cst_226) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %513 = chlo.broadcast_add %512, %cst_227 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %514 = "mhlo.reshape"(%513) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %515 = chlo.broadcast_add %514, %505 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %516 = chlo.broadcast_multiply %515, %cst_232 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %517 = chlo.broadcast_add %516, %cst_233 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %518 = "mhlo.reshape"(%517) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %519 = "mhlo.dot"(%518, %cst_228) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %520 = chlo.broadcast_add %519, %cst_229 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %521 = "mhlo.reshape"(%520) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %522 = chlo.broadcast_add %521, %425 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %523 = chlo.broadcast_multiply %522, %cst_230 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %524 = chlo.broadcast_add %523, %cst_231 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %525 = "mhlo.reshape"(%524) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %526 = "mhlo.dot"(%525, %cst_216) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %527 = chlo.broadcast_add %526, %cst_217 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %528 = "mhlo.reshape"(%527) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %529 = "mhlo.transpose"(%528) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %530 = "mhlo.dot"(%525, %cst_212) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %531 = "mhlo.reshape"(%530) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %532 = "mhlo.broadcast_in_dim"(%cst_213) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %533 = mhlo.add %531, %532 : tensor<1x384x128xf32>
    %534 = chlo.broadcast_multiply %533, %cst_214 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %535 = chlo.broadcast_add %534, %cst_215 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %536 = "mhlo.reshape"(%535) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %537 = "mhlo.dot"(%536, %cst_220) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %538 = chlo.broadcast_add %537, %cst_221 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %539 = "mhlo.reshape"(%538) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %540 = "mhlo.transpose"(%539) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %541 = "mhlo.dot"(%536, %cst_218) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %542 = chlo.broadcast_add %541, %cst_219 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %543 = "mhlo.reshape"(%542) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %544 = "mhlo.transpose"(%543) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %545 = "mhlo.dot_general"(%544, %540) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %546 = chlo.broadcast_multiply %545, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %547 = chlo.broadcast_add %546, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %547 along dimension 3 (the last axis), with %2 as the
    // init value: tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%549) and subtracted from %547
    // before the exponential (%550, %551), i.e. the max-shift step of a softmax
    // over the last axis.
    // NOTE(review): %2 is defined outside this excerpt; presumably the identity
    // for max (-inf or lowest f32). Confirm at its definition.
    %548 = "mhlo.reduce"(%547, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.maximum.
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %549 = linalg.tensor_expand_shape %548 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %550 = chlo.broadcast_subtract %547, %549 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %551 = "mhlo.exponential"(%550) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of %551 (the exponentials) along dimension 3 (the last
    // axis), with %1 as the init value:
    // tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%553) and used as the divisor of
    // %551 (%554), i.e. the normalization step of a softmax over the last axis.
    // NOTE(review): %1 is defined outside this excerpt; presumably 0.0 (it is
    // also the scalar operand of the chlo.broadcast_maximum ops). Confirm.
    %552 = "mhlo.reduce"(%551, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.add.
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %553 = linalg.tensor_expand_shape %552 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %554 = chlo.broadcast_divide %551, %553 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %555 = "mhlo.dot_general"(%554, %529) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %556 = "mhlo.transpose"(%555) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %557 = "mhlo.reshape"(%556) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %558 = "mhlo.dot"(%557, %cst_222) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %559 = chlo.broadcast_add %558, %cst_223 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %560 = "mhlo.reshape"(%559) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %561 = "mhlo.dot"(%525, %cst_209) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %562 = chlo.broadcast_add %561, %cst_210 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %563 = "mhlo.reshape"(%562) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %564 = chlo.broadcast_multiply %563, %cst_211 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %565 = chlo.broadcast_add %564, %cst_223 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %566 = chlo.broadcast_add %560, %565 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %567 = chlo.broadcast_multiply %566, %cst_224 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %568 = chlo.broadcast_add %567, %cst_225 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %569 = "mhlo.reshape"(%568) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %570 = "mhlo.dot"(%569, %cst_207) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %571 = chlo.broadcast_add %570, %cst_208 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %572 = "mhlo.reshape"(%571) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %573 = chlo.broadcast_maximum %572, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %574 = "mhlo.reshape"(%573) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %575 = "mhlo.dot"(%574, %cst_203) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %576 = chlo.broadcast_add %575, %cst_204 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %577 = "mhlo.reshape"(%576) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %578 = chlo.broadcast_add %577, %568 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %579 = chlo.broadcast_multiply %578, %cst_205 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %580 = chlo.broadcast_add %579, %cst_206 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %581 = "mhlo.reshape"(%580) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %582 = "mhlo.dot"(%581, %cst_201) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %583 = chlo.broadcast_add %582, %cst_202 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %584 = "mhlo.reshape"(%583) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %585 = chlo.broadcast_maximum %584, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %586 = "mhlo.reshape"(%585) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %587 = "mhlo.dot"(%586, %cst_197) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %588 = chlo.broadcast_add %587, %cst_198 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %589 = "mhlo.reshape"(%588) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %590 = chlo.broadcast_add %589, %580 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %591 = chlo.broadcast_multiply %590, %cst_199 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %592 = chlo.broadcast_add %591, %cst_200 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %593 = "mhlo.reshape"(%592) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %594 = "mhlo.dot"(%593, %cst_195) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %595 = chlo.broadcast_add %594, %cst_196 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %596 = "mhlo.reshape"(%595) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %597 = chlo.broadcast_maximum %596, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %598 = "mhlo.reshape"(%597) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %599 = "mhlo.dot"(%598, %cst_191) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %600 = chlo.broadcast_add %599, %cst_192 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %601 = "mhlo.reshape"(%600) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %602 = chlo.broadcast_add %601, %592 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %603 = chlo.broadcast_multiply %602, %cst_193 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %604 = chlo.broadcast_add %603, %cst_194 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %605 = "mhlo.reshape"(%604) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %606 = "mhlo.dot"(%605, %cst_189) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %607 = chlo.broadcast_add %606, %cst_190 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %608 = "mhlo.reshape"(%607) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %609 = chlo.broadcast_maximum %608, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %610 = "mhlo.reshape"(%609) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %611 = "mhlo.dot"(%610, %cst_181) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %612 = chlo.broadcast_add %611, %cst_182 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %613 = "mhlo.reshape"(%612) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %614 = chlo.broadcast_add %613, %604 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %615 = chlo.broadcast_multiply %614, %cst_187 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %616 = chlo.broadcast_add %615, %cst_188 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %617 = "mhlo.reshape"(%616) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %618 = "mhlo.dot"(%617, %cst_183) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %619 = chlo.broadcast_add %618, %cst_184 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %620 = "mhlo.reshape"(%619) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %621 = chlo.broadcast_add %620, %524 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %622 = chlo.broadcast_multiply %621, %cst_185 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %623 = chlo.broadcast_add %622, %cst_186 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %624 = "mhlo.reshape"(%623) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %625 = "mhlo.dot"(%624, %cst_171) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %626 = chlo.broadcast_add %625, %cst_172 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %627 = "mhlo.reshape"(%626) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %628 = "mhlo.transpose"(%627) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %629 = "mhlo.dot"(%624, %cst_167) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %630 = "mhlo.reshape"(%629) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %631 = "mhlo.broadcast_in_dim"(%cst_168) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %632 = mhlo.add %630, %631 : tensor<1x384x128xf32>
    %633 = chlo.broadcast_multiply %632, %cst_169 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %634 = chlo.broadcast_add %633, %cst_170 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %635 = "mhlo.reshape"(%634) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %636 = "mhlo.dot"(%635, %cst_175) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %637 = chlo.broadcast_add %636, %cst_176 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %638 = "mhlo.reshape"(%637) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %639 = "mhlo.transpose"(%638) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %640 = "mhlo.dot"(%635, %cst_173) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %641 = chlo.broadcast_add %640, %cst_174 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %642 = "mhlo.reshape"(%641) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %643 = "mhlo.transpose"(%642) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %644 = "mhlo.dot_general"(%643, %639) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %645 = chlo.broadcast_multiply %644, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %646 = chlo.broadcast_add %645, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %646 along dimension 3 (the last axis), with %2 as the
    // init value: tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%648) and subtracted from %646
    // before the exponential (%649, %650), i.e. the max-shift step of a softmax
    // over the last axis.
    // NOTE(review): %2 is defined outside this excerpt; presumably the identity
    // for max (-inf or lowest f32). Confirm at its definition.
    %647 = "mhlo.reduce"(%646, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.maximum.
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %648 = linalg.tensor_expand_shape %647 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %649 = chlo.broadcast_subtract %646, %648 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %650 = "mhlo.exponential"(%649) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of %650 (the exponentials) along dimension 3 (the last
    // axis), with %1 as the init value:
    // tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
    // The result is re-expanded to 1x4x384x1 (%652) and used as the divisor of
    // %650 (%653), i.e. the normalization step of a softmax over the last axis.
    // NOTE(review): %1 is defined outside this excerpt; presumably 0.0 (it is
    // also the scalar operand of the chlo.broadcast_maximum ops). Confirm.
    %651 = "mhlo.reduce"(%650, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      // Reduction body: combine the two scalar operands with mhlo.add.
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %652 = linalg.tensor_expand_shape %651 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %653 = chlo.broadcast_divide %650, %652 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %654 = "mhlo.dot_general"(%653, %628) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %655 = "mhlo.transpose"(%654) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %656 = "mhlo.reshape"(%655) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %657 = "mhlo.dot"(%656, %cst_177) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %658 = chlo.broadcast_add %657, %cst_178 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %659 = "mhlo.reshape"(%658) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %660 = "mhlo.dot"(%624, %cst_164) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %661 = chlo.broadcast_add %660, %cst_165 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %662 = "mhlo.reshape"(%661) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %663 = chlo.broadcast_multiply %662, %cst_166 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %664 = chlo.broadcast_add %663, %cst_178 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %665 = chlo.broadcast_add %659, %664 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %666 = chlo.broadcast_multiply %665, %cst_179 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %667 = chlo.broadcast_add %666, %cst_180 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %668 = "mhlo.reshape"(%667) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %669 = "mhlo.dot"(%668, %cst_162) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %670 = chlo.broadcast_add %669, %cst_163 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %671 = "mhlo.reshape"(%670) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %672 = chlo.broadcast_maximum %671, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %673 = "mhlo.reshape"(%672) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %674 = "mhlo.dot"(%673, %cst_158) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %675 = chlo.broadcast_add %674, %cst_159 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %676 = "mhlo.reshape"(%675) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %677 = chlo.broadcast_add %676, %667 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %678 = chlo.broadcast_multiply %677, %cst_160 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %679 = chlo.broadcast_add %678, %cst_161 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %680 = "mhlo.reshape"(%679) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %681 = "mhlo.dot"(%680, %cst_156) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %682 = chlo.broadcast_add %681, %cst_157 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %683 = "mhlo.reshape"(%682) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %684 = chlo.broadcast_maximum %683, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %685 = "mhlo.reshape"(%684) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %686 = "mhlo.dot"(%685, %cst_152) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %687 = chlo.broadcast_add %686, %cst_153 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %688 = "mhlo.reshape"(%687) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %689 = chlo.broadcast_add %688, %679 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %690 = chlo.broadcast_multiply %689, %cst_154 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %691 = chlo.broadcast_add %690, %cst_155 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %692 = "mhlo.reshape"(%691) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %693 = "mhlo.dot"(%692, %cst_150) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %694 = chlo.broadcast_add %693, %cst_151 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %695 = "mhlo.reshape"(%694) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %696 = chlo.broadcast_maximum %695, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %697 = "mhlo.reshape"(%696) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %698 = "mhlo.dot"(%697, %cst_146) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %699 = chlo.broadcast_add %698, %cst_147 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %700 = "mhlo.reshape"(%699) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %701 = chlo.broadcast_add %700, %691 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %702 = chlo.broadcast_multiply %701, %cst_148 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %703 = chlo.broadcast_add %702, %cst_149 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %704 = "mhlo.reshape"(%703) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %705 = "mhlo.dot"(%704, %cst_144) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %706 = chlo.broadcast_add %705, %cst_145 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %707 = "mhlo.reshape"(%706) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %708 = chlo.broadcast_maximum %707, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %709 = "mhlo.reshape"(%708) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %710 = "mhlo.dot"(%709, %cst_136) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %711 = chlo.broadcast_add %710, %cst_137 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %712 = "mhlo.reshape"(%711) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %713 = chlo.broadcast_add %712, %703 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %714 = chlo.broadcast_multiply %713, %cst_142 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %715 = chlo.broadcast_add %714, %cst_143 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %716 = "mhlo.reshape"(%715) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %717 = "mhlo.dot"(%716, %cst_138) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %718 = chlo.broadcast_add %717, %cst_139 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %719 = "mhlo.reshape"(%718) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %720 = chlo.broadcast_add %719, %623 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %721 = chlo.broadcast_multiply %720, %cst_140 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %722 = chlo.broadcast_add %721, %cst_141 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %723 = "mhlo.reshape"(%722) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %724 = "mhlo.dot"(%723, %cst_126) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %725 = chlo.broadcast_add %724, %cst_127 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %726 = "mhlo.reshape"(%725) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %727 = "mhlo.transpose"(%726) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %728 = "mhlo.dot"(%723, %cst_122) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %729 = "mhlo.reshape"(%728) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %730 = "mhlo.broadcast_in_dim"(%cst_123) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %731 = mhlo.add %729, %730 : tensor<1x384x128xf32>
    %732 = chlo.broadcast_multiply %731, %cst_124 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %733 = chlo.broadcast_add %732, %cst_125 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %734 = "mhlo.reshape"(%733) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %735 = "mhlo.dot"(%734, %cst_130) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %736 = chlo.broadcast_add %735, %cst_131 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %737 = "mhlo.reshape"(%736) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %738 = "mhlo.transpose"(%737) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %739 = "mhlo.dot"(%734, %cst_128) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %740 = chlo.broadcast_add %739, %cst_129 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %741 = "mhlo.reshape"(%740) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %742 = "mhlo.transpose"(%741) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %743 = "mhlo.dot_general"(%742, %738) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %744 = chlo.broadcast_multiply %743, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %745 = chlo.broadcast_add %744, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %745 along dimension 3: the region below combines pairs
    // of scalars with mhlo.maximum, starting from the scalar init value %2, and
    // drops the reduced dimension (1x4x384x384 -> 1x4x384).
    // NOTE(review): %2 is defined outside this excerpt; presumably the lowest
    // f32 value / -inf so that it is neutral for max -- confirm.
    // The result is expanded to 1x4x384x1 and subtracted from %745 right before
    // mhlo.exponential, i.e. the max-subtraction step of a softmax over dim 3.
    %746 = "mhlo.reduce"(%745, %2) ( {
    // Reduction body: %arg3 and %arg4 are the two scalars being combined.
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %747 = linalg.tensor_expand_shape %746 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %748 = chlo.broadcast_subtract %745, %747 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %749 = "mhlo.exponential"(%748) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of %749 (the mhlo.exponential result) along dimension 3:
    // the region below combines pairs of scalars with mhlo.add, starting from
    // the scalar init value %1, and drops the reduced dimension
    // (1x4x384x384 -> 1x4x384).
    // NOTE(review): %1 is defined outside this excerpt; presumably 0.0 so that
    // it is neutral for addition -- confirm.
    // The sum is expanded to 1x4x384x1 and used as the divisor of %749 in the
    // following chlo.broadcast_divide (the softmax normalization step).
    %750 = "mhlo.reduce"(%749, %1) ( {
    // Reduction body: %arg3 and %arg4 are the two scalars being combined.
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %751 = linalg.tensor_expand_shape %750 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %752 = chlo.broadcast_divide %749, %751 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %753 = "mhlo.dot_general"(%752, %727) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %754 = "mhlo.transpose"(%753) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %755 = "mhlo.reshape"(%754) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %756 = "mhlo.dot"(%755, %cst_132) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %757 = chlo.broadcast_add %756, %cst_133 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %758 = "mhlo.reshape"(%757) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %759 = "mhlo.dot"(%723, %cst_119) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %760 = chlo.broadcast_add %759, %cst_120 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %761 = "mhlo.reshape"(%760) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %762 = chlo.broadcast_multiply %761, %cst_121 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %763 = chlo.broadcast_add %762, %cst_133 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %764 = chlo.broadcast_add %758, %763 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %765 = chlo.broadcast_multiply %764, %cst_134 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %766 = chlo.broadcast_add %765, %cst_135 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %767 = "mhlo.reshape"(%766) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %768 = "mhlo.dot"(%767, %cst_117) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %769 = chlo.broadcast_add %768, %cst_118 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %770 = "mhlo.reshape"(%769) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %771 = chlo.broadcast_maximum %770, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %772 = "mhlo.reshape"(%771) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %773 = "mhlo.dot"(%772, %cst_113) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %774 = chlo.broadcast_add %773, %cst_114 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %775 = "mhlo.reshape"(%774) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %776 = chlo.broadcast_add %775, %766 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %777 = chlo.broadcast_multiply %776, %cst_115 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %778 = chlo.broadcast_add %777, %cst_116 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %779 = "mhlo.reshape"(%778) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %780 = "mhlo.dot"(%779, %cst_111) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %781 = chlo.broadcast_add %780, %cst_112 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %782 = "mhlo.reshape"(%781) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %783 = chlo.broadcast_maximum %782, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %784 = "mhlo.reshape"(%783) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %785 = "mhlo.dot"(%784, %cst_107) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %786 = chlo.broadcast_add %785, %cst_108 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %787 = "mhlo.reshape"(%786) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %788 = chlo.broadcast_add %787, %778 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %789 = chlo.broadcast_multiply %788, %cst_109 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %790 = chlo.broadcast_add %789, %cst_110 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %791 = "mhlo.reshape"(%790) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %792 = "mhlo.dot"(%791, %cst_105) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %793 = chlo.broadcast_add %792, %cst_106 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %794 = "mhlo.reshape"(%793) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %795 = chlo.broadcast_maximum %794, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %796 = "mhlo.reshape"(%795) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %797 = "mhlo.dot"(%796, %cst_101) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %798 = chlo.broadcast_add %797, %cst_102 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %799 = "mhlo.reshape"(%798) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %800 = chlo.broadcast_add %799, %790 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %801 = chlo.broadcast_multiply %800, %cst_103 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %802 = chlo.broadcast_add %801, %cst_104 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %803 = "mhlo.reshape"(%802) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %804 = "mhlo.dot"(%803, %cst_99) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %805 = chlo.broadcast_add %804, %cst_100 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %806 = "mhlo.reshape"(%805) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %807 = chlo.broadcast_maximum %806, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %808 = "mhlo.reshape"(%807) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %809 = "mhlo.dot"(%808, %cst_91) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %810 = chlo.broadcast_add %809, %cst_92 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %811 = "mhlo.reshape"(%810) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %812 = chlo.broadcast_add %811, %802 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %813 = chlo.broadcast_multiply %812, %cst_97 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %814 = chlo.broadcast_add %813, %cst_98 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %815 = "mhlo.reshape"(%814) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %816 = "mhlo.dot"(%815, %cst_93) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %817 = chlo.broadcast_add %816, %cst_94 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %818 = "mhlo.reshape"(%817) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %819 = chlo.broadcast_add %818, %722 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %820 = chlo.broadcast_multiply %819, %cst_95 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %821 = chlo.broadcast_add %820, %cst_96 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %822 = "mhlo.reshape"(%821) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %823 = "mhlo.dot"(%822, %cst_81) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %824 = chlo.broadcast_add %823, %cst_82 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %825 = "mhlo.reshape"(%824) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %826 = "mhlo.transpose"(%825) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %827 = "mhlo.dot"(%822, %cst_77) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %828 = "mhlo.reshape"(%827) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %829 = "mhlo.broadcast_in_dim"(%cst_78) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %830 = mhlo.add %828, %829 : tensor<1x384x128xf32>
    %831 = chlo.broadcast_multiply %830, %cst_79 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %832 = chlo.broadcast_add %831, %cst_80 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %833 = "mhlo.reshape"(%832) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %834 = "mhlo.dot"(%833, %cst_85) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %835 = chlo.broadcast_add %834, %cst_86 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %836 = "mhlo.reshape"(%835) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %837 = "mhlo.transpose"(%836) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %838 = "mhlo.dot"(%833, %cst_83) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %839 = chlo.broadcast_add %838, %cst_84 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %840 = "mhlo.reshape"(%839) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %841 = "mhlo.transpose"(%840) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %842 = "mhlo.dot_general"(%841, %837) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %843 = chlo.broadcast_multiply %842, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %844 = chlo.broadcast_add %843, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduction of %844 along dimension 3: the region below combines pairs
    // of scalars with mhlo.maximum, starting from the scalar init value %2, and
    // drops the reduced dimension (1x4x384x384 -> 1x4x384).
    // NOTE(review): %2 is defined outside this excerpt; presumably the lowest
    // f32 value / -inf so that it is neutral for max -- confirm.
    // The result is expanded to 1x4x384x1 and subtracted from %844 right before
    // mhlo.exponential, i.e. the max-subtraction step of a softmax over dim 3.
    %845 = "mhlo.reduce"(%844, %2) ( {
    // Reduction body: %arg3 and %arg4 are the two scalars being combined.
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %846 = linalg.tensor_expand_shape %845 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %847 = chlo.broadcast_subtract %844, %846 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %848 = "mhlo.exponential"(%847) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduction of %848 (the mhlo.exponential result) along dimension 3:
    // the region below combines pairs of scalars with mhlo.add, starting from
    // the scalar init value %1, and drops the reduced dimension
    // (1x4x384x384 -> 1x4x384).
    // NOTE(review): %1 is defined outside this excerpt; presumably 0.0 so that
    // it is neutral for addition -- confirm.
    // The sum is expanded to 1x4x384x1 and used as the divisor of %848 in the
    // following chlo.broadcast_divide (the softmax normalization step).
    %849 = "mhlo.reduce"(%848, %1) ( {
    // Reduction body: %arg3 and %arg4 are the two scalars being combined.
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %850 = linalg.tensor_expand_shape %849 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %851 = chlo.broadcast_divide %848, %850 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %852 = "mhlo.dot_general"(%851, %826) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %853 = "mhlo.transpose"(%852) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %854 = "mhlo.reshape"(%853) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %855 = "mhlo.dot"(%854, %cst_87) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %856 = chlo.broadcast_add %855, %cst_88 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %857 = "mhlo.reshape"(%856) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %858 = "mhlo.dot"(%822, %cst_74) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %859 = chlo.broadcast_add %858, %cst_75 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %860 = "mhlo.reshape"(%859) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %861 = chlo.broadcast_multiply %860, %cst_76 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %862 = chlo.broadcast_add %861, %cst_88 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %863 = chlo.broadcast_add %857, %862 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %864 = chlo.broadcast_multiply %863, %cst_89 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %865 = chlo.broadcast_add %864, %cst_90 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %866 = "mhlo.reshape"(%865) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %867 = "mhlo.dot"(%866, %cst_72) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %868 = chlo.broadcast_add %867, %cst_73 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %869 = "mhlo.reshape"(%868) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %870 = chlo.broadcast_maximum %869, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %871 = "mhlo.reshape"(%870) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %872 = "mhlo.dot"(%871, %cst_68) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %873 = chlo.broadcast_add %872, %cst_69 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %874 = "mhlo.reshape"(%873) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %875 = chlo.broadcast_add %874, %865 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %876 = chlo.broadcast_multiply %875, %cst_70 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %877 = chlo.broadcast_add %876, %cst_71 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %878 = "mhlo.reshape"(%877) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %879 = "mhlo.dot"(%878, %cst_66) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %880 = chlo.broadcast_add %879, %cst_67 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %881 = "mhlo.reshape"(%880) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %882 = chlo.broadcast_maximum %881, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %883 = "mhlo.reshape"(%882) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %884 = "mhlo.dot"(%883, %cst_62) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %885 = chlo.broadcast_add %884, %cst_63 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %886 = "mhlo.reshape"(%885) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %887 = chlo.broadcast_add %886, %877 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %888 = chlo.broadcast_multiply %887, %cst_64 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %889 = chlo.broadcast_add %888, %cst_65 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %890 = "mhlo.reshape"(%889) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %891 = "mhlo.dot"(%890, %cst_60) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %892 = chlo.broadcast_add %891, %cst_61 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %893 = "mhlo.reshape"(%892) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %894 = chlo.broadcast_maximum %893, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %895 = "mhlo.reshape"(%894) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %896 = "mhlo.dot"(%895, %cst_56) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %897 = chlo.broadcast_add %896, %cst_57 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %898 = "mhlo.reshape"(%897) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %899 = chlo.broadcast_add %898, %889 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %900 = chlo.broadcast_multiply %899, %cst_58 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %901 = chlo.broadcast_add %900, %cst_59 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %902 = "mhlo.reshape"(%901) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %903 = "mhlo.dot"(%902, %cst_54) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %904 = chlo.broadcast_add %903, %cst_55 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %905 = "mhlo.reshape"(%904) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %906 = chlo.broadcast_maximum %905, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %907 = "mhlo.reshape"(%906) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %908 = "mhlo.dot"(%907, %cst_46) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %909 = chlo.broadcast_add %908, %cst_47 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %910 = "mhlo.reshape"(%909) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %911 = chlo.broadcast_add %910, %901 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %912 = chlo.broadcast_multiply %911, %cst_52 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %913 = chlo.broadcast_add %912, %cst_53 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %914 = "mhlo.reshape"(%913) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %915 = "mhlo.dot"(%914, %cst_48) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %916 = chlo.broadcast_add %915, %cst_49 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %917 = "mhlo.reshape"(%916) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %918 = chlo.broadcast_add %917, %821 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %919 = chlo.broadcast_multiply %918, %cst_50 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %920 = chlo.broadcast_add %919, %cst_51 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %921 = "mhlo.reshape"(%920) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %922 = "mhlo.dot"(%921, %cst_36) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %923 = chlo.broadcast_add %922, %cst_37 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %924 = "mhlo.reshape"(%923) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %925 = "mhlo.transpose"(%924) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %926 = "mhlo.dot"(%921, %cst_32) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %927 = "mhlo.reshape"(%926) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %928 = "mhlo.broadcast_in_dim"(%cst_33) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %929 = mhlo.add %927, %928 : tensor<1x384x128xf32>
    %930 = chlo.broadcast_multiply %929, %cst_34 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %931 = chlo.broadcast_add %930, %cst_35 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %932 = "mhlo.reshape"(%931) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %933 = "mhlo.dot"(%932, %cst_40) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %934 = chlo.broadcast_add %933, %cst_41 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %935 = "mhlo.reshape"(%934) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %936 = "mhlo.transpose"(%935) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %937 = "mhlo.dot"(%932, %cst_38) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %938 = chlo.broadcast_add %937, %cst_39 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %939 = "mhlo.reshape"(%938) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %940 = "mhlo.transpose"(%939) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %941 = "mhlo.dot_general"(%940, %936) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %942 = chlo.broadcast_multiply %941, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %943 = chlo.broadcast_add %942, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %944 = "mhlo.reduce"(%943, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %945 = linalg.tensor_expand_shape %944 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %946 = chlo.broadcast_subtract %943, %945 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %947 = "mhlo.exponential"(%946) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %948 = "mhlo.reduce"(%947, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %949 = linalg.tensor_expand_shape %948 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %950 = chlo.broadcast_divide %947, %949 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %951 = "mhlo.dot_general"(%950, %925) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %952 = "mhlo.transpose"(%951) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %953 = "mhlo.reshape"(%952) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %954 = "mhlo.dot"(%953, %cst_42) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %955 = chlo.broadcast_add %954, %cst_43 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %956 = "mhlo.reshape"(%955) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %957 = "mhlo.dot"(%921, %cst_29) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %958 = chlo.broadcast_add %957, %cst_30 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %959 = "mhlo.reshape"(%958) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %960 = chlo.broadcast_multiply %959, %cst_31 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %961 = chlo.broadcast_add %960, %cst_43 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %962 = chlo.broadcast_add %956, %961 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %963 = chlo.broadcast_multiply %962, %cst_44 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %964 = chlo.broadcast_add %963, %cst_45 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %965 = "mhlo.reshape"(%964) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %966 = "mhlo.dot"(%965, %cst_27) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %967 = chlo.broadcast_add %966, %cst_28 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %968 = "mhlo.reshape"(%967) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %969 = chlo.broadcast_maximum %968, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %970 = "mhlo.reshape"(%969) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %971 = "mhlo.dot"(%970, %cst_23) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %972 = chlo.broadcast_add %971, %cst_24 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %973 = "mhlo.reshape"(%972) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %974 = chlo.broadcast_add %973, %964 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %975 = chlo.broadcast_multiply %974, %cst_25 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %976 = chlo.broadcast_add %975, %cst_26 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %977 = "mhlo.reshape"(%976) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %978 = "mhlo.dot"(%977, %cst_21) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %979 = chlo.broadcast_add %978, %cst_22 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %980 = "mhlo.reshape"(%979) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %981 = chlo.broadcast_maximum %980, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %982 = "mhlo.reshape"(%981) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %983 = "mhlo.dot"(%982, %cst_17) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %984 = chlo.broadcast_add %983, %cst_18 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %985 = "mhlo.reshape"(%984) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %986 = chlo.broadcast_add %985, %976 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %987 = chlo.broadcast_multiply %986, %cst_19 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %988 = chlo.broadcast_add %987, %cst_20 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %989 = "mhlo.reshape"(%988) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %990 = "mhlo.dot"(%989, %cst_15) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %991 = chlo.broadcast_add %990, %cst_16 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %992 = "mhlo.reshape"(%991) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %993 = chlo.broadcast_maximum %992, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %994 = "mhlo.reshape"(%993) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %995 = "mhlo.dot"(%994, %cst_11) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %996 = chlo.broadcast_add %995, %cst_12 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %997 = "mhlo.reshape"(%996) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %998 = chlo.broadcast_add %997, %988 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %999 = chlo.broadcast_multiply %998, %cst_13 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1000 = chlo.broadcast_add %999, %cst_14 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1001 = "mhlo.reshape"(%1000) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1002 = "mhlo.dot"(%1001, %cst_9) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1003 = chlo.broadcast_add %1002, %cst_10 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1004 = "mhlo.reshape"(%1003) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1005 = chlo.broadcast_maximum %1004, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1006 = "mhlo.reshape"(%1005) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1007 = "mhlo.dot"(%1006, %cst_1) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1008 = chlo.broadcast_add %1007, %cst_2 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1009 = "mhlo.reshape"(%1008) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1010 = chlo.broadcast_add %1009, %1000 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1011 = chlo.broadcast_multiply %1010, %cst_7 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1012 = chlo.broadcast_add %1011, %cst_8 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1013 = "mhlo.reshape"(%1012) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1014 = "mhlo.dot"(%1013, %cst_3) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1015 = chlo.broadcast_add %1014, %cst_4 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1016 = "mhlo.reshape"(%1015) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1017 = chlo.broadcast_add %1016, %920 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1018 = chlo.broadcast_multiply %1017, %cst_5 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1019 = chlo.broadcast_add %1018, %cst_6 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1020 = "mhlo.reshape"(%1019) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1021 = "mhlo.dot"(%1020, %cst_981) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1022 = chlo.broadcast_add %1021, %cst_982 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1023 = "mhlo.reshape"(%1022) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1024 = "mhlo.transpose"(%1023) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1025 = "mhlo.dot"(%1020, %cst_977) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1026 = "mhlo.reshape"(%1025) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1027 = "mhlo.broadcast_in_dim"(%cst_978) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1028 = mhlo.add %1026, %1027 : tensor<1x384x128xf32>
    %1029 = chlo.broadcast_multiply %1028, %cst_979 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1030 = chlo.broadcast_add %1029, %cst_980 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1031 = "mhlo.reshape"(%1030) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1032 = "mhlo.dot"(%1031, %cst_985) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1033 = chlo.broadcast_add %1032, %cst_986 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1034 = "mhlo.reshape"(%1033) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1035 = "mhlo.transpose"(%1034) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1036 = "mhlo.dot"(%1031, %cst_983) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1037 = chlo.broadcast_add %1036, %cst_984 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1038 = "mhlo.reshape"(%1037) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1039 = "mhlo.transpose"(%1038) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1040 = "mhlo.dot_general"(%1039, %1035) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1041 = chlo.broadcast_multiply %1040, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1042 = chlo.broadcast_add %1041, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1043 = "mhlo.reduce"(%1042, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1044 = linalg.tensor_expand_shape %1043 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1045 = chlo.broadcast_subtract %1042, %1044 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1046 = "mhlo.exponential"(%1045) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1047 = "mhlo.reduce"(%1046, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1048 = linalg.tensor_expand_shape %1047 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1049 = chlo.broadcast_divide %1046, %1048 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1050 = "mhlo.dot_general"(%1049, %1024) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1051 = "mhlo.transpose"(%1050) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1052 = "mhlo.reshape"(%1051) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1053 = "mhlo.dot"(%1052, %cst_987) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1054 = chlo.broadcast_add %1053, %cst_988 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1055 = "mhlo.reshape"(%1054) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1056 = "mhlo.dot"(%1020, %cst_974) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1057 = chlo.broadcast_add %1056, %cst_975 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1058 = "mhlo.reshape"(%1057) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1059 = chlo.broadcast_multiply %1058, %cst_976 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1060 = chlo.broadcast_add %1059, %cst_988 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1061 = chlo.broadcast_add %1055, %1060 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1062 = chlo.broadcast_multiply %1061, %cst_989 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1063 = chlo.broadcast_add %1062, %cst_990 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1064 = "mhlo.reshape"(%1063) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1065 = "mhlo.dot"(%1064, %cst_972) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1066 = chlo.broadcast_add %1065, %cst_973 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1067 = "mhlo.reshape"(%1066) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1068 = chlo.broadcast_maximum %1067, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1069 = "mhlo.reshape"(%1068) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1070 = "mhlo.dot"(%1069, %cst_968) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1071 = chlo.broadcast_add %1070, %cst_969 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1072 = "mhlo.reshape"(%1071) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1073 = chlo.broadcast_add %1072, %1063 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1074 = chlo.broadcast_multiply %1073, %cst_970 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1075 = chlo.broadcast_add %1074, %cst_971 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1076 = "mhlo.reshape"(%1075) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1077 = "mhlo.dot"(%1076, %cst_966) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1078 = chlo.broadcast_add %1077, %cst_967 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1079 = "mhlo.reshape"(%1078) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1080 = chlo.broadcast_maximum %1079, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1081 = "mhlo.reshape"(%1080) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1082 = "mhlo.dot"(%1081, %cst_962) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1083 = chlo.broadcast_add %1082, %cst_963 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1084 = "mhlo.reshape"(%1083) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1085 = chlo.broadcast_add %1084, %1075 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1086 = chlo.broadcast_multiply %1085, %cst_964 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1087 = chlo.broadcast_add %1086, %cst_965 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1088 = "mhlo.reshape"(%1087) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1089 = "mhlo.dot"(%1088, %cst_960) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1090 = chlo.broadcast_add %1089, %cst_961 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1091 = "mhlo.reshape"(%1090) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1092 = chlo.broadcast_maximum %1091, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1093 = "mhlo.reshape"(%1092) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1094 = "mhlo.dot"(%1093, %cst_956) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1095 = chlo.broadcast_add %1094, %cst_957 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1096 = "mhlo.reshape"(%1095) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1097 = chlo.broadcast_add %1096, %1087 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1098 = chlo.broadcast_multiply %1097, %cst_958 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1099 = chlo.broadcast_add %1098, %cst_959 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1100 = "mhlo.reshape"(%1099) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1101 = "mhlo.dot"(%1100, %cst_954) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1102 = chlo.broadcast_add %1101, %cst_955 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1103 = "mhlo.reshape"(%1102) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1104 = chlo.broadcast_maximum %1103, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1105 = "mhlo.reshape"(%1104) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1106 = "mhlo.dot"(%1105, %cst_946) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1107 = chlo.broadcast_add %1106, %cst_947 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1108 = "mhlo.reshape"(%1107) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1109 = chlo.broadcast_add %1108, %1099 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1110 = chlo.broadcast_multiply %1109, %cst_952 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1111 = chlo.broadcast_add %1110, %cst_953 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1112 = "mhlo.reshape"(%1111) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1113 = "mhlo.dot"(%1112, %cst_948) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1114 = chlo.broadcast_add %1113, %cst_949 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1115 = "mhlo.reshape"(%1114) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1116 = chlo.broadcast_add %1115, %1019 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1117 = chlo.broadcast_multiply %1116, %cst_950 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1118 = chlo.broadcast_add %1117, %cst_951 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1119 = "mhlo.reshape"(%1118) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1120 = "mhlo.dot"(%1119, %cst_936) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1121 = chlo.broadcast_add %1120, %cst_937 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1122 = "mhlo.reshape"(%1121) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1123 = "mhlo.transpose"(%1122) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1124 = "mhlo.dot"(%1119, %cst_932) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1125 = "mhlo.reshape"(%1124) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1126 = "mhlo.broadcast_in_dim"(%cst_933) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1127 = mhlo.add %1125, %1126 : tensor<1x384x128xf32>
    %1128 = chlo.broadcast_multiply %1127, %cst_934 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1129 = chlo.broadcast_add %1128, %cst_935 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1130 = "mhlo.reshape"(%1129) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1131 = "mhlo.dot"(%1130, %cst_940) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1132 = chlo.broadcast_add %1131, %cst_941 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1133 = "mhlo.reshape"(%1132) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1134 = "mhlo.transpose"(%1133) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1135 = "mhlo.dot"(%1130, %cst_938) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1136 = chlo.broadcast_add %1135, %cst_939 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1137 = "mhlo.reshape"(%1136) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1138 = "mhlo.transpose"(%1137) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1139 = "mhlo.dot_general"(%1138, %1134) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    // Multiply by scalar %4 and add %24 (1x1x384x384, broadcast across dim 1).
    // Both are defined outside this chunk; presumably a score scale and an
    // additive mask — TODO confirm.
    %1140 = chlo.broadcast_multiply %1139, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1141 = chlo.broadcast_add %1140, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduce over dim 3 with init value %2, then re-expand to 1x4x384x1 so the
    // result can be broadcast-subtracted from every element along dim 3.
    %1142 = "mhlo.reduce"(%1141, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1143 = linalg.tensor_expand_shape %1142 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1144 = chlo.broadcast_subtract %1141, %1143 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1145 = "mhlo.exponential"(%1144) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduce the exponentials over dim 3 with init value %1 and divide by that sum.
    // With the max-subtraction above this has the shape of a softmax over dim 3,
    // assuming %2 and %1 are the identity values for max and add — confirm at their definitions.
    %1146 = "mhlo.reduce"(%1145, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1147 = linalg.tensor_expand_shape %1146 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1148 = chlo.broadcast_divide %1145, %1147 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    // Contract dim 3 of the normalized weights with dim 2 of %1123 (defined outside
    // this chunk), batching over dims 0 and 1 -> 1x4x384x32.
    %1149 = "mhlo.dot_general"(%1148, %1123) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    // Swap dims 1 and 2 back and flatten 4x32 into 128.
    %1150 = "mhlo.transpose"(%1149) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1151 = "mhlo.reshape"(%1150) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    // 128x128 matmul plus a 128-vector added along dim 1, reshaped to 1x384x128.
    %1152 = "mhlo.dot"(%1151, %cst_942) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1153 = chlo.broadcast_add %1152, %cst_943 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1154 = "mhlo.reshape"(%1153) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Second branch: 512->128 matmul on %1119 (defined outside this chunk) plus a
    // 128-vector along dim 1, then multiply by %cst_931 and add %cst_943 along dim 2.
    // NOTE(review): %cst_943 is also the vector added to produce %1153; possibly two
    // equal constants merged by the compiler — confirm.
    %1155 = "mhlo.dot"(%1119, %cst_929) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1156 = chlo.broadcast_add %1155, %cst_930 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1157 = "mhlo.reshape"(%1156) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1158 = chlo.broadcast_multiply %1157, %cst_931 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1159 = chlo.broadcast_add %1158, %cst_943 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Add the two branches (%1154 + %1159), then multiply/add by 128-vectors along dim 2.
    %1160 = chlo.broadcast_add %1154, %1159 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1161 = chlo.broadcast_multiply %1160, %cst_944 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1162 = chlo.broadcast_add %1161, %cst_945 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // 128->512 matmul plus a 512-vector along dim 1.
    %1163 = "mhlo.reshape"(%1162) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1164 = "mhlo.dot"(%1163, %cst_927) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1165 = chlo.broadcast_add %1164, %cst_928 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1166 = "mhlo.reshape"(%1165) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    // Elementwise max with scalar %1 (a ReLU if %1 is 0.0; %1 is defined outside this chunk — confirm).
    %1167 = chlo.broadcast_maximum %1166, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    // 512->128 matmul plus a 128-vector along dim 1.
    %1168 = "mhlo.reshape"(%1167) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1169 = "mhlo.dot"(%1168, %cst_923) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1170 = chlo.broadcast_add %1169, %cst_924 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1171 = "mhlo.reshape"(%1170) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with this unit's input %1162, then multiply/add by 128-vectors along dim 2.
    %1172 = chlo.broadcast_add %1171, %1162 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1173 = chlo.broadcast_multiply %1172, %cst_925 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1174 = chlo.broadcast_add %1173, %cst_926 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure as the preceding unit, fed by %1174: 128->512 matmul + bias,
    // max with scalar %1, 512->128 matmul + bias.
    %1175 = "mhlo.reshape"(%1174) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1176 = "mhlo.dot"(%1175, %cst_921) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1177 = chlo.broadcast_add %1176, %cst_922 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1178 = "mhlo.reshape"(%1177) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1179 = chlo.broadcast_maximum %1178, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1180 = "mhlo.reshape"(%1179) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1181 = "mhlo.dot"(%1180, %cst_917) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1182 = chlo.broadcast_add %1181, %cst_918 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1183 = "mhlo.reshape"(%1182) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1174, then multiply/add by 128-vectors along dim 2.
    %1184 = chlo.broadcast_add %1183, %1174 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1185 = chlo.broadcast_multiply %1184, %cst_919 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1186 = chlo.broadcast_add %1185, %cst_920 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure again, fed by %1186: 128->512 matmul + bias, max with scalar %1,
    // 512->128 matmul + bias.
    %1187 = "mhlo.reshape"(%1186) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1188 = "mhlo.dot"(%1187, %cst_915) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1189 = chlo.broadcast_add %1188, %cst_916 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1190 = "mhlo.reshape"(%1189) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1191 = chlo.broadcast_maximum %1190, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1192 = "mhlo.reshape"(%1191) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1193 = "mhlo.dot"(%1192, %cst_911) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1194 = chlo.broadcast_add %1193, %cst_912 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1195 = "mhlo.reshape"(%1194) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1186, then multiply/add by 128-vectors along dim 2.
    %1196 = chlo.broadcast_add %1195, %1186 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1197 = chlo.broadcast_multiply %1196, %cst_913 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1198 = chlo.broadcast_add %1197, %cst_914 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Fourth consecutive unit of this shape, fed by %1198: 128->512 matmul + bias,
    // max with scalar %1, 512->128 matmul + bias.
    %1199 = "mhlo.reshape"(%1198) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1200 = "mhlo.dot"(%1199, %cst_909) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1201 = chlo.broadcast_add %1200, %cst_910 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1202 = "mhlo.reshape"(%1201) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1203 = chlo.broadcast_maximum %1202, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1204 = "mhlo.reshape"(%1203) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1205 = "mhlo.dot"(%1204, %cst_901) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1206 = chlo.broadcast_add %1205, %cst_902 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1207 = "mhlo.reshape"(%1206) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1198, then multiply/add by 128-vectors along dim 2.
    %1208 = chlo.broadcast_add %1207, %1198 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1209 = chlo.broadcast_multiply %1208, %cst_907 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1210 = chlo.broadcast_add %1209, %cst_908 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // 128->512 matmul plus a 512-vector along dim 1 (no max with %1 here).
    %1211 = "mhlo.reshape"(%1210) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1212 = "mhlo.dot"(%1211, %cst_903) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1213 = chlo.broadcast_add %1212, %cst_904 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1214 = "mhlo.reshape"(%1213) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    // Add %1118 (1x384x512, defined outside this chunk; presumably this layer's input — confirm),
    // then multiply/add by 512-vectors along dim 2.
    %1215 = chlo.broadcast_add %1214, %1118 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1216 = chlo.broadcast_multiply %1215, %cst_905 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1217 = chlo.broadcast_add %1216, %cst_906 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    // Flatten %1217 to 384x512; %1218 feeds three separate 512->128 matmuls below
    // (%1219, %1223, %1254).
    %1218 = "mhlo.reshape"(%1217) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    // 512->128 matmul + bias, split 128 into 4x32 and swap dims 1 and 2 -> 1x4x384x32.
    // %1222 is later the rhs of the second dot_general (%1248); presumably the
    // attention "value" tensor — confirm.
    %1219 = "mhlo.dot"(%1218, %cst_891) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1220 = chlo.broadcast_add %1219, %cst_892 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1221 = "mhlo.reshape"(%1220) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1222 = "mhlo.transpose"(%1221) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // 512->128 matmul on %1218; here the 128-vector %cst_888 is broadcast explicitly
    // with mhlo.broadcast_in_dim and added with mhlo.add rather than chlo.broadcast_add.
    %1223 = "mhlo.dot"(%1218, %cst_887) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1224 = "mhlo.reshape"(%1223) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1225 = "mhlo.broadcast_in_dim"(%cst_888) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1226 = mhlo.add %1224, %1225 : tensor<1x384x128xf32>
    // Multiply/add by 128-vectors along dim 2; the flattened result %1229 feeds both
    // 128x128 projections that follow.
    %1227 = chlo.broadcast_multiply %1226, %cst_889 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1228 = chlo.broadcast_add %1227, %cst_890 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1229 = "mhlo.reshape"(%1228) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    // First 128x128 projection of %1229: bias along dim 1, split into 4x32, swap dims 1 and 2.
    %1230 = "mhlo.dot"(%1229, %cst_895) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1231 = chlo.broadcast_add %1230, %cst_896 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1232 = "mhlo.reshape"(%1231) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1233 = "mhlo.transpose"(%1232) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Second 128x128 projection of the same input, processed identically.
    %1234 = "mhlo.dot"(%1229, %cst_893) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1235 = chlo.broadcast_add %1234, %cst_894 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1236 = "mhlo.reshape"(%1235) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1237 = "mhlo.transpose"(%1236) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Batched contraction over dim 3 (size 32) of both projections, batching over dims 0 and 1,
    // giving 1x4x384x384. NOTE(review): presumably query x key attention scores — confirm.
    %1238 = "mhlo.dot_general"(%1237, %1233) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    // Multiply by scalar %4 and add %24 (1x1x384x384, broadcast across dim 1).
    // Both are defined outside this chunk; presumably a score scale and an
    // additive mask — TODO confirm.
    %1239 = chlo.broadcast_multiply %1238, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1240 = chlo.broadcast_add %1239, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduce over dim 3 with init value %2, re-expand to 1x4x384x1, subtract, exponentiate.
    %1241 = "mhlo.reduce"(%1240, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1242 = linalg.tensor_expand_shape %1241 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1243 = chlo.broadcast_subtract %1240, %1242 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1244 = "mhlo.exponential"(%1243) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduce over dim 3 with init value %1 and divide by the sum: a softmax over dim 3,
    // assuming %2 and %1 are the identity values for max and add — confirm at their definitions.
    %1245 = "mhlo.reduce"(%1244, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1246 = linalg.tensor_expand_shape %1245 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1247 = chlo.broadcast_divide %1244, %1246 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    // Contract dim 3 of the normalized weights with dim 2 of %1222, batching over
    // dims 0 and 1 -> 1x4x384x32.
    %1248 = "mhlo.dot_general"(%1247, %1222) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    // Swap dims 1 and 2 back and flatten 4x32 into 128.
    %1249 = "mhlo.transpose"(%1248) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1250 = "mhlo.reshape"(%1249) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    // 128x128 matmul plus a 128-vector added along dim 1, reshaped to 1x384x128.
    %1251 = "mhlo.dot"(%1250, %cst_897) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1252 = chlo.broadcast_add %1251, %cst_898 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1253 = "mhlo.reshape"(%1252) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Second branch: 512->128 matmul on %1218 plus a 128-vector along dim 1, then
    // multiply by %cst_886 and add %cst_898 along dim 2.
    // NOTE(review): %cst_898 is also the vector added to produce %1252; possibly two
    // equal constants merged by the compiler — confirm.
    %1254 = "mhlo.dot"(%1218, %cst_884) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1255 = chlo.broadcast_add %1254, %cst_885 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1256 = "mhlo.reshape"(%1255) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1257 = chlo.broadcast_multiply %1256, %cst_886 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1258 = chlo.broadcast_add %1257, %cst_898 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Add the two branches (%1253 + %1258), then multiply/add by 128-vectors along dim 2.
    %1259 = chlo.broadcast_add %1253, %1258 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1260 = chlo.broadcast_multiply %1259, %cst_899 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1261 = chlo.broadcast_add %1260, %cst_900 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // 128->512 matmul + bias, elementwise max with scalar %1 (a ReLU if %1 is 0.0;
    // defined outside this chunk — confirm), then 512->128 matmul + bias.
    %1262 = "mhlo.reshape"(%1261) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1263 = "mhlo.dot"(%1262, %cst_882) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1264 = chlo.broadcast_add %1263, %cst_883 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1265 = "mhlo.reshape"(%1264) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1266 = chlo.broadcast_maximum %1265, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1267 = "mhlo.reshape"(%1266) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1268 = "mhlo.dot"(%1267, %cst_878) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1269 = chlo.broadcast_add %1268, %cst_879 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1270 = "mhlo.reshape"(%1269) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1261, then multiply/add by 128-vectors along dim 2.
    %1271 = chlo.broadcast_add %1270, %1261 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1272 = chlo.broadcast_multiply %1271, %cst_880 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1273 = chlo.broadcast_add %1272, %cst_881 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure as the preceding unit, fed by %1273: 128->512 matmul + bias,
    // max with scalar %1, 512->128 matmul + bias.
    %1274 = "mhlo.reshape"(%1273) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1275 = "mhlo.dot"(%1274, %cst_876) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1276 = chlo.broadcast_add %1275, %cst_877 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1277 = "mhlo.reshape"(%1276) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1278 = chlo.broadcast_maximum %1277, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1279 = "mhlo.reshape"(%1278) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1280 = "mhlo.dot"(%1279, %cst_872) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1281 = chlo.broadcast_add %1280, %cst_873 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1282 = "mhlo.reshape"(%1281) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1273, then multiply/add by 128-vectors along dim 2.
    %1283 = chlo.broadcast_add %1282, %1273 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1284 = chlo.broadcast_multiply %1283, %cst_874 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1285 = chlo.broadcast_add %1284, %cst_875 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure again, fed by %1285: 128->512 matmul + bias, max with scalar %1,
    // 512->128 matmul + bias.
    %1286 = "mhlo.reshape"(%1285) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1287 = "mhlo.dot"(%1286, %cst_870) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1288 = chlo.broadcast_add %1287, %cst_871 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1289 = "mhlo.reshape"(%1288) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1290 = chlo.broadcast_maximum %1289, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1291 = "mhlo.reshape"(%1290) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1292 = "mhlo.dot"(%1291, %cst_866) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1293 = chlo.broadcast_add %1292, %cst_867 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1294 = "mhlo.reshape"(%1293) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1285, then multiply/add by 128-vectors along dim 2.
    %1295 = chlo.broadcast_add %1294, %1285 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1296 = chlo.broadcast_multiply %1295, %cst_868 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1297 = chlo.broadcast_add %1296, %cst_869 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Fourth consecutive unit of this shape, fed by %1297: 128->512 matmul + bias,
    // max with scalar %1, 512->128 matmul + bias.
    %1298 = "mhlo.reshape"(%1297) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1299 = "mhlo.dot"(%1298, %cst_864) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1300 = chlo.broadcast_add %1299, %cst_865 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1301 = "mhlo.reshape"(%1300) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1302 = chlo.broadcast_maximum %1301, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1303 = "mhlo.reshape"(%1302) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1304 = "mhlo.dot"(%1303, %cst_856) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1305 = chlo.broadcast_add %1304, %cst_857 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1306 = "mhlo.reshape"(%1305) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1297, then multiply/add by 128-vectors along dim 2.
    %1307 = chlo.broadcast_add %1306, %1297 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1308 = chlo.broadcast_multiply %1307, %cst_862 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1309 = chlo.broadcast_add %1308, %cst_863 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // 128->512 matmul plus a 512-vector along dim 1 (no max with %1 here).
    %1310 = "mhlo.reshape"(%1309) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1311 = "mhlo.dot"(%1310, %cst_858) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1312 = chlo.broadcast_add %1311, %cst_859 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1313 = "mhlo.reshape"(%1312) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    // Add %1217 (the 1x384x512 tensor that was flattened into %1218),
    // then multiply/add by 512-vectors along dim 2.
    %1314 = chlo.broadcast_add %1313, %1217 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1315 = chlo.broadcast_multiply %1314, %cst_860 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1316 = chlo.broadcast_add %1315, %cst_861 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    // Flatten %1316 to 384x512; %1317 feeds three separate 512->128 matmuls below
    // (%1318, %1322, %1353).
    %1317 = "mhlo.reshape"(%1316) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    // 512->128 matmul + bias, split 128 into 4x32 and swap dims 1 and 2 -> 1x4x384x32.
    // %1321 is later the rhs of the second dot_general (%1347); presumably the
    // attention "value" tensor — confirm.
    %1318 = "mhlo.dot"(%1317, %cst_846) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1319 = chlo.broadcast_add %1318, %cst_847 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1320 = "mhlo.reshape"(%1319) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1321 = "mhlo.transpose"(%1320) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // 512->128 matmul on %1317; the 128-vector %cst_843 is broadcast explicitly with
    // mhlo.broadcast_in_dim and added with mhlo.add rather than chlo.broadcast_add.
    %1322 = "mhlo.dot"(%1317, %cst_842) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1323 = "mhlo.reshape"(%1322) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1324 = "mhlo.broadcast_in_dim"(%cst_843) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1325 = mhlo.add %1323, %1324 : tensor<1x384x128xf32>
    // Multiply/add by 128-vectors along dim 2; the flattened result %1328 feeds both
    // 128x128 projections that follow.
    %1326 = chlo.broadcast_multiply %1325, %cst_844 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1327 = chlo.broadcast_add %1326, %cst_845 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1328 = "mhlo.reshape"(%1327) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    // First 128x128 projection of %1328: bias along dim 1, split into 4x32, swap dims 1 and 2.
    %1329 = "mhlo.dot"(%1328, %cst_850) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1330 = chlo.broadcast_add %1329, %cst_851 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1331 = "mhlo.reshape"(%1330) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1332 = "mhlo.transpose"(%1331) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Second 128x128 projection of the same input, processed identically.
    %1333 = "mhlo.dot"(%1328, %cst_848) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1334 = chlo.broadcast_add %1333, %cst_849 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1335 = "mhlo.reshape"(%1334) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1336 = "mhlo.transpose"(%1335) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Batched contraction over dim 3 (size 32) of both projections, batching over dims 0 and 1,
    // giving 1x4x384x384. NOTE(review): presumably query x key attention scores — confirm.
    %1337 = "mhlo.dot_general"(%1336, %1332) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    // Multiply by scalar %4 and add %24 (1x1x384x384, broadcast across dim 1).
    // Both are defined outside this chunk; presumably a score scale and an
    // additive mask — TODO confirm.
    %1338 = chlo.broadcast_multiply %1337, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1339 = chlo.broadcast_add %1338, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Max-reduce over dim 3 with init value %2, re-expand to 1x4x384x1, subtract, exponentiate.
    %1340 = "mhlo.reduce"(%1339, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1341 = linalg.tensor_expand_shape %1340 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1342 = chlo.broadcast_subtract %1339, %1341 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1343 = "mhlo.exponential"(%1342) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Sum-reduce over dim 3 with init value %1 and divide by the sum: a softmax over dim 3,
    // assuming %2 and %1 are the identity values for max and add — confirm at their definitions.
    %1344 = "mhlo.reduce"(%1343, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1345 = linalg.tensor_expand_shape %1344 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1346 = chlo.broadcast_divide %1343, %1345 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    // Contract dim 3 of the normalized weights with dim 2 of %1321, batching over
    // dims 0 and 1 -> 1x4x384x32.
    %1347 = "mhlo.dot_general"(%1346, %1321) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    // Swap dims 1 and 2 back and flatten 4x32 into 128.
    %1348 = "mhlo.transpose"(%1347) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1349 = "mhlo.reshape"(%1348) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    // 128x128 matmul plus a 128-vector added along dim 1, reshaped to 1x384x128.
    %1350 = "mhlo.dot"(%1349, %cst_852) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1351 = chlo.broadcast_add %1350, %cst_853 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1352 = "mhlo.reshape"(%1351) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Second branch: 512->128 matmul on %1317 plus a 128-vector along dim 1, then
    // multiply by %cst_841 and add %cst_853 along dim 2.
    // NOTE(review): %cst_853 is also the vector added to produce %1351; possibly two
    // equal constants merged by the compiler — confirm.
    %1353 = "mhlo.dot"(%1317, %cst_839) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1354 = chlo.broadcast_add %1353, %cst_840 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1355 = "mhlo.reshape"(%1354) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1356 = chlo.broadcast_multiply %1355, %cst_841 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1357 = chlo.broadcast_add %1356, %cst_853 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Add the two branches (%1352 + %1357), then multiply/add by 128-vectors along dim 2.
    %1358 = chlo.broadcast_add %1352, %1357 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1359 = chlo.broadcast_multiply %1358, %cst_854 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1360 = chlo.broadcast_add %1359, %cst_855 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // 128->512 matmul + bias, elementwise max with scalar %1 (a ReLU if %1 is 0.0;
    // defined outside this chunk — confirm), then 512->128 matmul + bias.
    %1361 = "mhlo.reshape"(%1360) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1362 = "mhlo.dot"(%1361, %cst_837) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1363 = chlo.broadcast_add %1362, %cst_838 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1364 = "mhlo.reshape"(%1363) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1365 = chlo.broadcast_maximum %1364, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1366 = "mhlo.reshape"(%1365) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1367 = "mhlo.dot"(%1366, %cst_833) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1368 = chlo.broadcast_add %1367, %cst_834 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1369 = "mhlo.reshape"(%1368) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1360, then multiply/add by 128-vectors along dim 2.
    %1370 = chlo.broadcast_add %1369, %1360 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1371 = chlo.broadcast_multiply %1370, %cst_835 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1372 = chlo.broadcast_add %1371, %cst_836 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure as the preceding unit, fed by %1372: 128->512 matmul + bias,
    // max with scalar %1, 512->128 matmul + bias.
    %1373 = "mhlo.reshape"(%1372) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1374 = "mhlo.dot"(%1373, %cst_831) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1375 = chlo.broadcast_add %1374, %cst_832 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1376 = "mhlo.reshape"(%1375) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1377 = chlo.broadcast_maximum %1376, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1378 = "mhlo.reshape"(%1377) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1379 = "mhlo.dot"(%1378, %cst_827) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1380 = chlo.broadcast_add %1379, %cst_828 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1381 = "mhlo.reshape"(%1380) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1372, then multiply/add by 128-vectors along dim 2.
    %1382 = chlo.broadcast_add %1381, %1372 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1383 = chlo.broadcast_multiply %1382, %cst_829 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1384 = chlo.broadcast_add %1383, %cst_830 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Same structure again, fed by %1384: 128->512 matmul + bias, max with scalar %1,
    // 512->128 matmul + bias.
    %1385 = "mhlo.reshape"(%1384) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1386 = "mhlo.dot"(%1385, %cst_825) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1387 = chlo.broadcast_add %1386, %cst_826 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1388 = "mhlo.reshape"(%1387) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1389 = chlo.broadcast_maximum %1388, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1390 = "mhlo.reshape"(%1389) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1391 = "mhlo.dot"(%1390, %cst_821) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1392 = chlo.broadcast_add %1391, %cst_822 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1393 = "mhlo.reshape"(%1392) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual add with the unit input %1384, then multiply/add by 128-vectors along dim 2.
    %1394 = chlo.broadcast_add %1393, %1384 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1395 = chlo.broadcast_multiply %1394, %cst_823 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1396 = chlo.broadcast_add %1395, %cst_824 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1397 = "mhlo.reshape"(%1396) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1398 = "mhlo.dot"(%1397, %cst_819) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1399 = chlo.broadcast_add %1398, %cst_820 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1400 = "mhlo.reshape"(%1399) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1401 = chlo.broadcast_maximum %1400, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1402 = "mhlo.reshape"(%1401) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1403 = "mhlo.dot"(%1402, %cst_811) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1404 = chlo.broadcast_add %1403, %cst_812 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1405 = "mhlo.reshape"(%1404) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1406 = chlo.broadcast_add %1405, %1396 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1407 = chlo.broadcast_multiply %1406, %cst_817 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1408 = chlo.broadcast_add %1407, %cst_818 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1409 = "mhlo.reshape"(%1408) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1410 = "mhlo.dot"(%1409, %cst_813) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1411 = chlo.broadcast_add %1410, %cst_814 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1412 = "mhlo.reshape"(%1411) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1413 = chlo.broadcast_add %1412, %1316 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1414 = chlo.broadcast_multiply %1413, %cst_815 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1415 = chlo.broadcast_add %1414, %cst_816 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1416 = "mhlo.reshape"(%1415) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1417 = "mhlo.dot"(%1416, %cst_801) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1418 = chlo.broadcast_add %1417, %cst_802 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1419 = "mhlo.reshape"(%1418) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1420 = "mhlo.transpose"(%1419) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1421 = "mhlo.dot"(%1416, %cst_797) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1422 = "mhlo.reshape"(%1421) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1423 = "mhlo.broadcast_in_dim"(%cst_798) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1424 = mhlo.add %1422, %1423 : tensor<1x384x128xf32>
    %1425 = chlo.broadcast_multiply %1424, %cst_799 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1426 = chlo.broadcast_add %1425, %cst_800 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1427 = "mhlo.reshape"(%1426) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1428 = "mhlo.dot"(%1427, %cst_805) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1429 = chlo.broadcast_add %1428, %cst_806 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1430 = "mhlo.reshape"(%1429) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1431 = "mhlo.transpose"(%1430) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1432 = "mhlo.dot"(%1427, %cst_803) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1433 = chlo.broadcast_add %1432, %cst_804 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1434 = "mhlo.reshape"(%1433) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1435 = "mhlo.transpose"(%1434) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1436 = "mhlo.dot_general"(%1435, %1431) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1437 = chlo.broadcast_multiply %1436, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1438 = chlo.broadcast_add %1437, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1439 = "mhlo.reduce"(%1438, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1440 = linalg.tensor_expand_shape %1439 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1441 = chlo.broadcast_subtract %1438, %1440 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1442 = "mhlo.exponential"(%1441) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1443 = "mhlo.reduce"(%1442, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1444 = linalg.tensor_expand_shape %1443 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1445 = chlo.broadcast_divide %1442, %1444 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1446 = "mhlo.dot_general"(%1445, %1420) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1447 = "mhlo.transpose"(%1446) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1448 = "mhlo.reshape"(%1447) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1449 = "mhlo.dot"(%1448, %cst_807) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1450 = chlo.broadcast_add %1449, %cst_808 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1451 = "mhlo.reshape"(%1450) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1452 = "mhlo.dot"(%1416, %cst_794) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1453 = chlo.broadcast_add %1452, %cst_795 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1454 = "mhlo.reshape"(%1453) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1455 = chlo.broadcast_multiply %1454, %cst_796 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1456 = chlo.broadcast_add %1455, %cst_808 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1457 = chlo.broadcast_add %1451, %1456 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1458 = chlo.broadcast_multiply %1457, %cst_809 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1459 = chlo.broadcast_add %1458, %cst_810 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1460 = "mhlo.reshape"(%1459) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1461 = "mhlo.dot"(%1460, %cst_792) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1462 = chlo.broadcast_add %1461, %cst_793 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1463 = "mhlo.reshape"(%1462) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1464 = chlo.broadcast_maximum %1463, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1465 = "mhlo.reshape"(%1464) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1466 = "mhlo.dot"(%1465, %cst_788) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1467 = chlo.broadcast_add %1466, %cst_789 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1468 = "mhlo.reshape"(%1467) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1469 = chlo.broadcast_add %1468, %1459 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1470 = chlo.broadcast_multiply %1469, %cst_790 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1471 = chlo.broadcast_add %1470, %cst_791 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1472 = "mhlo.reshape"(%1471) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1473 = "mhlo.dot"(%1472, %cst_786) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1474 = chlo.broadcast_add %1473, %cst_787 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1475 = "mhlo.reshape"(%1474) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1476 = chlo.broadcast_maximum %1475, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1477 = "mhlo.reshape"(%1476) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1478 = "mhlo.dot"(%1477, %cst_782) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1479 = chlo.broadcast_add %1478, %cst_783 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1480 = "mhlo.reshape"(%1479) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1481 = chlo.broadcast_add %1480, %1471 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1482 = chlo.broadcast_multiply %1481, %cst_784 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1483 = chlo.broadcast_add %1482, %cst_785 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1484 = "mhlo.reshape"(%1483) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1485 = "mhlo.dot"(%1484, %cst_780) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1486 = chlo.broadcast_add %1485, %cst_781 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1487 = "mhlo.reshape"(%1486) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1488 = chlo.broadcast_maximum %1487, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1489 = "mhlo.reshape"(%1488) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1490 = "mhlo.dot"(%1489, %cst_776) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1491 = chlo.broadcast_add %1490, %cst_777 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1492 = "mhlo.reshape"(%1491) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1493 = chlo.broadcast_add %1492, %1483 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1494 = chlo.broadcast_multiply %1493, %cst_778 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1495 = chlo.broadcast_add %1494, %cst_779 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1496 = "mhlo.reshape"(%1495) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1497 = "mhlo.dot"(%1496, %cst_774) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1498 = chlo.broadcast_add %1497, %cst_775 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1499 = "mhlo.reshape"(%1498) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1500 = chlo.broadcast_maximum %1499, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1501 = "mhlo.reshape"(%1500) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1502 = "mhlo.dot"(%1501, %cst_766) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1503 = chlo.broadcast_add %1502, %cst_767 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1504 = "mhlo.reshape"(%1503) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1505 = chlo.broadcast_add %1504, %1495 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1506 = chlo.broadcast_multiply %1505, %cst_772 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1507 = chlo.broadcast_add %1506, %cst_773 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1508 = "mhlo.reshape"(%1507) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1509 = "mhlo.dot"(%1508, %cst_768) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1510 = chlo.broadcast_add %1509, %cst_769 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1511 = "mhlo.reshape"(%1510) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1512 = chlo.broadcast_add %1511, %1415 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1513 = chlo.broadcast_multiply %1512, %cst_770 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1514 = chlo.broadcast_add %1513, %cst_771 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1515 = "mhlo.reshape"(%1514) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1516 = "mhlo.dot"(%1515, %cst_756) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1517 = chlo.broadcast_add %1516, %cst_757 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1518 = "mhlo.reshape"(%1517) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1519 = "mhlo.transpose"(%1518) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1520 = "mhlo.dot"(%1515, %cst_752) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1521 = "mhlo.reshape"(%1520) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1522 = "mhlo.broadcast_in_dim"(%cst_753) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1523 = mhlo.add %1521, %1522 : tensor<1x384x128xf32>
    %1524 = chlo.broadcast_multiply %1523, %cst_754 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1525 = chlo.broadcast_add %1524, %cst_755 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1526 = "mhlo.reshape"(%1525) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1527 = "mhlo.dot"(%1526, %cst_760) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1528 = chlo.broadcast_add %1527, %cst_761 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1529 = "mhlo.reshape"(%1528) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1530 = "mhlo.transpose"(%1529) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1531 = "mhlo.dot"(%1526, %cst_758) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1532 = chlo.broadcast_add %1531, %cst_759 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1533 = "mhlo.reshape"(%1532) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1534 = "mhlo.transpose"(%1533) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1535 = "mhlo.dot_general"(%1534, %1530) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1536 = chlo.broadcast_multiply %1535, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1537 = chlo.broadcast_add %1536, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1538 = "mhlo.reduce"(%1537, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1539 = linalg.tensor_expand_shape %1538 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1540 = chlo.broadcast_subtract %1537, %1539 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1541 = "mhlo.exponential"(%1540) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1542 = "mhlo.reduce"(%1541, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1543 = linalg.tensor_expand_shape %1542 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1544 = chlo.broadcast_divide %1541, %1543 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1545 = "mhlo.dot_general"(%1544, %1519) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1546 = "mhlo.transpose"(%1545) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1547 = "mhlo.reshape"(%1546) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1548 = "mhlo.dot"(%1547, %cst_762) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1549 = chlo.broadcast_add %1548, %cst_763 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1550 = "mhlo.reshape"(%1549) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1551 = "mhlo.dot"(%1515, %cst_749) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1552 = chlo.broadcast_add %1551, %cst_750 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1553 = "mhlo.reshape"(%1552) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1554 = chlo.broadcast_multiply %1553, %cst_751 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1555 = chlo.broadcast_add %1554, %cst_763 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1556 = chlo.broadcast_add %1550, %1555 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1557 = chlo.broadcast_multiply %1556, %cst_764 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1558 = chlo.broadcast_add %1557, %cst_765 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1559 = "mhlo.reshape"(%1558) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1560 = "mhlo.dot"(%1559, %cst_747) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1561 = chlo.broadcast_add %1560, %cst_748 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1562 = "mhlo.reshape"(%1561) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1563 = chlo.broadcast_maximum %1562, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1564 = "mhlo.reshape"(%1563) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1565 = "mhlo.dot"(%1564, %cst_743) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1566 = chlo.broadcast_add %1565, %cst_744 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1567 = "mhlo.reshape"(%1566) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1568 = chlo.broadcast_add %1567, %1558 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1569 = chlo.broadcast_multiply %1568, %cst_745 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1570 = chlo.broadcast_add %1569, %cst_746 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1571 = "mhlo.reshape"(%1570) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1572 = "mhlo.dot"(%1571, %cst_741) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1573 = chlo.broadcast_add %1572, %cst_742 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1574 = "mhlo.reshape"(%1573) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1575 = chlo.broadcast_maximum %1574, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1576 = "mhlo.reshape"(%1575) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1577 = "mhlo.dot"(%1576, %cst_737) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1578 = chlo.broadcast_add %1577, %cst_738 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1579 = "mhlo.reshape"(%1578) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1580 = chlo.broadcast_add %1579, %1570 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1581 = chlo.broadcast_multiply %1580, %cst_739 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1582 = chlo.broadcast_add %1581, %cst_740 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1583 = "mhlo.reshape"(%1582) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1584 = "mhlo.dot"(%1583, %cst_735) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1585 = chlo.broadcast_add %1584, %cst_736 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1586 = "mhlo.reshape"(%1585) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1587 = chlo.broadcast_maximum %1586, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1588 = "mhlo.reshape"(%1587) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1589 = "mhlo.dot"(%1588, %cst_731) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1590 = chlo.broadcast_add %1589, %cst_732 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1591 = "mhlo.reshape"(%1590) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1592 = chlo.broadcast_add %1591, %1582 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1593 = chlo.broadcast_multiply %1592, %cst_733 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1594 = chlo.broadcast_add %1593, %cst_734 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1595 = "mhlo.reshape"(%1594) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1596 = "mhlo.dot"(%1595, %cst_729) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1597 = chlo.broadcast_add %1596, %cst_730 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1598 = "mhlo.reshape"(%1597) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1599 = chlo.broadcast_maximum %1598, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1600 = "mhlo.reshape"(%1599) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1601 = "mhlo.dot"(%1600, %cst_721) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1602 = chlo.broadcast_add %1601, %cst_722 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1603 = "mhlo.reshape"(%1602) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1604 = chlo.broadcast_add %1603, %1594 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1605 = chlo.broadcast_multiply %1604, %cst_727 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1606 = chlo.broadcast_add %1605, %cst_728 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1607 = "mhlo.reshape"(%1606) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1608 = "mhlo.dot"(%1607, %cst_723) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1609 = chlo.broadcast_add %1608, %cst_724 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1610 = "mhlo.reshape"(%1609) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1611 = chlo.broadcast_add %1610, %1514 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1612 = chlo.broadcast_multiply %1611, %cst_725 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1613 = chlo.broadcast_add %1612, %cst_726 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1614 = "mhlo.reshape"(%1613) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1615 = "mhlo.dot"(%1614, %cst_711) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1616 = chlo.broadcast_add %1615, %cst_712 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1617 = "mhlo.reshape"(%1616) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1618 = "mhlo.transpose"(%1617) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1619 = "mhlo.dot"(%1614, %cst_707) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1620 = "mhlo.reshape"(%1619) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1621 = "mhlo.broadcast_in_dim"(%cst_708) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1622 = mhlo.add %1620, %1621 : tensor<1x384x128xf32>
    %1623 = chlo.broadcast_multiply %1622, %cst_709 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1624 = chlo.broadcast_add %1623, %cst_710 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1625 = "mhlo.reshape"(%1624) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1626 = "mhlo.dot"(%1625, %cst_715) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1627 = chlo.broadcast_add %1626, %cst_716 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1628 = "mhlo.reshape"(%1627) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1629 = "mhlo.transpose"(%1628) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1630 = "mhlo.dot"(%1625, %cst_713) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1631 = chlo.broadcast_add %1630, %cst_714 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1632 = "mhlo.reshape"(%1631) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1633 = "mhlo.transpose"(%1632) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1634 = "mhlo.dot_general"(%1633, %1629) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1635 = chlo.broadcast_multiply %1634, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1636 = chlo.broadcast_add %1635, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1637 = "mhlo.reduce"(%1636, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1638 = linalg.tensor_expand_shape %1637 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1639 = chlo.broadcast_subtract %1636, %1638 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1640 = "mhlo.exponential"(%1639) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1641 = "mhlo.reduce"(%1640, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1642 = linalg.tensor_expand_shape %1641 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1643 = chlo.broadcast_divide %1640, %1642 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1644 = "mhlo.dot_general"(%1643, %1618) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1645 = "mhlo.transpose"(%1644) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1646 = "mhlo.reshape"(%1645) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1647 = "mhlo.dot"(%1646, %cst_717) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1648 = chlo.broadcast_add %1647, %cst_718 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1649 = "mhlo.reshape"(%1648) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1650 = "mhlo.dot"(%1614, %cst_704) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1651 = chlo.broadcast_add %1650, %cst_705 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1652 = "mhlo.reshape"(%1651) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1653 = chlo.broadcast_multiply %1652, %cst_706 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1654 = chlo.broadcast_add %1653, %cst_718 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1655 = chlo.broadcast_add %1649, %1654 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1656 = chlo.broadcast_multiply %1655, %cst_719 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1657 = chlo.broadcast_add %1656, %cst_720 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1658 = "mhlo.reshape"(%1657) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1659 = "mhlo.dot"(%1658, %cst_702) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1660 = chlo.broadcast_add %1659, %cst_703 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1661 = "mhlo.reshape"(%1660) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1662 = chlo.broadcast_maximum %1661, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1663 = "mhlo.reshape"(%1662) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1664 = "mhlo.dot"(%1663, %cst_698) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1665 = chlo.broadcast_add %1664, %cst_699 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1666 = "mhlo.reshape"(%1665) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1667 = chlo.broadcast_add %1666, %1657 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1668 = chlo.broadcast_multiply %1667, %cst_700 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1669 = chlo.broadcast_add %1668, %cst_701 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1670 = "mhlo.reshape"(%1669) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1671 = "mhlo.dot"(%1670, %cst_696) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1672 = chlo.broadcast_add %1671, %cst_697 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1673 = "mhlo.reshape"(%1672) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1674 = chlo.broadcast_maximum %1673, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1675 = "mhlo.reshape"(%1674) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1676 = "mhlo.dot"(%1675, %cst_692) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1677 = chlo.broadcast_add %1676, %cst_693 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1678 = "mhlo.reshape"(%1677) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1679 = chlo.broadcast_add %1678, %1669 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1680 = chlo.broadcast_multiply %1679, %cst_694 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1681 = chlo.broadcast_add %1680, %cst_695 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1682 = "mhlo.reshape"(%1681) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1683 = "mhlo.dot"(%1682, %cst_690) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1684 = chlo.broadcast_add %1683, %cst_691 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1685 = "mhlo.reshape"(%1684) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1686 = chlo.broadcast_maximum %1685, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1687 = "mhlo.reshape"(%1686) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1688 = "mhlo.dot"(%1687, %cst_686) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1689 = chlo.broadcast_add %1688, %cst_687 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1690 = "mhlo.reshape"(%1689) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1691 = chlo.broadcast_add %1690, %1681 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1692 = chlo.broadcast_multiply %1691, %cst_688 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1693 = chlo.broadcast_add %1692, %cst_689 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1694 = "mhlo.reshape"(%1693) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1695 = "mhlo.dot"(%1694, %cst_684) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1696 = chlo.broadcast_add %1695, %cst_685 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1697 = "mhlo.reshape"(%1696) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1698 = chlo.broadcast_maximum %1697, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1699 = "mhlo.reshape"(%1698) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1700 = "mhlo.dot"(%1699, %cst_676) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1701 = chlo.broadcast_add %1700, %cst_677 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1702 = "mhlo.reshape"(%1701) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1703 = chlo.broadcast_add %1702, %1693 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1704 = chlo.broadcast_multiply %1703, %cst_682 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1705 = chlo.broadcast_add %1704, %cst_683 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1706 = "mhlo.reshape"(%1705) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1707 = "mhlo.dot"(%1706, %cst_678) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1708 = chlo.broadcast_add %1707, %cst_679 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1709 = "mhlo.reshape"(%1708) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1710 = chlo.broadcast_add %1709, %1613 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1711 = chlo.broadcast_multiply %1710, %cst_680 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1712 = chlo.broadcast_add %1711, %cst_681 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1713 = "mhlo.reshape"(%1712) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1714 = "mhlo.dot"(%1713, %cst_666) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1715 = chlo.broadcast_add %1714, %cst_667 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1716 = "mhlo.reshape"(%1715) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1717 = "mhlo.transpose"(%1716) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1718 = "mhlo.dot"(%1713, %cst_662) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1719 = "mhlo.reshape"(%1718) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1720 = "mhlo.broadcast_in_dim"(%cst_663) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1721 = mhlo.add %1719, %1720 : tensor<1x384x128xf32>
    %1722 = chlo.broadcast_multiply %1721, %cst_664 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1723 = chlo.broadcast_add %1722, %cst_665 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1724 = "mhlo.reshape"(%1723) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1725 = "mhlo.dot"(%1724, %cst_670) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1726 = chlo.broadcast_add %1725, %cst_671 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1727 = "mhlo.reshape"(%1726) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1728 = "mhlo.transpose"(%1727) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1729 = "mhlo.dot"(%1724, %cst_668) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1730 = chlo.broadcast_add %1729, %cst_669 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1731 = "mhlo.reshape"(%1730) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1732 = "mhlo.transpose"(%1731) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1733 = "mhlo.dot_general"(%1732, %1728) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1734 = chlo.broadcast_multiply %1733, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1735 = chlo.broadcast_add %1734, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1736 = "mhlo.reduce"(%1735, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1737 = linalg.tensor_expand_shape %1736 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1738 = chlo.broadcast_subtract %1735, %1737 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1739 = "mhlo.exponential"(%1738) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1740 = "mhlo.reduce"(%1739, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1741 = linalg.tensor_expand_shape %1740 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1742 = chlo.broadcast_divide %1739, %1741 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1743 = "mhlo.dot_general"(%1742, %1717) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1744 = "mhlo.transpose"(%1743) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1745 = "mhlo.reshape"(%1744) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1746 = "mhlo.dot"(%1745, %cst_672) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1747 = chlo.broadcast_add %1746, %cst_673 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1748 = "mhlo.reshape"(%1747) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1749 = "mhlo.dot"(%1713, %cst_659) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1750 = chlo.broadcast_add %1749, %cst_660 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1751 = "mhlo.reshape"(%1750) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1752 = chlo.broadcast_multiply %1751, %cst_661 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1753 = chlo.broadcast_add %1752, %cst_673 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1754 = chlo.broadcast_add %1748, %1753 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1755 = chlo.broadcast_multiply %1754, %cst_674 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1756 = chlo.broadcast_add %1755, %cst_675 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1757 = "mhlo.reshape"(%1756) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1758 = "mhlo.dot"(%1757, %cst_657) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1759 = chlo.broadcast_add %1758, %cst_658 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1760 = "mhlo.reshape"(%1759) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1761 = chlo.broadcast_maximum %1760, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1762 = "mhlo.reshape"(%1761) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1763 = "mhlo.dot"(%1762, %cst_653) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1764 = chlo.broadcast_add %1763, %cst_654 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1765 = "mhlo.reshape"(%1764) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1766 = chlo.broadcast_add %1765, %1756 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1767 = chlo.broadcast_multiply %1766, %cst_655 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1768 = chlo.broadcast_add %1767, %cst_656 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1769 = "mhlo.reshape"(%1768) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1770 = "mhlo.dot"(%1769, %cst_651) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1771 = chlo.broadcast_add %1770, %cst_652 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1772 = "mhlo.reshape"(%1771) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1773 = chlo.broadcast_maximum %1772, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1774 = "mhlo.reshape"(%1773) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1775 = "mhlo.dot"(%1774, %cst_647) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1776 = chlo.broadcast_add %1775, %cst_648 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1777 = "mhlo.reshape"(%1776) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1778 = chlo.broadcast_add %1777, %1768 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1779 = chlo.broadcast_multiply %1778, %cst_649 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1780 = chlo.broadcast_add %1779, %cst_650 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1781 = "mhlo.reshape"(%1780) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1782 = "mhlo.dot"(%1781, %cst_645) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1783 = chlo.broadcast_add %1782, %cst_646 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1784 = "mhlo.reshape"(%1783) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1785 = chlo.broadcast_maximum %1784, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1786 = "mhlo.reshape"(%1785) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1787 = "mhlo.dot"(%1786, %cst_641) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1788 = chlo.broadcast_add %1787, %cst_642 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1789 = "mhlo.reshape"(%1788) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1790 = chlo.broadcast_add %1789, %1780 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1791 = chlo.broadcast_multiply %1790, %cst_643 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1792 = chlo.broadcast_add %1791, %cst_644 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1793 = "mhlo.reshape"(%1792) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1794 = "mhlo.dot"(%1793, %cst_639) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1795 = chlo.broadcast_add %1794, %cst_640 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1796 = "mhlo.reshape"(%1795) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1797 = chlo.broadcast_maximum %1796, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1798 = "mhlo.reshape"(%1797) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1799 = "mhlo.dot"(%1798, %cst_631) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1800 = chlo.broadcast_add %1799, %cst_632 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1801 = "mhlo.reshape"(%1800) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1802 = chlo.broadcast_add %1801, %1792 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1803 = chlo.broadcast_multiply %1802, %cst_637 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1804 = chlo.broadcast_add %1803, %cst_638 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1805 = "mhlo.reshape"(%1804) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1806 = "mhlo.dot"(%1805, %cst_633) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1807 = chlo.broadcast_add %1806, %cst_634 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1808 = "mhlo.reshape"(%1807) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1809 = chlo.broadcast_add %1808, %1712 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1810 = chlo.broadcast_multiply %1809, %cst_635 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1811 = chlo.broadcast_add %1810, %cst_636 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1812 = "mhlo.reshape"(%1811) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1813 = "mhlo.dot"(%1812, %cst_621) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1814 = chlo.broadcast_add %1813, %cst_622 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1815 = "mhlo.reshape"(%1814) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1816 = "mhlo.transpose"(%1815) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1817 = "mhlo.dot"(%1812, %cst_617) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1818 = "mhlo.reshape"(%1817) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1819 = "mhlo.broadcast_in_dim"(%cst_618) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1820 = mhlo.add %1818, %1819 : tensor<1x384x128xf32>
    %1821 = chlo.broadcast_multiply %1820, %cst_619 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1822 = chlo.broadcast_add %1821, %cst_620 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1823 = "mhlo.reshape"(%1822) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1824 = "mhlo.dot"(%1823, %cst_625) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1825 = chlo.broadcast_add %1824, %cst_626 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1826 = "mhlo.reshape"(%1825) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1827 = "mhlo.transpose"(%1826) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1828 = "mhlo.dot"(%1823, %cst_623) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1829 = chlo.broadcast_add %1828, %cst_624 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1830 = "mhlo.reshape"(%1829) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1831 = "mhlo.transpose"(%1830) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %1832 = "mhlo.dot_general"(%1831, %1827) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %1833 = chlo.broadcast_multiply %1832, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1834 = chlo.broadcast_add %1833, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1835 = "mhlo.reduce"(%1834, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1836 = linalg.tensor_expand_shape %1835 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1837 = chlo.broadcast_subtract %1834, %1836 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1838 = "mhlo.exponential"(%1837) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1839 = "mhlo.reduce"(%1838, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1840 = linalg.tensor_expand_shape %1839 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1841 = chlo.broadcast_divide %1838, %1840 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1842 = "mhlo.dot_general"(%1841, %1816) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %1843 = "mhlo.transpose"(%1842) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1844 = "mhlo.reshape"(%1843) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %1845 = "mhlo.dot"(%1844, %cst_627) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1846 = chlo.broadcast_add %1845, %cst_628 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1847 = "mhlo.reshape"(%1846) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1848 = "mhlo.dot"(%1812, %cst_614) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1849 = chlo.broadcast_add %1848, %cst_615 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1850 = "mhlo.reshape"(%1849) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1851 = chlo.broadcast_multiply %1850, %cst_616 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1852 = chlo.broadcast_add %1851, %cst_628 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1853 = chlo.broadcast_add %1847, %1852 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1854 = chlo.broadcast_multiply %1853, %cst_629 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1855 = chlo.broadcast_add %1854, %cst_630 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1856 = "mhlo.reshape"(%1855) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1857 = "mhlo.dot"(%1856, %cst_612) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1858 = chlo.broadcast_add %1857, %cst_613 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1859 = "mhlo.reshape"(%1858) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1860 = chlo.broadcast_maximum %1859, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1861 = "mhlo.reshape"(%1860) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1862 = "mhlo.dot"(%1861, %cst_608) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1863 = chlo.broadcast_add %1862, %cst_609 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1864 = "mhlo.reshape"(%1863) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1865 = chlo.broadcast_add %1864, %1855 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1866 = chlo.broadcast_multiply %1865, %cst_610 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1867 = chlo.broadcast_add %1866, %cst_611 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1867 (1x384x128): matmul with %cst_606 (128x512) plus bias
    // %cst_607, elementwise maximum against scalar %1, matmul with %cst_602 (512x128) plus
    // bias %cst_603, add %1867 back, then per-channel multiply/add along dim 2 -> %1879.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1868 = "mhlo.reshape"(%1867) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1869 = "mhlo.dot"(%1868, %cst_606) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1870 = chlo.broadcast_add %1869, %cst_607 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1871 = "mhlo.reshape"(%1870) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1872 = chlo.broadcast_maximum %1871, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1873 = "mhlo.reshape"(%1872) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1874 = "mhlo.dot"(%1873, %cst_602) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1875 = chlo.broadcast_add %1874, %cst_603 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1876 = "mhlo.reshape"(%1875) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1877 = chlo.broadcast_add %1876, %1867 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_604) and shift (%cst_605) along dim 2.
    %1878 = chlo.broadcast_multiply %1877, %cst_604 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1879 = chlo.broadcast_add %1878, %cst_605 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1879 (1x384x128): matmul with %cst_600 (128x512) plus bias
    // %cst_601, elementwise maximum against scalar %1, matmul with %cst_596 (512x128) plus
    // bias %cst_597, add %1879 back, then per-channel multiply/add along dim 2 -> %1891.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1880 = "mhlo.reshape"(%1879) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1881 = "mhlo.dot"(%1880, %cst_600) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1882 = chlo.broadcast_add %1881, %cst_601 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1883 = "mhlo.reshape"(%1882) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1884 = chlo.broadcast_maximum %1883, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1885 = "mhlo.reshape"(%1884) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1886 = "mhlo.dot"(%1885, %cst_596) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1887 = chlo.broadcast_add %1886, %cst_597 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1888 = "mhlo.reshape"(%1887) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1889 = chlo.broadcast_add %1888, %1879 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_598) and shift (%cst_599) along dim 2.
    %1890 = chlo.broadcast_multiply %1889, %cst_598 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1891 = chlo.broadcast_add %1890, %cst_599 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1891 (1x384x128): matmul with %cst_594 (128x512) plus bias
    // %cst_595, elementwise maximum against scalar %1, matmul with %cst_586 (512x128) plus
    // bias %cst_587, add %1891 back, then per-channel multiply/add along dim 2 -> %1903.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1892 = "mhlo.reshape"(%1891) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1893 = "mhlo.dot"(%1892, %cst_594) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1894 = chlo.broadcast_add %1893, %cst_595 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1895 = "mhlo.reshape"(%1894) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1896 = chlo.broadcast_maximum %1895, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1897 = "mhlo.reshape"(%1896) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1898 = "mhlo.dot"(%1897, %cst_586) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1899 = chlo.broadcast_add %1898, %cst_587 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1900 = "mhlo.reshape"(%1899) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1901 = chlo.broadcast_add %1900, %1891 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_592) and shift (%cst_593) along dim 2.
    %1902 = chlo.broadcast_multiply %1901, %cst_592 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1903 = chlo.broadcast_add %1902, %cst_593 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Expand the 128-wide activations %1903 back to 512 channels: matmul with %cst_588
    // (128x512) plus bias %cst_589, add the 512-wide tensor %1811, then per-channel
    // multiply (%cst_590) and add (%cst_591) along dim 2 -> %1910.
    // NOTE(review): %1811 is defined before this excerpt; presumably the 512-wide input of
    // this encoder layer (a layer-level residual) - confirm.
    %1904 = "mhlo.reshape"(%1903) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1905 = "mhlo.dot"(%1904, %cst_588) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1906 = chlo.broadcast_add %1905, %cst_589 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1907 = "mhlo.reshape"(%1906) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1908 = chlo.broadcast_add %1907, %1811 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %1909 = chlo.broadcast_multiply %1908, %cst_590 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %1910 = chlo.broadcast_add %1909, %cst_591 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    // Build the three 1x4x384x32 operands consumed by the two mhlo.dot_general ops that follow.
    // NOTE(review): by their use, %1930/%1926/%1915 are presumably the attention
    // query/key/value tensors with 4 heads of width 32 - confirm against the model definition.
    //
    // %1911 is the 512-wide input %1910 flattened to rank 2; it feeds three separate matmuls.
    %1911 = "mhlo.reshape"(%1910) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    // Operand 1 (%1915): projected straight from the 512-wide input with %cst_576 + bias
    // %cst_577, then the 128 channels are split into 4x32 and the size-4 dim is moved to dim 1.
    %1912 = "mhlo.dot"(%1911, %cst_576) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1913 = chlo.broadcast_add %1912, %cst_577 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1914 = "mhlo.reshape"(%1913) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1915 = "mhlo.transpose"(%1914) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // 512 -> 128 reduction shared by operands 2 and 3: matmul with %cst_572, bias %cst_573
    // (materialised with broadcast_in_dim + mhlo.add), then per-channel multiply/add by
    // %cst_574 / %cst_575 along dim 2.
    %1916 = "mhlo.dot"(%1911, %cst_572) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1917 = "mhlo.reshape"(%1916) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1918 = "mhlo.broadcast_in_dim"(%cst_573) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %1919 = mhlo.add %1917, %1918 : tensor<1x384x128xf32>
    %1920 = chlo.broadcast_multiply %1919, %cst_574 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1921 = chlo.broadcast_add %1920, %cst_575 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1922 = "mhlo.reshape"(%1921) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    // Operand 2 (%1926): 128x128 projection %cst_580 + bias %cst_581, split into 4x32, transposed.
    %1923 = "mhlo.dot"(%1922, %cst_580) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1924 = chlo.broadcast_add %1923, %cst_581 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1925 = "mhlo.reshape"(%1924) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1926 = "mhlo.transpose"(%1925) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Operand 3 (%1930): 128x128 projection %cst_578 + bias %cst_579, split into 4x32, transposed.
    %1927 = "mhlo.dot"(%1922, %cst_578) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1928 = chlo.broadcast_add %1927, %cst_579 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1929 = "mhlo.reshape"(%1928) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %1930 = "mhlo.transpose"(%1929) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Pairwise scores: batch over dims [0, 1] and contract the 32-wide dim 3 of both
    // operands, giving a 1x4x384x384 tensor.
    %1931 = "mhlo.dot_general"(%1930, %1926) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    // Multiply by scalar %4, then add %24 (1x1x384x384, broadcast along the size-4 dim 1).
    // NOTE(review): %4 and %24 are defined outside this excerpt; presumably the score scaling
    // factor and an additive attention mask - confirm.
    %1932 = chlo.broadcast_multiply %1931, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %1933 = chlo.broadcast_add %1932, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Normalisation along dim 3 in max-subtracted (softmax-style) form:
    //   1. reduce dim 3 with maximum (init %2), 2. subtract that maximum, 3. exponentiate,
    //   4. reduce dim 3 with add (init %1), 5. divide by the sum.
    // NOTE(review): %2 and %1 are defined outside this excerpt; this is a true softmax only
    // if they are the identities of max (-inf / lowest float) and add (0.0) - confirm.
    %1934 = "mhlo.reduce"(%1933, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    // Re-insert the reduced dim as size 1 so the result broadcasts against the 1x4x384x384 input.
    %1935 = linalg.tensor_expand_shape %1934 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1936 = chlo.broadcast_subtract %1933, %1935 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %1937 = "mhlo.exponential"(%1936) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %1938 = "mhlo.reduce"(%1937, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %1939 = linalg.tensor_expand_shape %1938 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %1940 = chlo.broadcast_divide %1937, %1939 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    // Weighted sum: contract dim 3 of the normalised scores %1940 with dim 2 of %1915
    // (batching over dims [0, 1]) -> 1x4x384x32.
    %1941 = "mhlo.dot_general"(%1940, %1915) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    // Undo the earlier split: move the size-4 dim back next to the 32-wide dim and merge
    // them into 128 channels.
    %1942 = "mhlo.transpose"(%1941) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %1943 = "mhlo.reshape"(%1942) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    // 128x128 dense layer on the merged result: %cst_582 plus bias %cst_583.
    %1944 = "mhlo.dot"(%1943, %cst_582) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %1945 = chlo.broadcast_add %1944, %cst_583 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1946 = "mhlo.reshape"(%1945) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Second 512 -> 128 path from the flattened input %1911: matmul with %cst_569 plus
    // bias %cst_570, then per-channel multiply by %cst_571 and add of %cst_583.
    // NOTE(review): %cst_583 is the same constant already used as the bias of %1945 above;
    // presumably two identical constants were merged by an earlier pass - confirm.
    %1947 = "mhlo.dot"(%1911, %cst_569) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1948 = chlo.broadcast_add %1947, %cst_570 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1949 = "mhlo.reshape"(%1948) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %1950 = chlo.broadcast_multiply %1949, %cst_571 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1951 = chlo.broadcast_add %1950, %cst_583 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Sum the two 128-wide paths, then per-channel multiply/add by %cst_584 / %cst_585.
    %1952 = chlo.broadcast_add %1946, %1951 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %1953 = chlo.broadcast_multiply %1952, %cst_584 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1954 = chlo.broadcast_add %1953, %cst_585 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1954 (1x384x128): matmul with %cst_567 (128x512) plus bias
    // %cst_568, elementwise maximum against scalar %1, matmul with %cst_563 (512x128) plus
    // bias %cst_564, add %1954 back, then per-channel multiply/add along dim 2 -> %1966.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1955 = "mhlo.reshape"(%1954) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1956 = "mhlo.dot"(%1955, %cst_567) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1957 = chlo.broadcast_add %1956, %cst_568 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1958 = "mhlo.reshape"(%1957) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1959 = chlo.broadcast_maximum %1958, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1960 = "mhlo.reshape"(%1959) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1961 = "mhlo.dot"(%1960, %cst_563) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1962 = chlo.broadcast_add %1961, %cst_564 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1963 = "mhlo.reshape"(%1962) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1964 = chlo.broadcast_add %1963, %1954 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_565) and shift (%cst_566) along dim 2.
    %1965 = chlo.broadcast_multiply %1964, %cst_565 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1966 = chlo.broadcast_add %1965, %cst_566 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1966 (1x384x128): matmul with %cst_561 (128x512) plus bias
    // %cst_562, elementwise maximum against scalar %1, matmul with %cst_557 (512x128) plus
    // bias %cst_558, add %1966 back, then per-channel multiply/add along dim 2 -> %1978.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1967 = "mhlo.reshape"(%1966) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1968 = "mhlo.dot"(%1967, %cst_561) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1969 = chlo.broadcast_add %1968, %cst_562 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1970 = "mhlo.reshape"(%1969) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1971 = chlo.broadcast_maximum %1970, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1972 = "mhlo.reshape"(%1971) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1973 = "mhlo.dot"(%1972, %cst_557) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1974 = chlo.broadcast_add %1973, %cst_558 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1975 = "mhlo.reshape"(%1974) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1976 = chlo.broadcast_add %1975, %1966 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_559) and shift (%cst_560) along dim 2.
    %1977 = chlo.broadcast_multiply %1976, %cst_559 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1978 = chlo.broadcast_add %1977, %cst_560 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1978 (1x384x128): matmul with %cst_555 (128x512) plus bias
    // %cst_556, elementwise maximum against scalar %1, matmul with %cst_551 (512x128) plus
    // bias %cst_552, add %1978 back, then per-channel multiply/add along dim 2 -> %1990.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1979 = "mhlo.reshape"(%1978) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1980 = "mhlo.dot"(%1979, %cst_555) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1981 = chlo.broadcast_add %1980, %cst_556 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1982 = "mhlo.reshape"(%1981) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1983 = chlo.broadcast_maximum %1982, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1984 = "mhlo.reshape"(%1983) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1985 = "mhlo.dot"(%1984, %cst_551) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1986 = chlo.broadcast_add %1985, %cst_552 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1987 = "mhlo.reshape"(%1986) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %1988 = chlo.broadcast_add %1987, %1978 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_553) and shift (%cst_554) along dim 2.
    %1989 = chlo.broadcast_multiply %1988, %cst_553 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %1990 = chlo.broadcast_add %1989, %cst_554 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %1990 (1x384x128): matmul with %cst_549 (128x512) plus bias
    // %cst_550, elementwise maximum against scalar %1, matmul with %cst_541 (512x128) plus
    // bias %cst_542, add %1990 back, then per-channel multiply/add along dim 2 -> %2002.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %1991 = "mhlo.reshape"(%1990) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %1992 = "mhlo.dot"(%1991, %cst_549) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %1993 = chlo.broadcast_add %1992, %cst_550 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %1994 = "mhlo.reshape"(%1993) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %1995 = chlo.broadcast_maximum %1994, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %1996 = "mhlo.reshape"(%1995) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %1997 = "mhlo.dot"(%1996, %cst_541) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %1998 = chlo.broadcast_add %1997, %cst_542 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %1999 = "mhlo.reshape"(%1998) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %2000 = chlo.broadcast_add %1999, %1990 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_547) and shift (%cst_548) along dim 2.
    %2001 = chlo.broadcast_multiply %2000, %cst_547 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2002 = chlo.broadcast_add %2001, %cst_548 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Expand the 128-wide activations %2002 back to 512 channels: matmul with %cst_543
    // (128x512) plus bias %cst_544, add the 512-wide tensor %1910, then per-channel
    // multiply (%cst_545) and add (%cst_546) along dim 2 -> %2009.
    // %1910 is the same tensor that was flattened into %1911 to feed the preceding
    // 512 -> 128 projections, so this add closes a residual around that whole span.
    %2003 = "mhlo.reshape"(%2002) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2004 = "mhlo.dot"(%2003, %cst_543) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2005 = chlo.broadcast_add %2004, %cst_544 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2006 = "mhlo.reshape"(%2005) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2007 = chlo.broadcast_add %2006, %1910 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %2008 = chlo.broadcast_multiply %2007, %cst_545 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2009 = chlo.broadcast_add %2008, %cst_546 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    // Build the three 1x4x384x32 operands consumed by the two mhlo.dot_general ops that follow.
    // NOTE(review): by their use, %2029/%2025/%2014 are presumably the attention
    // query/key/value tensors with 4 heads of width 32 - confirm against the model definition.
    //
    // %2010 is the 512-wide input %2009 flattened to rank 2; it feeds three separate matmuls.
    %2010 = "mhlo.reshape"(%2009) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    // Operand 1 (%2014): projected straight from the 512-wide input with %cst_486 + bias
    // %cst_487, then the 128 channels are split into 4x32 and the size-4 dim is moved to dim 1.
    %2011 = "mhlo.dot"(%2010, %cst_486) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2012 = chlo.broadcast_add %2011, %cst_487 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2013 = "mhlo.reshape"(%2012) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2014 = "mhlo.transpose"(%2013) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // 512 -> 128 reduction shared by operands 2 and 3: matmul with %cst_482, bias %cst_483
    // (materialised with broadcast_in_dim + mhlo.add), then per-channel multiply/add by
    // %cst_484 / %cst_485 along dim 2.
    %2015 = "mhlo.dot"(%2010, %cst_482) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2016 = "mhlo.reshape"(%2015) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2017 = "mhlo.broadcast_in_dim"(%cst_483) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %2018 = mhlo.add %2016, %2017 : tensor<1x384x128xf32>
    %2019 = chlo.broadcast_multiply %2018, %cst_484 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2020 = chlo.broadcast_add %2019, %cst_485 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2021 = "mhlo.reshape"(%2020) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    // Operand 2 (%2025): 128x128 projection %cst_490 + bias %cst_491, split into 4x32, transposed.
    %2022 = "mhlo.dot"(%2021, %cst_490) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2023 = chlo.broadcast_add %2022, %cst_491 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2024 = "mhlo.reshape"(%2023) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2025 = "mhlo.transpose"(%2024) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Operand 3 (%2029): 128x128 projection %cst_488 + bias %cst_489, split into 4x32, transposed.
    %2026 = "mhlo.dot"(%2021, %cst_488) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2027 = chlo.broadcast_add %2026, %cst_489 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2028 = "mhlo.reshape"(%2027) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2029 = "mhlo.transpose"(%2028) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    // Pairwise scores: batch over dims [0, 1] and contract the 32-wide dim 3 of both
    // operands, giving a 1x4x384x384 tensor.
    %2030 = "mhlo.dot_general"(%2029, %2025) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    // Multiply by scalar %4, then add %24 (1x1x384x384, broadcast along the size-4 dim 1).
    // NOTE(review): %4 and %24 are defined outside this excerpt; presumably the score scaling
    // factor and an additive attention mask - confirm.
    %2031 = chlo.broadcast_multiply %2030, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %2032 = chlo.broadcast_add %2031, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    // Normalisation along dim 3 in max-subtracted (softmax-style) form:
    //   1. reduce dim 3 with maximum (init %2), 2. subtract that maximum, 3. exponentiate,
    //   4. reduce dim 3 with add (init %1), 5. divide by the sum.
    // NOTE(review): %2 and %1 are defined outside this excerpt; this is a true softmax only
    // if they are the identities of max (-inf / lowest float) and add (0.0) - confirm.
    %2033 = "mhlo.reduce"(%2032, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    // Re-insert the reduced dim as size 1 so the result broadcasts against the 1x4x384x384 input.
    %2034 = linalg.tensor_expand_shape %2033 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2035 = chlo.broadcast_subtract %2032, %2034 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2036 = "mhlo.exponential"(%2035) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2037 = "mhlo.reduce"(%2036, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2038 = linalg.tensor_expand_shape %2037 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2039 = chlo.broadcast_divide %2036, %2038 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    // Weighted sum: contract dim 3 of the normalised scores %2039 with dim 2 of %2014
    // (batching over dims [0, 1]) -> 1x4x384x32.
    %2040 = "mhlo.dot_general"(%2039, %2014) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    // Undo the earlier split: move the size-4 dim back next to the 32-wide dim and merge
    // them into 128 channels.
    %2041 = "mhlo.transpose"(%2040) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %2042 = "mhlo.reshape"(%2041) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    // 128x128 dense layer on the merged result: %cst_492 plus bias %cst_493.
    %2043 = "mhlo.dot"(%2042, %cst_492) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2044 = chlo.broadcast_add %2043, %cst_493 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2045 = "mhlo.reshape"(%2044) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Second 512 -> 128 path from the flattened input %2010: matmul with %cst_479 plus
    // bias %cst_480, then per-channel multiply by %cst_481 and add of %cst_493.
    // NOTE(review): %cst_493 is the same constant already used as the bias of %2044 above;
    // presumably two identical constants were merged by an earlier pass - confirm.
    %2046 = "mhlo.dot"(%2010, %cst_479) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2047 = chlo.broadcast_add %2046, %cst_480 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2048 = "mhlo.reshape"(%2047) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2049 = chlo.broadcast_multiply %2048, %cst_481 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2050 = chlo.broadcast_add %2049, %cst_493 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Sum the two 128-wide paths, then per-channel multiply/add by %cst_494 / %cst_495.
    %2051 = chlo.broadcast_add %2045, %2050 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2052 = chlo.broadcast_multiply %2051, %cst_494 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2053 = chlo.broadcast_add %2052, %cst_495 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %2053 (1x384x128): matmul with %cst_477 (128x512) plus bias
    // %cst_478, elementwise maximum against scalar %1, matmul with %cst_473 (512x128) plus
    // bias %cst_474, add %2053 back, then per-channel multiply/add along dim 2 -> %2065.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %2054 = "mhlo.reshape"(%2053) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2055 = "mhlo.dot"(%2054, %cst_477) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2056 = chlo.broadcast_add %2055, %cst_478 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2057 = "mhlo.reshape"(%2056) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2058 = chlo.broadcast_maximum %2057, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2059 = "mhlo.reshape"(%2058) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2060 = "mhlo.dot"(%2059, %cst_473) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2061 = chlo.broadcast_add %2060, %cst_474 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2062 = "mhlo.reshape"(%2061) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %2063 = chlo.broadcast_add %2062, %2053 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_475) and shift (%cst_476) along dim 2.
    %2064 = chlo.broadcast_multiply %2063, %cst_475 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2065 = chlo.broadcast_add %2064, %cst_476 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %2065 (1x384x128): matmul with %cst_471 (128x512) plus bias
    // %cst_472, elementwise maximum against scalar %1, matmul with %cst_467 (512x128) plus
    // bias %cst_468, add %2065 back, then per-channel multiply/add along dim 2 -> %2077.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %2066 = "mhlo.reshape"(%2065) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2067 = "mhlo.dot"(%2066, %cst_471) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2068 = chlo.broadcast_add %2067, %cst_472 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2069 = "mhlo.reshape"(%2068) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2070 = chlo.broadcast_maximum %2069, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2071 = "mhlo.reshape"(%2070) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2072 = "mhlo.dot"(%2071, %cst_467) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2073 = chlo.broadcast_add %2072, %cst_468 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2074 = "mhlo.reshape"(%2073) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %2075 = chlo.broadcast_add %2074, %2065 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_469) and shift (%cst_470) along dim 2.
    %2076 = chlo.broadcast_multiply %2075, %cst_469 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2077 = chlo.broadcast_add %2076, %cst_470 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %2077 (1x384x128): matmul with %cst_465 (128x512) plus bias
    // %cst_466, elementwise maximum against scalar %1, matmul with %cst_461 (512x128) plus
    // bias %cst_462, add %2077 back, then per-channel multiply/add along dim 2 -> %2089.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %2078 = "mhlo.reshape"(%2077) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2079 = "mhlo.dot"(%2078, %cst_465) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2080 = chlo.broadcast_add %2079, %cst_466 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2081 = "mhlo.reshape"(%2080) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2082 = chlo.broadcast_maximum %2081, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2083 = "mhlo.reshape"(%2082) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2084 = "mhlo.dot"(%2083, %cst_461) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2085 = chlo.broadcast_add %2084, %cst_462 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2086 = "mhlo.reshape"(%2085) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %2087 = chlo.broadcast_add %2086, %2077 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_463) and shift (%cst_464) along dim 2.
    %2088 = chlo.broadcast_multiply %2087, %cst_463 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2089 = chlo.broadcast_add %2088, %cst_464 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Feed-forward stage on %2089 (1x384x128): matmul with %cst_459 (128x512) plus bias
    // %cst_460, elementwise maximum against scalar %1, matmul with %cst_451 (512x128) plus
    // bias %cst_452, add %2089 back, then per-channel multiply/add along dim 2 -> %2101.
    // NOTE(review): %1 is defined outside this excerpt; the maximum is a ReLU only if %1 is 0.0 - confirm.
    %2090 = "mhlo.reshape"(%2089) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2091 = "mhlo.dot"(%2090, %cst_459) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2092 = chlo.broadcast_add %2091, %cst_460 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2093 = "mhlo.reshape"(%2092) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2094 = chlo.broadcast_maximum %2093, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2095 = "mhlo.reshape"(%2094) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2096 = "mhlo.dot"(%2095, %cst_451) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2097 = chlo.broadcast_add %2096, %cst_452 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2098 = "mhlo.reshape"(%2097) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    // Residual connection with the stage input.
    %2099 = chlo.broadcast_add %2098, %2089 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    // Per-channel scale (%cst_457) and shift (%cst_458) along dim 2.
    %2100 = chlo.broadcast_multiply %2099, %cst_457 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2101 = chlo.broadcast_add %2100, %cst_458 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    // Expand the 128-wide activations %2101 back to 512 channels: matmul with %cst_453
    // (128x512) plus bias %cst_454, add the 512-wide tensor %2009, then per-channel
    // multiply (%cst_455) and add (%cst_456) along dim 2 -> %2108.
    // %2009 is the same tensor that was flattened into %2010 to feed the preceding
    // 512 -> 128 projections, so this add closes a residual around that whole span.
    %2102 = "mhlo.reshape"(%2101) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2103 = "mhlo.dot"(%2102, %cst_453) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2104 = chlo.broadcast_add %2103, %cst_454 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2105 = "mhlo.reshape"(%2104) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2106 = chlo.broadcast_add %2105, %2009 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %2107 = chlo.broadcast_multiply %2106, %cst_455 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2108 = chlo.broadcast_add %2107, %cst_456 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2109 = "mhlo.reshape"(%2108) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2110 = "mhlo.dot"(%2109, %cst_441) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2111 = chlo.broadcast_add %2110, %cst_442 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2112 = "mhlo.reshape"(%2111) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2113 = "mhlo.transpose"(%2112) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2114 = "mhlo.dot"(%2109, %cst_437) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2115 = "mhlo.reshape"(%2114) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2116 = "mhlo.broadcast_in_dim"(%cst_438) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %2117 = mhlo.add %2115, %2116 : tensor<1x384x128xf32>
    %2118 = chlo.broadcast_multiply %2117, %cst_439 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2119 = chlo.broadcast_add %2118, %cst_440 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2120 = "mhlo.reshape"(%2119) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2121 = "mhlo.dot"(%2120, %cst_445) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2122 = chlo.broadcast_add %2121, %cst_446 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2123 = "mhlo.reshape"(%2122) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2124 = "mhlo.transpose"(%2123) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2125 = "mhlo.dot"(%2120, %cst_443) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2126 = chlo.broadcast_add %2125, %cst_444 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2127 = "mhlo.reshape"(%2126) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2128 = "mhlo.transpose"(%2127) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2129 = "mhlo.dot_general"(%2128, %2124) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %2130 = chlo.broadcast_multiply %2129, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %2131 = chlo.broadcast_add %2130, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2132 = "mhlo.reduce"(%2131, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2133 = linalg.tensor_expand_shape %2132 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2134 = chlo.broadcast_subtract %2131, %2133 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2135 = "mhlo.exponential"(%2134) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2136 = "mhlo.reduce"(%2135, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2137 = linalg.tensor_expand_shape %2136 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2138 = chlo.broadcast_divide %2135, %2137 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2139 = "mhlo.dot_general"(%2138, %2113) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %2140 = "mhlo.transpose"(%2139) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %2141 = "mhlo.reshape"(%2140) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %2142 = "mhlo.dot"(%2141, %cst_447) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2143 = chlo.broadcast_add %2142, %cst_448 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2144 = "mhlo.reshape"(%2143) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2145 = "mhlo.dot"(%2109, %cst_434) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2146 = chlo.broadcast_add %2145, %cst_435 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2147 = "mhlo.reshape"(%2146) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2148 = chlo.broadcast_multiply %2147, %cst_436 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2149 = chlo.broadcast_add %2148, %cst_448 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2150 = chlo.broadcast_add %2144, %2149 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2151 = chlo.broadcast_multiply %2150, %cst_449 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2152 = chlo.broadcast_add %2151, %cst_450 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2153 = "mhlo.reshape"(%2152) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2154 = "mhlo.dot"(%2153, %cst_432) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2155 = chlo.broadcast_add %2154, %cst_433 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2156 = "mhlo.reshape"(%2155) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2157 = chlo.broadcast_maximum %2156, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2158 = "mhlo.reshape"(%2157) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2159 = "mhlo.dot"(%2158, %cst_428) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2160 = chlo.broadcast_add %2159, %cst_429 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2161 = "mhlo.reshape"(%2160) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2162 = chlo.broadcast_add %2161, %2152 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2163 = chlo.broadcast_multiply %2162, %cst_430 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2164 = chlo.broadcast_add %2163, %cst_431 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2165 = "mhlo.reshape"(%2164) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2166 = "mhlo.dot"(%2165, %cst_426) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2167 = chlo.broadcast_add %2166, %cst_427 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2168 = "mhlo.reshape"(%2167) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2169 = chlo.broadcast_maximum %2168, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2170 = "mhlo.reshape"(%2169) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2171 = "mhlo.dot"(%2170, %cst_422) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2172 = chlo.broadcast_add %2171, %cst_423 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2173 = "mhlo.reshape"(%2172) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2174 = chlo.broadcast_add %2173, %2164 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2175 = chlo.broadcast_multiply %2174, %cst_424 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2176 = chlo.broadcast_add %2175, %cst_425 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2177 = "mhlo.reshape"(%2176) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2178 = "mhlo.dot"(%2177, %cst_420) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2179 = chlo.broadcast_add %2178, %cst_421 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2180 = "mhlo.reshape"(%2179) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2181 = chlo.broadcast_maximum %2180, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2182 = "mhlo.reshape"(%2181) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2183 = "mhlo.dot"(%2182, %cst_416) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2184 = chlo.broadcast_add %2183, %cst_417 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2185 = "mhlo.reshape"(%2184) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2186 = chlo.broadcast_add %2185, %2176 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2187 = chlo.broadcast_multiply %2186, %cst_418 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2188 = chlo.broadcast_add %2187, %cst_419 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2189 = "mhlo.reshape"(%2188) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2190 = "mhlo.dot"(%2189, %cst_414) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2191 = chlo.broadcast_add %2190, %cst_415 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2192 = "mhlo.reshape"(%2191) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2193 = chlo.broadcast_maximum %2192, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2194 = "mhlo.reshape"(%2193) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2195 = "mhlo.dot"(%2194, %cst_406) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2196 = chlo.broadcast_add %2195, %cst_407 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2197 = "mhlo.reshape"(%2196) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2198 = chlo.broadcast_add %2197, %2188 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2199 = chlo.broadcast_multiply %2198, %cst_412 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2200 = chlo.broadcast_add %2199, %cst_413 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2201 = "mhlo.reshape"(%2200) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2202 = "mhlo.dot"(%2201, %cst_408) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2203 = chlo.broadcast_add %2202, %cst_409 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2204 = "mhlo.reshape"(%2203) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2205 = chlo.broadcast_add %2204, %2108 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %2206 = chlo.broadcast_multiply %2205, %cst_410 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2207 = chlo.broadcast_add %2206, %cst_411 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2208 = "mhlo.reshape"(%2207) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2209 = "mhlo.dot"(%2208, %cst_396) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2210 = chlo.broadcast_add %2209, %cst_397 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2211 = "mhlo.reshape"(%2210) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2212 = "mhlo.transpose"(%2211) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2213 = "mhlo.dot"(%2208, %cst_392) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2214 = "mhlo.reshape"(%2213) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2215 = "mhlo.broadcast_in_dim"(%cst_393) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %2216 = mhlo.add %2214, %2215 : tensor<1x384x128xf32>
    %2217 = chlo.broadcast_multiply %2216, %cst_394 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2218 = chlo.broadcast_add %2217, %cst_395 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2219 = "mhlo.reshape"(%2218) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2220 = "mhlo.dot"(%2219, %cst_400) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2221 = chlo.broadcast_add %2220, %cst_401 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2222 = "mhlo.reshape"(%2221) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2223 = "mhlo.transpose"(%2222) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2224 = "mhlo.dot"(%2219, %cst_398) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2225 = chlo.broadcast_add %2224, %cst_399 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2226 = "mhlo.reshape"(%2225) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2227 = "mhlo.transpose"(%2226) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2228 = "mhlo.dot_general"(%2227, %2223) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %2229 = chlo.broadcast_multiply %2228, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %2230 = chlo.broadcast_add %2229, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2231 = "mhlo.reduce"(%2230, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2232 = linalg.tensor_expand_shape %2231 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2233 = chlo.broadcast_subtract %2230, %2232 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2234 = "mhlo.exponential"(%2233) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2235 = "mhlo.reduce"(%2234, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2236 = linalg.tensor_expand_shape %2235 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2237 = chlo.broadcast_divide %2234, %2236 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2238 = "mhlo.dot_general"(%2237, %2212) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %2239 = "mhlo.transpose"(%2238) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %2240 = "mhlo.reshape"(%2239) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %2241 = "mhlo.dot"(%2240, %cst_402) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2242 = chlo.broadcast_add %2241, %cst_403 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2243 = "mhlo.reshape"(%2242) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2244 = "mhlo.dot"(%2208, %cst_389) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2245 = chlo.broadcast_add %2244, %cst_390 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2246 = "mhlo.reshape"(%2245) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2247 = chlo.broadcast_multiply %2246, %cst_391 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2248 = chlo.broadcast_add %2247, %cst_403 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2249 = chlo.broadcast_add %2243, %2248 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2250 = chlo.broadcast_multiply %2249, %cst_404 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2251 = chlo.broadcast_add %2250, %cst_405 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2252 = "mhlo.reshape"(%2251) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2253 = "mhlo.dot"(%2252, %cst_387) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2254 = chlo.broadcast_add %2253, %cst_388 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2255 = "mhlo.reshape"(%2254) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2256 = chlo.broadcast_maximum %2255, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2257 = "mhlo.reshape"(%2256) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2258 = "mhlo.dot"(%2257, %cst_383) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2259 = chlo.broadcast_add %2258, %cst_384 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2260 = "mhlo.reshape"(%2259) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2261 = chlo.broadcast_add %2260, %2251 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2262 = chlo.broadcast_multiply %2261, %cst_385 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2263 = chlo.broadcast_add %2262, %cst_386 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2264 = "mhlo.reshape"(%2263) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2265 = "mhlo.dot"(%2264, %cst_381) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2266 = chlo.broadcast_add %2265, %cst_382 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2267 = "mhlo.reshape"(%2266) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2268 = chlo.broadcast_maximum %2267, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2269 = "mhlo.reshape"(%2268) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2270 = "mhlo.dot"(%2269, %cst_377) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2271 = chlo.broadcast_add %2270, %cst_378 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2272 = "mhlo.reshape"(%2271) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2273 = chlo.broadcast_add %2272, %2263 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2274 = chlo.broadcast_multiply %2273, %cst_379 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2275 = chlo.broadcast_add %2274, %cst_380 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2276 = "mhlo.reshape"(%2275) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2277 = "mhlo.dot"(%2276, %cst_375) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2278 = chlo.broadcast_add %2277, %cst_376 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2279 = "mhlo.reshape"(%2278) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2280 = chlo.broadcast_maximum %2279, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2281 = "mhlo.reshape"(%2280) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2282 = "mhlo.dot"(%2281, %cst_371) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2283 = chlo.broadcast_add %2282, %cst_372 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2284 = "mhlo.reshape"(%2283) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2285 = chlo.broadcast_add %2284, %2275 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2286 = chlo.broadcast_multiply %2285, %cst_373 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2287 = chlo.broadcast_add %2286, %cst_374 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2288 = "mhlo.reshape"(%2287) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2289 = "mhlo.dot"(%2288, %cst_369) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2290 = chlo.broadcast_add %2289, %cst_370 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2291 = "mhlo.reshape"(%2290) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2292 = chlo.broadcast_maximum %2291, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2293 = "mhlo.reshape"(%2292) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2294 = "mhlo.dot"(%2293, %cst_361) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2295 = chlo.broadcast_add %2294, %cst_362 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2296 = "mhlo.reshape"(%2295) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2297 = chlo.broadcast_add %2296, %2287 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2298 = chlo.broadcast_multiply %2297, %cst_367 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2299 = chlo.broadcast_add %2298, %cst_368 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2300 = "mhlo.reshape"(%2299) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2301 = "mhlo.dot"(%2300, %cst_363) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2302 = chlo.broadcast_add %2301, %cst_364 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2303 = "mhlo.reshape"(%2302) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2304 = chlo.broadcast_add %2303, %2207 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %2305 = chlo.broadcast_multiply %2304, %cst_365 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2306 = chlo.broadcast_add %2305, %cst_366 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2307 = "mhlo.reshape"(%2306) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2308 = "mhlo.dot"(%2307, %cst_351) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2309 = chlo.broadcast_add %2308, %cst_352 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2310 = "mhlo.reshape"(%2309) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2311 = "mhlo.transpose"(%2310) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2312 = "mhlo.dot"(%2307, %cst_347) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2313 = "mhlo.reshape"(%2312) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2314 = "mhlo.broadcast_in_dim"(%cst_348) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
    %2315 = mhlo.add %2313, %2314 : tensor<1x384x128xf32>
    %2316 = chlo.broadcast_multiply %2315, %cst_349 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2317 = chlo.broadcast_add %2316, %cst_350 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2318 = "mhlo.reshape"(%2317) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2319 = "mhlo.dot"(%2318, %cst_355) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2320 = chlo.broadcast_add %2319, %cst_356 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2321 = "mhlo.reshape"(%2320) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2322 = "mhlo.transpose"(%2321) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2323 = "mhlo.dot"(%2318, %cst_353) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2324 = chlo.broadcast_add %2323, %cst_354 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2325 = "mhlo.reshape"(%2324) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
    %2326 = "mhlo.transpose"(%2325) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
    %2327 = "mhlo.dot_general"(%2326, %2322) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
    %2328 = chlo.broadcast_multiply %2327, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
    %2329 = chlo.broadcast_add %2328, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2330 = "mhlo.reduce"(%2329, %2) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2331 = linalg.tensor_expand_shape %2330 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2332 = chlo.broadcast_subtract %2329, %2331 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2333 = "mhlo.exponential"(%2332) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
    %2334 = "mhlo.reduce"(%2333, %1) ( {
    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
      %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
      "mhlo.return"(%2417) : (tensor<f32>) -> ()
    }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
    %2335 = linalg.tensor_expand_shape %2334 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
    %2336 = chlo.broadcast_divide %2333, %2335 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
    %2337 = "mhlo.dot_general"(%2336, %2311) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
    %2338 = "mhlo.transpose"(%2337) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
    %2339 = "mhlo.reshape"(%2338) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
    %2340 = "mhlo.dot"(%2339, %cst_357) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
    %2341 = chlo.broadcast_add %2340, %cst_358 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2342 = "mhlo.reshape"(%2341) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2343 = "mhlo.dot"(%2307, %cst_344) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2344 = chlo.broadcast_add %2343, %cst_345 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2345 = "mhlo.reshape"(%2344) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2346 = chlo.broadcast_multiply %2345, %cst_346 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2347 = chlo.broadcast_add %2346, %cst_358 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2348 = chlo.broadcast_add %2342, %2347 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2349 = chlo.broadcast_multiply %2348, %cst_359 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2350 = chlo.broadcast_add %2349, %cst_360 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2351 = "mhlo.reshape"(%2350) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2352 = "mhlo.dot"(%2351, %cst_342) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2353 = chlo.broadcast_add %2352, %cst_343 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2354 = "mhlo.reshape"(%2353) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2355 = chlo.broadcast_maximum %2354, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2356 = "mhlo.reshape"(%2355) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2357 = "mhlo.dot"(%2356, %cst_338) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2358 = chlo.broadcast_add %2357, %cst_339 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2359 = "mhlo.reshape"(%2358) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2360 = chlo.broadcast_add %2359, %2350 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2361 = chlo.broadcast_multiply %2360, %cst_340 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2362 = chlo.broadcast_add %2361, %cst_341 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2363 = "mhlo.reshape"(%2362) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2364 = "mhlo.dot"(%2363, %cst_336) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2365 = chlo.broadcast_add %2364, %cst_337 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2366 = "mhlo.reshape"(%2365) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2367 = chlo.broadcast_maximum %2366, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2368 = "mhlo.reshape"(%2367) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2369 = "mhlo.dot"(%2368, %cst_332) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2370 = chlo.broadcast_add %2369, %cst_333 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2371 = "mhlo.reshape"(%2370) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2372 = chlo.broadcast_add %2371, %2362 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2373 = chlo.broadcast_multiply %2372, %cst_334 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2374 = chlo.broadcast_add %2373, %cst_335 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2375 = "mhlo.reshape"(%2374) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2376 = "mhlo.dot"(%2375, %cst_330) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2377 = chlo.broadcast_add %2376, %cst_331 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2378 = "mhlo.reshape"(%2377) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2379 = chlo.broadcast_maximum %2378, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2380 = "mhlo.reshape"(%2379) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2381 = "mhlo.dot"(%2380, %cst_326) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2382 = chlo.broadcast_add %2381, %cst_327 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2383 = "mhlo.reshape"(%2382) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2384 = chlo.broadcast_add %2383, %2374 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2385 = chlo.broadcast_multiply %2384, %cst_328 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2386 = chlo.broadcast_add %2385, %cst_329 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2387 = "mhlo.reshape"(%2386) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2388 = "mhlo.dot"(%2387, %cst_324) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2389 = chlo.broadcast_add %2388, %cst_325 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2390 = "mhlo.reshape"(%2389) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2391 = chlo.broadcast_maximum %2390, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
    %2392 = "mhlo.reshape"(%2391) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2393 = "mhlo.dot"(%2392, %cst_316) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
    %2394 = chlo.broadcast_add %2393, %cst_317 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
    %2395 = "mhlo.reshape"(%2394) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
    %2396 = chlo.broadcast_add %2395, %2386 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
    %2397 = chlo.broadcast_multiply %2396, %cst_322 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2398 = chlo.broadcast_add %2397, %cst_323 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
    %2399 = "mhlo.reshape"(%2398) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
    %2400 = "mhlo.dot"(%2399, %cst_318) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
    %2401 = chlo.broadcast_add %2400, %cst_319 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
    %2402 = "mhlo.reshape"(%2401) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
    %2403 = chlo.broadcast_add %2402, %2306 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
    %2404 = chlo.broadcast_multiply %2403, %cst_320 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2405 = chlo.broadcast_add %2404, %cst_321 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
    %2406 = "mhlo.reshape"(%2405) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
    %2407 = "mhlo.transpose"(%cst) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<2x512xf32>) -> tensor<512x2xf32>
    %2408 = "mhlo.dot"(%2406, %2407) : (tensor<384x512xf32>, tensor<512x2xf32>) -> tensor<384x2xf32>
    %2409 = "mhlo.broadcast_in_dim"(%cst_0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2xf32>) -> tensor<384x2xf32>
    %2410 = mhlo.add %2408, %2409 : tensor<384x2xf32>
    %2411 = "mhlo.reshape"(%2410) : (tensor<384x2xf32>) -> tensor<1x384x2xf32>
    %2412 = "mhlo.transpose"(%2411) {permutation = dense<[2, 0, 1]> : tensor<3xi64>} : (tensor<1x384x2xf32>) -> tensor<2x1x384xf32>
    %2413 = "mhlo.slice"(%2412) {limit_indices = dense<[1, 1, 384]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<2x1x384xf32>) -> tensor<1x1x384xf32>
    %2414 = linalg.tensor_collapse_shape %2413 [[0], [1, 2]] : tensor<1x1x384xf32> into tensor<1x384xf32>
    %2415 = "mhlo.slice"(%2412) {limit_indices = dense<[2, 1, 384]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<2x1x384xf32>) -> tensor<1x1x384xf32>
    %2416 = linalg.tensor_collapse_shape %2415 [[0], [1, 2]] : tensor<1x1x384xf32> into tensor<1x384xf32>
    return %2416, %2414 : tensor<1x384xf32>, tensor<1x384xf32>
  }
 }


 // -----// IR Dump After Canonicalizer //----- //
 builtin.func private @serving_default__ireesm(%arg0: tensor<1x384xi32>, %arg1: tensor<1x384xi32>, %arg2: tensor<1x384xi32>) -> (tensor<1x384xf32>, tensor<1x384xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "segment_ids:0,input_mask:0,input_ids:0", outputs = "end_logits:0,start_logits:0"}} {
  %0 = mhlo.constant dense<1.000000e+00> : tensor<1x384x1xf32>
  %1 = mhlo.constant dense<1.000000e+04> : tensor<f32>
  %2 = mhlo.constant dense<0.176776692> : tensor<f32>
  %3 = mhlo.constant dense<-1.000000e+04> : tensor<f32>
  %4 = mhlo.constant dense<0xFF800000> : tensor<f32>
  %5 = mhlo.constant dense<0.000000e+00> : tensor<f32>
  %cst = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_0 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_2 = constant opaque<"_", "0xDEADBEEF"> : tensor<384x512xf32>
  %6 = mhlo.constant opaque<"_", "0xDEADBEEF"> : tensor<1x384x512xf32>
  %cst_3 = constant opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32>
  %cst_4 = constant opaque<"_", "0xDEADBEEF"> : tensor<30522x128xf32>
  %cst_5 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_6 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_7 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_8 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_9 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_10 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_11 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_12 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_13 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_14 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_15 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_16 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_17 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_18 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_19 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_20 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_21 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_22 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_23 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_24 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_25 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_26 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_27 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_28 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_29 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_30 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_31 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_32 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_33 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_34 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_35 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_36 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_37 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_38 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_39 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_40 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_41 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_42 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_43 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_44 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_45 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_46 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_47 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_48 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_49 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_50 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_51 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_52 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_53 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_54 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_55 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_56 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_57 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_58 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_59 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_60 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_61 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_62 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_63 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_64 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_65 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_66 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_67 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_68 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_69 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_70 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_71 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_72 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_73 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_74 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_75 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_76 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_77 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_78 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_79 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_80 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_81 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_82 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_83 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_84 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_85 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_86 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_87 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_88 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_89 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_90 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_91 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_92 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_93 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_94 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_95 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_96 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_97 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_98 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_99 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_100 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_101 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_102 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_103 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_104 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_105 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_106 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_107 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_108 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_109 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_110 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_111 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_112 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_113 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_114 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_115 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_116 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_117 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_118 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_119 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_120 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_121 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_122 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_123 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_124 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_125 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_126 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_127 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_128 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_129 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_130 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_131 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_132 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_133 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_134 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_135 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_136 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_137 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_138 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_139 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_140 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_141 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_142 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_143 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_144 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_145 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_146 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_147 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_148 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_149 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_150 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_151 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_152 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_153 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_154 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_155 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_156 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_157 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_158 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_159 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_160 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_161 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_162 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_163 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_164 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_165 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_166 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_167 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_168 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_169 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_170 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_171 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_172 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_173 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_174 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_175 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_176 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_177 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_178 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_179 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_180 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_181 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_182 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_183 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_184 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_185 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_186 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_187 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_188 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_189 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_190 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_191 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_192 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_193 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_194 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_195 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_196 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_197 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_198 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_199 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_200 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_201 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_202 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_203 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_204 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_205 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_206 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_207 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_208 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_209 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_210 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_211 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_212 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_213 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_214 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_215 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_216 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_217 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_218 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_219 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_220 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_221 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_222 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_223 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_224 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_225 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_226 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_227 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_228 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_229 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_230 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_231 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_232 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_233 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_234 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_235 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_236 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_237 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_238 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_239 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_240 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_241 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_242 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_243 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_244 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_245 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_246 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_247 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_248 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_249 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_250 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_251 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_252 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_253 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_254 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_255 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_256 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_257 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_258 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_259 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_260 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_261 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_262 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_263 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_264 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_265 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_266 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_267 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_268 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_269 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_270 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_271 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_272 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_273 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_274 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_275 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_276 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_277 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_278 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_279 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_280 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_281 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_282 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_283 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_284 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_285 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_286 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_287 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_288 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_289 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_290 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_291 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_292 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_293 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_294 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_295 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_296 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_297 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_298 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_299 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_300 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_301 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_302 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_303 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_304 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_305 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_306 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_307 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_308 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_309 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_310 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_311 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_312 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_313 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_314 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_315 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_316 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_317 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_318 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_319 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_320 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_321 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_322 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_323 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_324 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_325 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_326 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_327 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_328 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_329 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_330 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_331 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_332 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_333 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_334 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_335 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_336 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_337 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_338 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_339 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_340 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_341 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_342 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_343 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_344 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_345 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_346 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_347 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_348 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_349 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_350 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_351 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_352 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_353 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_354 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_355 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_356 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_357 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_358 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_359 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_360 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_361 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_362 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_363 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_364 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_365 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_366 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_367 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_368 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_369 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_370 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_371 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_372 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_373 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_374 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_375 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_376 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_377 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_378 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_379 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_380 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_381 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_382 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_383 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_384 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_385 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_386 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_387 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_388 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_389 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_390 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_391 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_392 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_393 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_394 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_395 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_396 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_397 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_398 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_399 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_400 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_401 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_402 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_403 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_404 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_405 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_406 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_407 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_408 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_409 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_410 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_411 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_412 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_413 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_414 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_415 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_416 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_417 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_418 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_419 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_420 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_421 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_422 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_423 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_424 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_425 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_426 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_427 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_428 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_429 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_430 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_431 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_432 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_433 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_434 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_435 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_436 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_437 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_438 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_439 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_440 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_441 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_442 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_443 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_444 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_445 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_446 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_447 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_448 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_449 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_450 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_451 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_452 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_453 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_454 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_455 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_456 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_457 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_458 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_459 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_460 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_461 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_462 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_463 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_464 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_465 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_466 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_467 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_468 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_469 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_470 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_471 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_472 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_473 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_474 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_475 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_476 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_477 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_478 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_479 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_480 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_481 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_482 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_483 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_484 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_485 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_486 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_487 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_488 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_489 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_490 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_491 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_492 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_493 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_494 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_495 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_496 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_497 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_498 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_499 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_500 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_501 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_502 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_503 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_504 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_505 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_506 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_507 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_508 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_509 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_510 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_511 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_512 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_513 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_514 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_515 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_516 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_517 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_518 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_519 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_520 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_521 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_522 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_523 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_524 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_525 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_526 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_527 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_528 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_529 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_530 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_531 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_532 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_533 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_534 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_535 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_536 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_537 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_538 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_539 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_540 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_541 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_542 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_543 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_544 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_545 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_546 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_547 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_548 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_549 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_550 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_551 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_552 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_553 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_554 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_555 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_556 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_557 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_558 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_559 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_560 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_561 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_562 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_563 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_564 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_565 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_566 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_567 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_568 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_569 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_570 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_571 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_572 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_573 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_574 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_575 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_576 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_577 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_578 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_579 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_580 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_581 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_582 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_583 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_584 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_585 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_586 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_587 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_588 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_589 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_590 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_591 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_592 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_593 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_594 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_595 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_596 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_597 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_598 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_599 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_600 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_601 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_602 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_603 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_604 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_605 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_606 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_607 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_608 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_609 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_610 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_611 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_612 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_613 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_614 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_615 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_616 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_617 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_618 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_619 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_620 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_621 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_622 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_623 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_624 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_625 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_626 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_627 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_628 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_629 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_630 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_631 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_632 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_633 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_634 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_635 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_636 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_637 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_638 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_639 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_640 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_641 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_642 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_643 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_644 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_645 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_646 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_647 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_648 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_649 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_650 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_651 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_652 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_653 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_654 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_655 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_656 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_657 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_658 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_659 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_660 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_661 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_662 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_663 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_664 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_665 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_666 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_667 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_668 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_669 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_670 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_671 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_672 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_673 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_674 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_675 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_676 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_677 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_678 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_679 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_680 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_681 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_682 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_683 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_684 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_685 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_686 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_687 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_688 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_689 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_690 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_691 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_692 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_693 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_694 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_695 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_696 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_697 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_698 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_699 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_700 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_701 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_702 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_703 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_704 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_705 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_706 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_707 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_708 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_709 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_710 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_711 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_712 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_713 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_714 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_715 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_716 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_717 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_718 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_719 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_720 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_721 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_722 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_723 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_724 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_725 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_726 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_727 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_728 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_729 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_730 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_731 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_732 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_733 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_734 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_735 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_736 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_737 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_738 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_739 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_740 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_741 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_742 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_743 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_744 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_745 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_746 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_747 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_748 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_749 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_750 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_751 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_752 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_753 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_754 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_755 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_756 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_757 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_758 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_759 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_760 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_761 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_762 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_763 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_764 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_765 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_766 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_767 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_768 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_769 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_770 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_771 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_772 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_773 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_774 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_775 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_776 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_777 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_778 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_779 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_780 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_781 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_782 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_783 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_784 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_785 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_786 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_787 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_788 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_789 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_790 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_791 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_792 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_793 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_794 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_795 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_796 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_797 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_798 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_799 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_800 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_801 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_802 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_803 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_804 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_805 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_806 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_807 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_808 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_809 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_810 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_811 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_812 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_813 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_814 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_815 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_816 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_817 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_818 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_819 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_820 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_821 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_822 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_823 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_824 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_825 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_826 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_827 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_828 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_829 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_830 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_831 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_832 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_833 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_834 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_835 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_836 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_837 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_838 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_839 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_840 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_841 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_842 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_843 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_844 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_845 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_846 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_847 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_848 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_849 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_850 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_851 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_852 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_853 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_854 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_855 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_856 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_857 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_858 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_859 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_860 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_861 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_862 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_863 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_864 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_865 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_866 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_867 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_868 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_869 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_870 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_871 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_872 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_873 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_874 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_875 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_876 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_877 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_878 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_879 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_880 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_881 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_882 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_883 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_884 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_885 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_886 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_887 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_888 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_889 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_890 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_891 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_892 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_893 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_894 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_895 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_896 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_897 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_898 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_899 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_900 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_901 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_902 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_903 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_904 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_905 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_906 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_907 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_908 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_909 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_910 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_911 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_912 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_913 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_914 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_915 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_916 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_917 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_918 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_919 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_920 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_921 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_922 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_923 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_924 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_925 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_926 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_927 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_928 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_929 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_930 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_931 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_932 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_933 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_934 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_935 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_936 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_937 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_938 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_939 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_940 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_941 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_942 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_943 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_944 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_945 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_946 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_947 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_948 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_949 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_950 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_951 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_952 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_953 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_954 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_955 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_956 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_957 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_958 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_959 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_960 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_961 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_962 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_963 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_964 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_965 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_966 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_967 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_968 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_969 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_970 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_971 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_972 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_973 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_974 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_975 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_976 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_977 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_978 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_979 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_980 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_981 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_982 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_983 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_984 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_985 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_986 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_987 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_988 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_989 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_990 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_991 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_992 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_993 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_994 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_995 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_996 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_997 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_998 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_999 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1000 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_1001 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1002 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_1003 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1004 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1005 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1006 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1007 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1008 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1009 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1010 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1011 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1012 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1013 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1014 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1015 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1016 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1017 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1018 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1019 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1020 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1021 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1022 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1023 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1024 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1025 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1026 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1027 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1028 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1029 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1030 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1031 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1032 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1033 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1034 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1035 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1036 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1037 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1038 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1039 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1040 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1041 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1042 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1043 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_1044 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1045 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_1046 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1047 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x128xf32>
  %cst_1048 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1049 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1050 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1051 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1052 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1053 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1054 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1055 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1056 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1057 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1058 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1059 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1060 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1061 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1062 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1063 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1064 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1065 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1066 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1067 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1068 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1069 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1070 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1071 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1072 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1073 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1074 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1075 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1076 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1077 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1078 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1079 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1080 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1081 = constant opaque<"_", "0xDEADBEEF"> : tensor<512xf32>
  %cst_1082 = constant opaque<"_", "0xDEADBEEF"> : tensor<128x512xf32>
  %cst_1083 = constant opaque<"_", "0xDEADBEEF"> : tensor<128xf32>
  %cst_1084 = constant opaque<"_", "0xDEADBEEF"> : tensor<512x128xf32>
  %cst_1085 = constant dense<[0.0287729427, 0.0297581609]> : tensor<2xf32>
  %cst_1086 = constant opaque<"_", "0xDEADBEEF"> : tensor<2x512xf32>
  %7 = linalg.tensor_expand_shape %arg2 [[0], [1, 2]] : tensor<1x384xi32> into tensor<1x384x1xi32>
  %8 = "mhlo.torch_index_select"(%cst_4, %7) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<30522x128xf32>, tensor<1x384x1xi32>) -> tensor<1x384x1x128xf32>
  %9 = "mhlo.reshape"(%8) : (tensor<1x384x1x128xf32>) -> tensor<1x384x128xf32>
  %10 = "mhlo.slice"(%9) {limit_indices = dense<[1, 384, 128]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x384x128xf32>) -> tensor<1x383x128xf32>
  %11 = "mhlo.pad"(%10, %5) {edge_padding_high = dense<[0, 1, 0]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x383x128xf32>, tensor<f32>) -> tensor<1x384x128xf32>
  %12 = "mhlo.slice"(%9) {limit_indices = dense<[1, 383, 128]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x384x128xf32>) -> tensor<1x383x128xf32>
  %13 = "mhlo.pad"(%12, %5) {edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<[0, 1, 0]> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x383x128xf32>, tensor<f32>) -> tensor<1x384x128xf32>
  %14 = "mhlo.concatenate"(%11, %9, %13) {dimension = 2 : i64} : (tensor<1x384x128xf32>, tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x384xf32>
  %15 = "mhlo.reshape"(%14) : (tensor<1x384x384xf32>) -> tensor<384x384xf32>
  %16 = "mhlo.dot"(%15, %cst_2) : (tensor<384x384xf32>, tensor<384x512xf32>) -> tensor<384x512xf32>
  %17 = chlo.broadcast_add %16, %cst_1 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %18 = "mhlo.reshape"(%17) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %19 = "mhlo.convert"(%arg1) : (tensor<1x384xi32>) -> tensor<1x384xf32>
  %20 = "mhlo.reshape"(%19) : (tensor<1x384xf32>) -> tensor<1x1x384xf32>
  %21 = chlo.broadcast_multiply %20, %0 : (tensor<1x1x384xf32>, tensor<1x384x1xf32>) -> tensor<1x384x384xf32>
  %22 = linalg.tensor_expand_shape %21 [[0], [1, 2], [3]] : tensor<1x384x384xf32> into tensor<1x1x384x384xf32>
  %23 = chlo.broadcast_multiply %22, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x384x384xf32>, tensor<f32>) -> tensor<1x1x384x384xf32>
  %24 = chlo.broadcast_add %23, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x384x384xf32>, tensor<f32>) -> tensor<1x1x384x384xf32>
  %25 = "mhlo.torch_index_select"(%cst_3, %arg0) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<2x512xf32>, tensor<1x384xi32>) -> tensor<1x384x512xf32>
  %26 = chlo.broadcast_add %18, %25 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %27 = chlo.broadcast_add %26, %6 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %28 = chlo.broadcast_multiply %27, %cst_0 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %29 = chlo.broadcast_add %28, %cst {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %30 = "mhlo.reshape"(%29) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %31 = "mhlo.dot"(%30, %cst_14) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %32 = chlo.broadcast_add %31, %cst_13 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %33 = "mhlo.reshape"(%32) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %34 = "mhlo.transpose"(%33) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %35 = "mhlo.dot"(%30, %cst_18) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %36 = "mhlo.reshape"(%35) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %37 = "mhlo.broadcast_in_dim"(%cst_17) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %38 = mhlo.add %36, %37 : tensor<1x384x128xf32>
  %39 = chlo.broadcast_multiply %38, %cst_16 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %40 = chlo.broadcast_add %39, %cst_15 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %41 = "mhlo.reshape"(%40) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %42 = "mhlo.dot"(%41, %cst_10) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %43 = chlo.broadcast_add %42, %cst_9 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %44 = "mhlo.reshape"(%43) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %45 = "mhlo.transpose"(%44) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %46 = "mhlo.dot"(%41, %cst_12) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %47 = chlo.broadcast_add %46, %cst_11 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %48 = "mhlo.reshape"(%47) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %49 = "mhlo.transpose"(%48) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %50 = "mhlo.dot_general"(%49, %45) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %51 = chlo.broadcast_multiply %50, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %52 = chlo.broadcast_add %51, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %53 = "mhlo.reduce"(%52, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %54 = linalg.tensor_expand_shape %53 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %55 = chlo.broadcast_subtract %52, %54 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %56 = "mhlo.exponential"(%55) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %57 = "mhlo.reduce"(%56, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %58 = linalg.tensor_expand_shape %57 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %59 = chlo.broadcast_divide %56, %58 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %60 = "mhlo.dot_general"(%59, %34) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %61 = "mhlo.transpose"(%60) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %62 = "mhlo.reshape"(%61) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %63 = "mhlo.dot"(%62, %cst_8) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %64 = chlo.broadcast_add %63, %cst_7 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %65 = "mhlo.reshape"(%64) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %66 = "mhlo.dot"(%30, %cst_21) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %67 = chlo.broadcast_add %66, %cst_20 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %68 = "mhlo.reshape"(%67) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %69 = chlo.broadcast_multiply %68, %cst_19 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %70 = chlo.broadcast_add %69, %cst_7 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %71 = chlo.broadcast_add %65, %70 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %72 = chlo.broadcast_multiply %71, %cst_6 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %73 = chlo.broadcast_add %72, %cst_5 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %74 = "mhlo.reshape"(%73) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %75 = "mhlo.dot"(%74, %cst_23) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %76 = chlo.broadcast_add %75, %cst_22 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %77 = "mhlo.reshape"(%76) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %78 = chlo.broadcast_maximum %77, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %79 = "mhlo.reshape"(%78) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %80 = "mhlo.dot"(%79, %cst_27) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %81 = chlo.broadcast_add %80, %cst_26 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %82 = "mhlo.reshape"(%81) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %83 = chlo.broadcast_add %82, %73 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %84 = chlo.broadcast_multiply %83, %cst_25 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %85 = chlo.broadcast_add %84, %cst_24 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %86 = "mhlo.reshape"(%85) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %87 = "mhlo.dot"(%86, %cst_29) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %88 = chlo.broadcast_add %87, %cst_28 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %89 = "mhlo.reshape"(%88) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %90 = chlo.broadcast_maximum %89, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %91 = "mhlo.reshape"(%90) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %92 = "mhlo.dot"(%91, %cst_33) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %93 = chlo.broadcast_add %92, %cst_32 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %94 = "mhlo.reshape"(%93) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %95 = chlo.broadcast_add %94, %85 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %96 = chlo.broadcast_multiply %95, %cst_31 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %97 = chlo.broadcast_add %96, %cst_30 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %98 = "mhlo.reshape"(%97) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %99 = "mhlo.dot"(%98, %cst_35) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %100 = chlo.broadcast_add %99, %cst_34 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %101 = "mhlo.reshape"(%100) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %102 = chlo.broadcast_maximum %101, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %103 = "mhlo.reshape"(%102) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %104 = "mhlo.dot"(%103, %cst_39) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %105 = chlo.broadcast_add %104, %cst_38 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %106 = "mhlo.reshape"(%105) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %107 = chlo.broadcast_add %106, %97 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %108 = chlo.broadcast_multiply %107, %cst_37 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %109 = chlo.broadcast_add %108, %cst_36 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %109: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %110 = "mhlo.reshape"(%109) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %111 = "mhlo.dot"(%110, %cst_41) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %112 = chlo.broadcast_add %111, %cst_40 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %113 = "mhlo.reshape"(%112) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %114 = chlo.broadcast_maximum %113, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %115 = "mhlo.reshape"(%114) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %116 = "mhlo.dot"(%115, %cst_49) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %117 = chlo.broadcast_add %116, %cst_48 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %118 = "mhlo.reshape"(%117) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %109, then per-channel scale (%cst_43)
  // and shift (%cst_42) along dim 2.
  // NOTE(review): presumably a FakeLayerNorm gamma/beta pair - confirm against
  // the constant definitions outside this excerpt.
  %119 = chlo.broadcast_add %118, %109 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %120 = chlo.broadcast_multiply %119, %cst_43 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %121 = chlo.broadcast_add %120, %cst_42 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Expansion back to 512 features: flatten %121, dot with a 128x512 kernel,
  // add a 512-wide bias, restore the leading unit dimension.
  %122 = "mhlo.reshape"(%121) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %123 = "mhlo.dot"(%122, %cst_47) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %124 = chlo.broadcast_add %123, %cst_46 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %125 = "mhlo.reshape"(%124) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  // Add %29 (a 1x384x512 value defined above this excerpt), then per-channel
  // scale (%cst_45) and shift (%cst_44) along dim 2.
  // NOTE(review): %29 is presumably the 512-wide input of this encoder layer
  // (residual connection) - confirm against its definition.
  %126 = chlo.broadcast_add %125, %29 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %127 = chlo.broadcast_multiply %126, %cst_45 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %128 = chlo.broadcast_add %127, %cst_44 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  // %129: the 512-wide activation %128 flattened to 384x512. It feeds three
  // separate 512->128 projections in this part of the function.
  %129 = "mhlo.reshape"(%128) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  // Projection 1 (dot + bias), then split the 128 features into 4 groups of 32
  // and move the group axis ahead of the 384 axis: 1x384x4x32 -> 1x4x384x32.
  // NOTE(review): %133 is later the right-hand operand of the second
  // dot_general, so this is presumably the attention "value" path with 4
  // heads - confirm.
  %130 = "mhlo.dot"(%129, %cst_59) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %131 = chlo.broadcast_add %130, %cst_58 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %132 = "mhlo.reshape"(%131) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %133 = "mhlo.transpose"(%132) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Projection 2: dot, bias added through an explicit broadcast_in_dim +
  // mhlo.add, then per-channel scale (%cst_61) and shift (%cst_60) along dim 2.
  // The 128-wide result %139/%140 is the shared input of the two 128x128
  // projections below.
  %134 = "mhlo.dot"(%129, %cst_63) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %135 = "mhlo.reshape"(%134) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %136 = "mhlo.broadcast_in_dim"(%cst_62) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %137 = mhlo.add %135, %136 : tensor<1x384x128xf32>
  %138 = chlo.broadcast_multiply %137, %cst_61 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %139 = chlo.broadcast_add %138, %cst_60 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %140 = "mhlo.reshape"(%139) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  // Two 128x128 projections of %140, each reshaped to 1x384x4x32 and
  // transposed to 1x4x384x32. %148 and %144 are the left/right operands of the
  // first dot_general that follows.
  // NOTE(review): presumably query (%148) and key (%144) - confirm against
  // the constant definitions outside this excerpt.
  %141 = "mhlo.dot"(%140, %cst_55) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %142 = chlo.broadcast_add %141, %cst_54 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %143 = "mhlo.reshape"(%142) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %144 = "mhlo.transpose"(%143) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %145 = "mhlo.dot"(%140, %cst_57) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %146 = chlo.broadcast_add %145, %cst_56 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %147 = "mhlo.reshape"(%146) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %148 = "mhlo.transpose"(%147) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Batched product over dims 0 and 1, contracting the trailing 32-wide dim of
  // both operands: (1x4x384x32) x (1x4x384x32) -> 1x4x384x384.
  %149 = "mhlo.dot_general"(%148, %144) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  // Multiply by scalar %2 and add %24 (1x1x384x384, broadcast across dim 1).
  // NOTE(review): %2 and %24 are defined outside this excerpt; presumably the
  // 1/sqrt(head_dim) scale and the additive attention mask - confirm.
  %150 = chlo.broadcast_multiply %149, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %151 = chlo.broadcast_add %150, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Softmax-style normalisation along dim 3: maximum-reduce, subtract the
  // maximum, exponentiate, add-reduce, divide. The expand_shape ops re-insert
  // the reduced axis as size 1 so the subtract/divide can broadcast.
  // NOTE(review): the reduction init values %4 and %5 are defined outside this
  // excerpt; presumably -inf and 0.0 respectively - confirm.
  %152 = "mhlo.reduce"(%151, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %153 = linalg.tensor_expand_shape %152 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %154 = chlo.broadcast_subtract %151, %153 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %155 = "mhlo.exponential"(%154) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %156 = "mhlo.reduce"(%155, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %157 = linalg.tensor_expand_shape %156 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %158 = chlo.broadcast_divide %155, %157 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  // Weighted sum: contract dim 3 of the normalised weights %158 with dim 2 of
  // %133 (batched over dims 0 and 1) -> 1x4x384x32, then undo the earlier
  // transpose and merge the 4x32 groups back into 128 features.
  %159 = "mhlo.dot_general"(%158, %133) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %160 = "mhlo.transpose"(%159) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %161 = "mhlo.reshape"(%160) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  // 128x128 dense layer (dot + bias %cst_52) on the merged result.
  %162 = "mhlo.dot"(%161, %cst_53) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %163 = chlo.broadcast_add %162, %cst_52 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %164 = "mhlo.reshape"(%163) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Third 512->128 projection of the flattened layer input %129 (dot + bias),
  // followed by per-channel scale (%cst_64) and shift (%cst_52).
  // NOTE(review): the shift reuses %cst_52, the same constant used as the
  // dense bias for %163. This may simply be de-duplication of identical
  // constants; it cannot be determined from this excerpt - confirm.
  %165 = "mhlo.dot"(%129, %cst_66) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %166 = chlo.broadcast_add %165, %cst_65 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %167 = "mhlo.reshape"(%166) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %168 = chlo.broadcast_multiply %167, %cst_64 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %169 = chlo.broadcast_add %168, %cst_52 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Sum the dense output %164 with the projected input %169 (residual-style
  // add), then per-channel scale (%cst_51) and shift (%cst_50) along dim 2.
  %170 = chlo.broadcast_add %164, %169 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %171 = chlo.broadcast_multiply %170, %cst_51 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %172 = chlo.broadcast_add %171, %cst_50 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %172: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %173 = "mhlo.reshape"(%172) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %174 = "mhlo.dot"(%173, %cst_68) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %175 = chlo.broadcast_add %174, %cst_67 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %176 = "mhlo.reshape"(%175) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %177 = chlo.broadcast_maximum %176, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %178 = "mhlo.reshape"(%177) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %179 = "mhlo.dot"(%178, %cst_72) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %180 = chlo.broadcast_add %179, %cst_71 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %181 = "mhlo.reshape"(%180) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %172, then per-channel scale (%cst_70)
  // and shift (%cst_69) along dim 2.
  %182 = chlo.broadcast_add %181, %172 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %183 = chlo.broadcast_multiply %182, %cst_70 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %184 = chlo.broadcast_add %183, %cst_69 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %184: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %185 = "mhlo.reshape"(%184) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %186 = "mhlo.dot"(%185, %cst_74) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %187 = chlo.broadcast_add %186, %cst_73 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %188 = "mhlo.reshape"(%187) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %189 = chlo.broadcast_maximum %188, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %190 = "mhlo.reshape"(%189) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %191 = "mhlo.dot"(%190, %cst_78) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %192 = chlo.broadcast_add %191, %cst_77 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %193 = "mhlo.reshape"(%192) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %184, then per-channel scale (%cst_76)
  // and shift (%cst_75) along dim 2.
  %194 = chlo.broadcast_add %193, %184 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %195 = chlo.broadcast_multiply %194, %cst_76 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %196 = chlo.broadcast_add %195, %cst_75 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %196: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %197 = "mhlo.reshape"(%196) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %198 = "mhlo.dot"(%197, %cst_80) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %199 = chlo.broadcast_add %198, %cst_79 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %200 = "mhlo.reshape"(%199) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %201 = chlo.broadcast_maximum %200, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %202 = "mhlo.reshape"(%201) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %203 = "mhlo.dot"(%202, %cst_84) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %204 = chlo.broadcast_add %203, %cst_83 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %205 = "mhlo.reshape"(%204) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %196, then per-channel scale (%cst_82)
  // and shift (%cst_81) along dim 2.
  %206 = chlo.broadcast_add %205, %196 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %207 = chlo.broadcast_multiply %206, %cst_82 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %208 = chlo.broadcast_add %207, %cst_81 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %208: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %209 = "mhlo.reshape"(%208) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %210 = "mhlo.dot"(%209, %cst_86) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %211 = chlo.broadcast_add %210, %cst_85 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %212 = "mhlo.reshape"(%211) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %213 = chlo.broadcast_maximum %212, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %214 = "mhlo.reshape"(%213) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %215 = "mhlo.dot"(%214, %cst_94) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %216 = chlo.broadcast_add %215, %cst_93 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %217 = "mhlo.reshape"(%216) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %208, then per-channel scale (%cst_88)
  // and shift (%cst_87) along dim 2.
  %218 = chlo.broadcast_add %217, %208 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %219 = chlo.broadcast_multiply %218, %cst_88 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %220 = chlo.broadcast_add %219, %cst_87 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Expansion back to 512 features: flatten %220, dot with a 128x512 kernel,
  // add a 512-wide bias, restore the leading unit dimension.
  %221 = "mhlo.reshape"(%220) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %222 = "mhlo.dot"(%221, %cst_92) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %223 = chlo.broadcast_add %222, %cst_91 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %224 = "mhlo.reshape"(%223) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  // Residual add with %128 (the 512-wide value that %129 and its projections
  // were derived from), then per-channel scale (%cst_90) and shift (%cst_89)
  // along dim 2. %227 is the 512-wide result of this stage.
  %225 = chlo.broadcast_add %224, %128 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %226 = chlo.broadcast_multiply %225, %cst_90 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %227 = chlo.broadcast_add %226, %cst_89 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  // %228: the 512-wide activation %227 flattened to 384x512. It feeds three
  // separate 512->128 projections in this part of the function.
  %228 = "mhlo.reshape"(%227) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  // Projection 1 (dot + bias), then split the 128 features into 4 groups of 32
  // and move the group axis ahead of the 384 axis: 1x384x4x32 -> 1x4x384x32.
  // NOTE(review): %232 is later the right-hand operand of the second
  // dot_general, so this is presumably the attention "value" path with 4
  // heads - confirm.
  %229 = "mhlo.dot"(%228, %cst_554) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %230 = chlo.broadcast_add %229, %cst_553 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %231 = "mhlo.reshape"(%230) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %232 = "mhlo.transpose"(%231) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Projection 2: dot, bias added through an explicit broadcast_in_dim +
  // mhlo.add, then per-channel scale (%cst_556) and shift (%cst_555) along
  // dim 2. The 128-wide result %238/%239 is the shared input of the two
  // 128x128 projections below.
  %233 = "mhlo.dot"(%228, %cst_558) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %234 = "mhlo.reshape"(%233) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %235 = "mhlo.broadcast_in_dim"(%cst_557) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %236 = mhlo.add %234, %235 : tensor<1x384x128xf32>
  %237 = chlo.broadcast_multiply %236, %cst_556 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %238 = chlo.broadcast_add %237, %cst_555 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %239 = "mhlo.reshape"(%238) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  // Two 128x128 projections of %239, each reshaped to 1x384x4x32 and
  // transposed to 1x4x384x32. %247 and %243 are the left/right operands of the
  // first dot_general that follows.
  // NOTE(review): presumably query (%247) and key (%243) - confirm against
  // the constant definitions outside this excerpt.
  %240 = "mhlo.dot"(%239, %cst_550) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %241 = chlo.broadcast_add %240, %cst_549 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %242 = "mhlo.reshape"(%241) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %243 = "mhlo.transpose"(%242) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %244 = "mhlo.dot"(%239, %cst_552) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %245 = chlo.broadcast_add %244, %cst_551 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %246 = "mhlo.reshape"(%245) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %247 = "mhlo.transpose"(%246) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Batched product over dims 0 and 1, contracting the trailing 32-wide dim of
  // both operands: (1x4x384x32) x (1x4x384x32) -> 1x4x384x384.
  %248 = "mhlo.dot_general"(%247, %243) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  // Multiply by scalar %2 and add %24 (1x1x384x384, broadcast across dim 1).
  // NOTE(review): %2 and %24 are defined outside this excerpt; presumably the
  // 1/sqrt(head_dim) scale and the additive attention mask - confirm.
  %249 = chlo.broadcast_multiply %248, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %250 = chlo.broadcast_add %249, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Softmax-style normalisation along dim 3: maximum-reduce, subtract the
  // maximum, exponentiate, add-reduce, divide. The expand_shape ops re-insert
  // the reduced axis as size 1 so the subtract/divide can broadcast.
  // NOTE(review): the reduction init values %4 and %5 are defined outside this
  // excerpt; presumably -inf and 0.0 respectively - confirm.
  %251 = "mhlo.reduce"(%250, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %252 = linalg.tensor_expand_shape %251 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %253 = chlo.broadcast_subtract %250, %252 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %254 = "mhlo.exponential"(%253) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %255 = "mhlo.reduce"(%254, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %256 = linalg.tensor_expand_shape %255 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %257 = chlo.broadcast_divide %254, %256 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  // Weighted sum: contract dim 3 of the normalised weights %257 with dim 2 of
  // %232 (batched over dims 0 and 1) -> 1x4x384x32, then undo the earlier
  // transpose and merge the 4x32 groups back into 128 features.
  %258 = "mhlo.dot_general"(%257, %232) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %259 = "mhlo.transpose"(%258) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %260 = "mhlo.reshape"(%259) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  // 128x128 dense layer (dot + bias %cst_547) on the merged result.
  %261 = "mhlo.dot"(%260, %cst_548) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %262 = chlo.broadcast_add %261, %cst_547 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %263 = "mhlo.reshape"(%262) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Third 512->128 projection of the flattened layer input %228 (dot + bias),
  // followed by per-channel scale (%cst_559) and shift (%cst_547).
  // NOTE(review): the shift reuses %cst_547, the same constant used as the
  // dense bias for %262. This may simply be de-duplication of identical
  // constants; it cannot be determined from this excerpt - confirm.
  %264 = "mhlo.dot"(%228, %cst_561) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %265 = chlo.broadcast_add %264, %cst_560 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %266 = "mhlo.reshape"(%265) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %267 = chlo.broadcast_multiply %266, %cst_559 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %268 = chlo.broadcast_add %267, %cst_547 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Sum the dense output %263 with the projected input %268 (residual-style
  // add), then per-channel scale (%cst_546) and shift (%cst_545) along dim 2.
  %269 = chlo.broadcast_add %263, %268 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %270 = chlo.broadcast_multiply %269, %cst_546 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %271 = chlo.broadcast_add %270, %cst_545 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %271: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %272 = "mhlo.reshape"(%271) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %273 = "mhlo.dot"(%272, %cst_563) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %274 = chlo.broadcast_add %273, %cst_562 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %275 = "mhlo.reshape"(%274) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %276 = chlo.broadcast_maximum %275, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %277 = "mhlo.reshape"(%276) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %278 = "mhlo.dot"(%277, %cst_567) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %279 = chlo.broadcast_add %278, %cst_566 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %280 = "mhlo.reshape"(%279) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %271, then per-channel scale (%cst_565)
  // and shift (%cst_564) along dim 2.
  %281 = chlo.broadcast_add %280, %271 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %282 = chlo.broadcast_multiply %281, %cst_565 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %283 = chlo.broadcast_add %282, %cst_564 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %283: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %284 = "mhlo.reshape"(%283) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %285 = "mhlo.dot"(%284, %cst_569) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %286 = chlo.broadcast_add %285, %cst_568 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %287 = "mhlo.reshape"(%286) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %288 = chlo.broadcast_maximum %287, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %289 = "mhlo.reshape"(%288) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %290 = "mhlo.dot"(%289, %cst_573) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %291 = chlo.broadcast_add %290, %cst_572 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %292 = "mhlo.reshape"(%291) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %283, then per-channel scale (%cst_571)
  // and shift (%cst_570) along dim 2.
  %293 = chlo.broadcast_add %292, %283 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %294 = chlo.broadcast_multiply %293, %cst_571 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %295 = chlo.broadcast_add %294, %cst_570 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %295: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %296 = "mhlo.reshape"(%295) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %297 = "mhlo.dot"(%296, %cst_575) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %298 = chlo.broadcast_add %297, %cst_574 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %299 = "mhlo.reshape"(%298) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %300 = chlo.broadcast_maximum %299, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %301 = "mhlo.reshape"(%300) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %302 = "mhlo.dot"(%301, %cst_579) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %303 = chlo.broadcast_add %302, %cst_578 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %304 = "mhlo.reshape"(%303) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %295, then per-channel scale (%cst_577)
  // and shift (%cst_576) along dim 2.
  %305 = chlo.broadcast_add %304, %295 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %306 = chlo.broadcast_multiply %305, %cst_577 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %307 = chlo.broadcast_add %306, %cst_576 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Feed-forward block on %307: flatten to 384x128, widen to 512 features
  // (dot + bias), element-wise maximum against scalar %5, narrow back to 128
  // features (dot + bias), restore the leading unit dimension.
  // NOTE(review): %5 is defined outside this excerpt; presumably 0.0 (ReLU) -
  // confirm.
  %308 = "mhlo.reshape"(%307) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %309 = "mhlo.dot"(%308, %cst_581) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %310 = chlo.broadcast_add %309, %cst_580 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %311 = "mhlo.reshape"(%310) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %312 = chlo.broadcast_maximum %311, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %313 = "mhlo.reshape"(%312) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %314 = "mhlo.dot"(%313, %cst_589) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %315 = chlo.broadcast_add %314, %cst_588 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %316 = "mhlo.reshape"(%315) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Residual add with the block input %307, then per-channel scale (%cst_583)
  // and shift (%cst_582) along dim 2.
  %317 = chlo.broadcast_add %316, %307 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %318 = chlo.broadcast_multiply %317, %cst_583 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %319 = chlo.broadcast_add %318, %cst_582 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Expansion back to 512 features: flatten %319, dot with a 128x512 kernel,
  // add a 512-wide bias, restore the leading unit dimension.
  %320 = "mhlo.reshape"(%319) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %321 = "mhlo.dot"(%320, %cst_587) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %322 = chlo.broadcast_add %321, %cst_586 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %323 = "mhlo.reshape"(%322) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  // Residual add with %227 (the 512-wide value that %228 and its projections
  // were derived from), then per-channel scale (%cst_585) and shift (%cst_584)
  // along dim 2. %326 is the 512-wide result of this stage.
  %324 = chlo.broadcast_add %323, %227 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %325 = chlo.broadcast_multiply %324, %cst_585 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %326 = chlo.broadcast_add %325, %cst_584 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  // %327: the 512-wide activation %326 flattened to 384x512. It feeds three
  // separate 512->128 projections in this part of the function.
  %327 = "mhlo.reshape"(%326) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  // Projection 1 (dot + bias), then split the 128 features into 4 groups of 32
  // and move the group axis ahead of the 384 axis: 1x384x4x32 -> 1x4x384x32.
  // NOTE(review): %331 is later the right-hand operand of the second
  // dot_general, so this is presumably the attention "value" path with 4
  // heads - confirm.
  %328 = "mhlo.dot"(%327, %cst_779) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %329 = chlo.broadcast_add %328, %cst_778 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %330 = "mhlo.reshape"(%329) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %331 = "mhlo.transpose"(%330) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Projection 2: dot, bias added through an explicit broadcast_in_dim +
  // mhlo.add, then per-channel scale (%cst_781) and shift (%cst_780) along
  // dim 2. The 128-wide result %337/%338 is the shared input of the two
  // 128x128 projections below.
  %332 = "mhlo.dot"(%327, %cst_783) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %333 = "mhlo.reshape"(%332) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %334 = "mhlo.broadcast_in_dim"(%cst_782) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %335 = mhlo.add %333, %334 : tensor<1x384x128xf32>
  %336 = chlo.broadcast_multiply %335, %cst_781 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %337 = chlo.broadcast_add %336, %cst_780 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %338 = "mhlo.reshape"(%337) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  // Two 128x128 projections of %338, each reshaped to 1x384x4x32 and
  // transposed to 1x4x384x32. %346 and %342 are the left/right operands of the
  // first dot_general that follows.
  // NOTE(review): presumably query (%346) and key (%342) - confirm against
  // the constant definitions outside this excerpt.
  %339 = "mhlo.dot"(%338, %cst_775) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %340 = chlo.broadcast_add %339, %cst_774 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %341 = "mhlo.reshape"(%340) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %342 = "mhlo.transpose"(%341) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %343 = "mhlo.dot"(%338, %cst_777) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %344 = chlo.broadcast_add %343, %cst_776 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %345 = "mhlo.reshape"(%344) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %346 = "mhlo.transpose"(%345) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Batched product over dims 0 and 1, contracting the trailing 32-wide dim of
  // both operands: (1x4x384x32) x (1x4x384x32) -> 1x4x384x384.
  %347 = "mhlo.dot_general"(%346, %342) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  // Multiply by scalar %2 and add %24 (1x1x384x384, broadcast across dim 1).
  // NOTE(review): %2 and %24 are defined outside this excerpt; presumably the
  // 1/sqrt(head_dim) scale and the additive attention mask - confirm.
  %348 = chlo.broadcast_multiply %347, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %349 = chlo.broadcast_add %348, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Softmax-style normalisation along dim 3: maximum-reduce, subtract the
  // maximum, exponentiate, add-reduce, divide. The expand_shape ops re-insert
  // the reduced axis as size 1 so the subtract/divide can broadcast.
  // NOTE(review): the reduction init values %4 and %5 are defined outside this
  // excerpt; presumably -inf and 0.0 respectively - confirm.
  %350 = "mhlo.reduce"(%349, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %351 = linalg.tensor_expand_shape %350 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %352 = chlo.broadcast_subtract %349, %351 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %353 = "mhlo.exponential"(%352) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %354 = "mhlo.reduce"(%353, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %355 = linalg.tensor_expand_shape %354 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %356 = chlo.broadcast_divide %353, %355 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  // Weighted sum: contract dim 3 of the normalised weights %356 with dim 2 of
  // %331 (batched over dims 0 and 1) -> 1x4x384x32, then undo the earlier
  // transpose and merge the 4x32 groups back into 128 features.
  %357 = "mhlo.dot_general"(%356, %331) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %358 = "mhlo.transpose"(%357) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %359 = "mhlo.reshape"(%358) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  // 128x128 dense layer (dot + bias %cst_772) on the merged result.
  %360 = "mhlo.dot"(%359, %cst_773) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %361 = chlo.broadcast_add %360, %cst_772 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %362 = "mhlo.reshape"(%361) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Third 512->128 projection of the flattened layer input %327 (dot + bias),
  // followed by per-channel scale (%cst_784) and shift (%cst_772).
  // NOTE(review): the shift reuses %cst_772, the same constant used as the
  // dense bias for %361. This may simply be de-duplication of identical
  // constants; it cannot be determined from this excerpt - confirm.
  %363 = "mhlo.dot"(%327, %cst_786) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %364 = chlo.broadcast_add %363, %cst_785 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %365 = "mhlo.reshape"(%364) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %366 = chlo.broadcast_multiply %365, %cst_784 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %367 = chlo.broadcast_add %366, %cst_772 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Sum the dense output %362 with the projected input %367 (residual-style
  // add), then per-channel scale (%cst_771) and shift (%cst_770) along dim 2.
  %368 = chlo.broadcast_add %362, %367 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %369 = chlo.broadcast_multiply %368, %cst_771 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %370 = chlo.broadcast_add %369, %cst_770 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Four consecutive stages with the same structure. Each stage: 128 -> 512
  // matmul + bias, element-wise maximum with scalar %5, 512 -> 128 matmul + bias,
  // add the stage input back, then a per-channel scale and shift.
  // NOTE(review): the maximum is a ReLU only if %5 is 0.0; %5 is defined outside
  // this chunk — confirm.
  // Stage 1 of 4: input %370, output %382.
  %371 = "mhlo.reshape"(%370) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %372 = "mhlo.dot"(%371, %cst_788) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %373 = chlo.broadcast_add %372, %cst_787 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %374 = "mhlo.reshape"(%373) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %375 = chlo.broadcast_maximum %374, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %376 = "mhlo.reshape"(%375) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %377 = "mhlo.dot"(%376, %cst_792) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %378 = chlo.broadcast_add %377, %cst_791 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %379 = "mhlo.reshape"(%378) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %380 = chlo.broadcast_add %379, %370 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %381 = chlo.broadcast_multiply %380, %cst_790 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %382 = chlo.broadcast_add %381, %cst_789 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 2 of 4: input %382, output %394.
  %383 = "mhlo.reshape"(%382) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %384 = "mhlo.dot"(%383, %cst_794) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %385 = chlo.broadcast_add %384, %cst_793 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %386 = "mhlo.reshape"(%385) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %387 = chlo.broadcast_maximum %386, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %388 = "mhlo.reshape"(%387) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %389 = "mhlo.dot"(%388, %cst_798) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %390 = chlo.broadcast_add %389, %cst_797 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %391 = "mhlo.reshape"(%390) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %392 = chlo.broadcast_add %391, %382 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %393 = chlo.broadcast_multiply %392, %cst_796 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %394 = chlo.broadcast_add %393, %cst_795 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 3 of 4: input %394, output %406.
  %395 = "mhlo.reshape"(%394) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %396 = "mhlo.dot"(%395, %cst_800) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %397 = chlo.broadcast_add %396, %cst_799 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %398 = "mhlo.reshape"(%397) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %399 = chlo.broadcast_maximum %398, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %400 = "mhlo.reshape"(%399) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %401 = "mhlo.dot"(%400, %cst_804) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %402 = chlo.broadcast_add %401, %cst_803 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %403 = "mhlo.reshape"(%402) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %404 = chlo.broadcast_add %403, %394 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %405 = chlo.broadcast_multiply %404, %cst_802 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %406 = chlo.broadcast_add %405, %cst_801 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 4 of 4: input %406, output %418.
  %407 = "mhlo.reshape"(%406) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %408 = "mhlo.dot"(%407, %cst_806) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %409 = chlo.broadcast_add %408, %cst_805 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %410 = "mhlo.reshape"(%409) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %411 = chlo.broadcast_maximum %410, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %412 = "mhlo.reshape"(%411) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %413 = "mhlo.dot"(%412, %cst_814) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %414 = chlo.broadcast_add %413, %cst_813 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %415 = "mhlo.reshape"(%414) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %416 = chlo.broadcast_add %415, %406 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %417 = chlo.broadcast_multiply %416, %cst_808 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %418 = chlo.broadcast_add %417, %cst_807 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Projects %418 from 128 back to 512 channels (matmul %cst_812 + bias %cst_811),
  // adds the 512-wide tensor %326, then applies per-channel scale %cst_810 and
  // shift %cst_809. The result %425 is 1x384x512.
  %419 = "mhlo.reshape"(%418) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %420 = "mhlo.dot"(%419, %cst_812) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %421 = chlo.broadcast_add %420, %cst_811 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %422 = "mhlo.reshape"(%421) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %423 = chlo.broadcast_add %422, %326 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %424 = chlo.broadcast_multiply %423, %cst_810 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %425 = chlo.broadcast_add %424, %cst_809 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  // Builds three 1x4x384x32 tensors from %425 and combines two of them into a
  // 1x4x384x384 score tensor.
  // NOTE(review): the dataflow looks like multi-head attention with 4 heads of
  // width 32 (%445 = query, %441 = key, %430 = value) — naming is inferred, confirm.
  // Flatten %425 to 384x512; %426 feeds three separate matmuls.
  %426 = "mhlo.reshape"(%425) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  // 512 -> 128 matmul + bias directly on %426, split into 4x32 and transposed
  // to 1x4x384x32. %430 is consumed by the dot_general that produces %456.
  %427 = "mhlo.dot"(%426, %cst_824) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %428 = chlo.broadcast_add %427, %cst_823 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %429 = "mhlo.reshape"(%428) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %430 = "mhlo.transpose"(%429) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // 512 -> 128 matmul, bias %cst_827 added via an explicit broadcast_in_dim,
  // then per-channel scale %cst_826 and shift %cst_825. %437 is 384x128.
  %431 = "mhlo.dot"(%426, %cst_828) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %432 = "mhlo.reshape"(%431) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %433 = "mhlo.broadcast_in_dim"(%cst_827) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %434 = mhlo.add %432, %433 : tensor<1x384x128xf32>
  %435 = chlo.broadcast_multiply %434, %cst_826 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %436 = chlo.broadcast_add %435, %cst_825 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %437 = "mhlo.reshape"(%436) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  // First 128x128 projection of %437, split into 4x32 (right operand of the scores).
  %438 = "mhlo.dot"(%437, %cst_820) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %439 = chlo.broadcast_add %438, %cst_819 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %440 = "mhlo.reshape"(%439) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %441 = "mhlo.transpose"(%440) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Second 128x128 projection of %437, split into 4x32 (left operand of the scores).
  %442 = "mhlo.dot"(%437, %cst_822) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %443 = chlo.broadcast_add %442, %cst_821 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %444 = "mhlo.reshape"(%443) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %445 = "mhlo.transpose"(%444) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Contract dimension 3 (size 32) of both operands, batched over dims 0 and 1,
  // producing a 384x384 matrix per batch entry.
  %446 = "mhlo.dot_general"(%445, %441) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  // Multiply by scalar %2, then add %24 (1x1x384x384, broadcast over dimension 1).
  // NOTE(review): %2 and %24 are defined outside this chunk; presumably a
  // scaling factor and an additive mask — confirm.
  %447 = chlo.broadcast_multiply %446, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %448 = chlo.broadcast_add %447, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Normalizes %448 (1x4x384x384) along dimension 3 using the
  // max / subtract / exp / sum / divide sequence of a numerically stable softmax.
  // NOTE(review): %4 and %5 are defined outside this chunk; reading this as a
  // softmax assumes they are the identity values for max and add — confirm.
  // Row-wise maximum over dimension 3.
  %449 = "mhlo.reduce"(%448, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  // Re-add a trailing unit dimension so the maximum broadcasts against %448.
  %450 = linalg.tensor_expand_shape %449 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  // Subtract the row maximum, then exponentiate element-wise.
  %451 = chlo.broadcast_subtract %448, %450 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %452 = "mhlo.exponential"(%451) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Row-wise sum of the exponentials over dimension 3.
  %453 = "mhlo.reduce"(%452, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  // Divide each exponential by its row sum.
  %454 = linalg.tensor_expand_shape %453 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %455 = chlo.broadcast_divide %452, %454 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  // Contracts dimension 3 of the normalized weights %455 with dimension 2 of
  // %430, batched over dimensions 0 and 1: 1x4x384x384 * 1x4x384x32 -> 1x4x384x32.
  %456 = "mhlo.dot_general"(%455, %430) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  // Swap dimensions 1 and 2, then flatten the 4x32 pair into 128 columns.
  %457 = "mhlo.transpose"(%456) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %458 = "mhlo.reshape"(%457) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  // 128x128 matmul plus per-column bias %cst_817, reshaped back to 1x384x128.
  %459 = "mhlo.dot"(%458, %cst_818) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %460 = chlo.broadcast_add %459, %cst_817 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %461 = "mhlo.reshape"(%460) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Second branch from %426 (384x512): 512 -> 128 matmul with bias %cst_830,
  // then a per-channel scale (%cst_829) and shift.
  // NOTE(review): the shift operand is %cst_817, the same constant used as the
  // bias on %459 above — presumably identical constants were deduplicated; confirm.
  %462 = "mhlo.dot"(%426, %cst_831) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %463 = chlo.broadcast_add %462, %cst_830 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %464 = "mhlo.reshape"(%463) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %465 = chlo.broadcast_multiply %464, %cst_829 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %466 = chlo.broadcast_add %465, %cst_817 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Sum the two branches, then apply per-channel scale %cst_816 and shift %cst_815.
  %467 = chlo.broadcast_add %461, %466 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %468 = chlo.broadcast_multiply %467, %cst_816 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %469 = chlo.broadcast_add %468, %cst_815 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Four consecutive stages with the same structure. Each stage: 128 -> 512
  // matmul + bias, element-wise maximum with scalar %5, 512 -> 128 matmul + bias,
  // add the stage input back, then a per-channel scale and shift.
  // NOTE(review): the maximum is a ReLU only if %5 is 0.0; %5 is defined outside
  // this chunk — confirm.
  // Stage 1 of 4: input %469, output %481.
  %470 = "mhlo.reshape"(%469) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %471 = "mhlo.dot"(%470, %cst_833) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %472 = chlo.broadcast_add %471, %cst_832 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %473 = "mhlo.reshape"(%472) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %474 = chlo.broadcast_maximum %473, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %475 = "mhlo.reshape"(%474) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %476 = "mhlo.dot"(%475, %cst_837) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %477 = chlo.broadcast_add %476, %cst_836 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %478 = "mhlo.reshape"(%477) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %479 = chlo.broadcast_add %478, %469 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %480 = chlo.broadcast_multiply %479, %cst_835 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %481 = chlo.broadcast_add %480, %cst_834 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 2 of 4: input %481, output %493.
  %482 = "mhlo.reshape"(%481) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %483 = "mhlo.dot"(%482, %cst_839) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %484 = chlo.broadcast_add %483, %cst_838 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %485 = "mhlo.reshape"(%484) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %486 = chlo.broadcast_maximum %485, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %487 = "mhlo.reshape"(%486) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %488 = "mhlo.dot"(%487, %cst_843) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %489 = chlo.broadcast_add %488, %cst_842 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %490 = "mhlo.reshape"(%489) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %491 = chlo.broadcast_add %490, %481 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %492 = chlo.broadcast_multiply %491, %cst_841 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %493 = chlo.broadcast_add %492, %cst_840 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 3 of 4: input %493, output %505.
  %494 = "mhlo.reshape"(%493) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %495 = "mhlo.dot"(%494, %cst_845) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %496 = chlo.broadcast_add %495, %cst_844 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %497 = "mhlo.reshape"(%496) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %498 = chlo.broadcast_maximum %497, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %499 = "mhlo.reshape"(%498) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %500 = "mhlo.dot"(%499, %cst_849) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %501 = chlo.broadcast_add %500, %cst_848 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %502 = "mhlo.reshape"(%501) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %503 = chlo.broadcast_add %502, %493 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %504 = chlo.broadcast_multiply %503, %cst_847 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %505 = chlo.broadcast_add %504, %cst_846 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 4 of 4: input %505, output %517.
  %506 = "mhlo.reshape"(%505) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %507 = "mhlo.dot"(%506, %cst_851) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %508 = chlo.broadcast_add %507, %cst_850 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %509 = "mhlo.reshape"(%508) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %510 = chlo.broadcast_maximum %509, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %511 = "mhlo.reshape"(%510) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %512 = "mhlo.dot"(%511, %cst_859) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %513 = chlo.broadcast_add %512, %cst_858 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %514 = "mhlo.reshape"(%513) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %515 = chlo.broadcast_add %514, %505 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %516 = chlo.broadcast_multiply %515, %cst_853 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %517 = chlo.broadcast_add %516, %cst_852 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Projects %517 from 128 back to 512 channels (matmul %cst_857 + bias %cst_856),
  // adds the 512-wide tensor %425, then applies per-channel scale %cst_855 and
  // shift %cst_854. The result %524 is 1x384x512.
  %518 = "mhlo.reshape"(%517) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %519 = "mhlo.dot"(%518, %cst_857) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %520 = chlo.broadcast_add %519, %cst_856 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %521 = "mhlo.reshape"(%520) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %522 = chlo.broadcast_add %521, %425 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %523 = chlo.broadcast_multiply %522, %cst_855 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %524 = chlo.broadcast_add %523, %cst_854 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  // Builds three 1x4x384x32 tensors from %524 and combines two of them into a
  // 1x4x384x384 score tensor.
  // NOTE(review): the dataflow looks like multi-head attention with 4 heads of
  // width 32 (%544 = query, %540 = key, %529 = value) — naming is inferred, confirm.
  // Flatten %524 to 384x512; %525 feeds three separate matmuls.
  %525 = "mhlo.reshape"(%524) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  // 512 -> 128 matmul + bias directly on %525, split into 4x32 and transposed
  // to 1x4x384x32. %529 is consumed by the dot_general that produces %555.
  %526 = "mhlo.dot"(%525, %cst_869) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %527 = chlo.broadcast_add %526, %cst_868 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %528 = "mhlo.reshape"(%527) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %529 = "mhlo.transpose"(%528) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // 512 -> 128 matmul, bias %cst_872 added via an explicit broadcast_in_dim,
  // then per-channel scale %cst_871 and shift %cst_870. %536 is 384x128.
  %530 = "mhlo.dot"(%525, %cst_873) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %531 = "mhlo.reshape"(%530) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %532 = "mhlo.broadcast_in_dim"(%cst_872) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %533 = mhlo.add %531, %532 : tensor<1x384x128xf32>
  %534 = chlo.broadcast_multiply %533, %cst_871 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %535 = chlo.broadcast_add %534, %cst_870 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %536 = "mhlo.reshape"(%535) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  // First 128x128 projection of %536, split into 4x32 (right operand of the scores).
  %537 = "mhlo.dot"(%536, %cst_865) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %538 = chlo.broadcast_add %537, %cst_864 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %539 = "mhlo.reshape"(%538) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %540 = "mhlo.transpose"(%539) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Second 128x128 projection of %536, split into 4x32 (left operand of the scores).
  %541 = "mhlo.dot"(%536, %cst_867) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %542 = chlo.broadcast_add %541, %cst_866 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %543 = "mhlo.reshape"(%542) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %544 = "mhlo.transpose"(%543) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  // Contract dimension 3 (size 32) of both operands, batched over dims 0 and 1,
  // producing a 384x384 matrix per batch entry.
  %545 = "mhlo.dot_general"(%544, %540) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  // Multiply by scalar %2, then add %24 (1x1x384x384, broadcast over dimension 1).
  // NOTE(review): %2 and %24 are defined outside this chunk; presumably a
  // scaling factor and an additive mask — confirm.
  %546 = chlo.broadcast_multiply %545, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %547 = chlo.broadcast_add %546, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Normalizes %547 (1x4x384x384) along dimension 3 using the
  // max / subtract / exp / sum / divide sequence of a numerically stable softmax.
  // NOTE(review): %4 and %5 are defined outside this chunk; reading this as a
  // softmax assumes they are the identity values for max and add — confirm.
  // Row-wise maximum over dimension 3.
  %548 = "mhlo.reduce"(%547, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  // Re-add a trailing unit dimension so the maximum broadcasts against %547.
  %549 = linalg.tensor_expand_shape %548 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  // Subtract the row maximum, then exponentiate element-wise.
  %550 = chlo.broadcast_subtract %547, %549 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %551 = "mhlo.exponential"(%550) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Row-wise sum of the exponentials over dimension 3.
  %552 = "mhlo.reduce"(%551, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  // Divide each exponential by its row sum.
  %553 = linalg.tensor_expand_shape %552 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %554 = chlo.broadcast_divide %551, %553 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  // Contracts dimension 3 of the normalized weights %554 with dimension 2 of
  // %529, batched over dimensions 0 and 1: 1x4x384x384 * 1x4x384x32 -> 1x4x384x32.
  %555 = "mhlo.dot_general"(%554, %529) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  // Swap dimensions 1 and 2, then flatten the 4x32 pair into 128 columns.
  %556 = "mhlo.transpose"(%555) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %557 = "mhlo.reshape"(%556) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  // 128x128 matmul plus per-column bias %cst_862, reshaped back to 1x384x128.
  %558 = "mhlo.dot"(%557, %cst_863) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %559 = chlo.broadcast_add %558, %cst_862 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %560 = "mhlo.reshape"(%559) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  // Second branch from %525 (384x512): 512 -> 128 matmul with bias %cst_875,
  // then a per-channel scale (%cst_874) and shift.
  // NOTE(review): the shift operand is %cst_862, the same constant used as the
  // bias on %558 above — presumably identical constants were deduplicated; confirm.
  %561 = "mhlo.dot"(%525, %cst_876) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %562 = chlo.broadcast_add %561, %cst_875 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %563 = "mhlo.reshape"(%562) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %564 = chlo.broadcast_multiply %563, %cst_874 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %565 = chlo.broadcast_add %564, %cst_862 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Sum the two branches, then apply per-channel scale %cst_861 and shift %cst_860.
  %566 = chlo.broadcast_add %560, %565 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %567 = chlo.broadcast_multiply %566, %cst_861 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %568 = chlo.broadcast_add %567, %cst_860 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Four consecutive stages with the same structure. Each stage: 128 -> 512
  // matmul + bias, element-wise maximum with scalar %5, 512 -> 128 matmul + bias,
  // add the stage input back, then a per-channel scale and shift.
  // NOTE(review): the maximum is a ReLU only if %5 is 0.0; %5 is defined outside
  // this chunk — confirm.
  // Stage 1 of 4: input %568, output %580.
  %569 = "mhlo.reshape"(%568) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %570 = "mhlo.dot"(%569, %cst_878) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %571 = chlo.broadcast_add %570, %cst_877 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %572 = "mhlo.reshape"(%571) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %573 = chlo.broadcast_maximum %572, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %574 = "mhlo.reshape"(%573) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %575 = "mhlo.dot"(%574, %cst_882) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %576 = chlo.broadcast_add %575, %cst_881 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %577 = "mhlo.reshape"(%576) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %578 = chlo.broadcast_add %577, %568 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %579 = chlo.broadcast_multiply %578, %cst_880 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %580 = chlo.broadcast_add %579, %cst_879 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 2 of 4: input %580, output %592.
  %581 = "mhlo.reshape"(%580) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %582 = "mhlo.dot"(%581, %cst_884) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %583 = chlo.broadcast_add %582, %cst_883 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %584 = "mhlo.reshape"(%583) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %585 = chlo.broadcast_maximum %584, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %586 = "mhlo.reshape"(%585) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %587 = "mhlo.dot"(%586, %cst_888) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %588 = chlo.broadcast_add %587, %cst_887 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %589 = "mhlo.reshape"(%588) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %590 = chlo.broadcast_add %589, %580 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %591 = chlo.broadcast_multiply %590, %cst_886 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %592 = chlo.broadcast_add %591, %cst_885 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 3 of 4: input %592, output %604.
  %593 = "mhlo.reshape"(%592) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %594 = "mhlo.dot"(%593, %cst_890) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %595 = chlo.broadcast_add %594, %cst_889 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %596 = "mhlo.reshape"(%595) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %597 = chlo.broadcast_maximum %596, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %598 = "mhlo.reshape"(%597) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %599 = "mhlo.dot"(%598, %cst_894) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %600 = chlo.broadcast_add %599, %cst_893 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %601 = "mhlo.reshape"(%600) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %602 = chlo.broadcast_add %601, %592 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %603 = chlo.broadcast_multiply %602, %cst_892 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %604 = chlo.broadcast_add %603, %cst_891 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  // Stage 4 of 4: input %604, output %616.
  %605 = "mhlo.reshape"(%604) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %606 = "mhlo.dot"(%605, %cst_896) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %607 = chlo.broadcast_add %606, %cst_895 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %608 = "mhlo.reshape"(%607) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %609 = chlo.broadcast_maximum %608, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %610 = "mhlo.reshape"(%609) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %611 = "mhlo.dot"(%610, %cst_904) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %612 = chlo.broadcast_add %611, %cst_903 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %613 = "mhlo.reshape"(%612) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %614 = chlo.broadcast_add %613, %604 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %615 = chlo.broadcast_multiply %614, %cst_898 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %616 = chlo.broadcast_add %615, %cst_897 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %617 = "mhlo.reshape"(%616) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %618 = "mhlo.dot"(%617, %cst_902) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %619 = chlo.broadcast_add %618, %cst_901 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %620 = "mhlo.reshape"(%619) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %621 = chlo.broadcast_add %620, %524 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %622 = chlo.broadcast_multiply %621, %cst_900 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %623 = chlo.broadcast_add %622, %cst_899 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %624 = "mhlo.reshape"(%623) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %625 = "mhlo.dot"(%624, %cst_914) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %626 = chlo.broadcast_add %625, %cst_913 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %627 = "mhlo.reshape"(%626) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %628 = "mhlo.transpose"(%627) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %629 = "mhlo.dot"(%624, %cst_918) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %630 = "mhlo.reshape"(%629) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %631 = "mhlo.broadcast_in_dim"(%cst_917) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %632 = mhlo.add %630, %631 : tensor<1x384x128xf32>
  %633 = chlo.broadcast_multiply %632, %cst_916 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %634 = chlo.broadcast_add %633, %cst_915 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %635 = "mhlo.reshape"(%634) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %636 = "mhlo.dot"(%635, %cst_910) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %637 = chlo.broadcast_add %636, %cst_909 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %638 = "mhlo.reshape"(%637) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %639 = "mhlo.transpose"(%638) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %640 = "mhlo.dot"(%635, %cst_912) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %641 = chlo.broadcast_add %640, %cst_911 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %642 = "mhlo.reshape"(%641) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %643 = "mhlo.transpose"(%642) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %644 = "mhlo.dot_general"(%643, %639) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %645 = chlo.broadcast_multiply %644, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %646 = chlo.broadcast_add %645, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %647 = "mhlo.reduce"(%646, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %648 = linalg.tensor_expand_shape %647 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %649 = chlo.broadcast_subtract %646, %648 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %650 = "mhlo.exponential"(%649) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %651 = "mhlo.reduce"(%650, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %652 = linalg.tensor_expand_shape %651 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %653 = chlo.broadcast_divide %650, %652 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %654 = "mhlo.dot_general"(%653, %628) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %655 = "mhlo.transpose"(%654) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %656 = "mhlo.reshape"(%655) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %657 = "mhlo.dot"(%656, %cst_908) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %658 = chlo.broadcast_add %657, %cst_907 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %659 = "mhlo.reshape"(%658) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %660 = "mhlo.dot"(%624, %cst_921) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %661 = chlo.broadcast_add %660, %cst_920 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %662 = "mhlo.reshape"(%661) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %663 = chlo.broadcast_multiply %662, %cst_919 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %664 = chlo.broadcast_add %663, %cst_907 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %665 = chlo.broadcast_add %659, %664 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %666 = chlo.broadcast_multiply %665, %cst_906 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %667 = chlo.broadcast_add %666, %cst_905 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %668 = "mhlo.reshape"(%667) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %669 = "mhlo.dot"(%668, %cst_923) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %670 = chlo.broadcast_add %669, %cst_922 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %671 = "mhlo.reshape"(%670) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %672 = chlo.broadcast_maximum %671, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %673 = "mhlo.reshape"(%672) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %674 = "mhlo.dot"(%673, %cst_927) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %675 = chlo.broadcast_add %674, %cst_926 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %676 = "mhlo.reshape"(%675) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %677 = chlo.broadcast_add %676, %667 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %678 = chlo.broadcast_multiply %677, %cst_925 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %679 = chlo.broadcast_add %678, %cst_924 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %680 = "mhlo.reshape"(%679) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %681 = "mhlo.dot"(%680, %cst_929) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %682 = chlo.broadcast_add %681, %cst_928 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %683 = "mhlo.reshape"(%682) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %684 = chlo.broadcast_maximum %683, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %685 = "mhlo.reshape"(%684) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %686 = "mhlo.dot"(%685, %cst_933) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %687 = chlo.broadcast_add %686, %cst_932 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %688 = "mhlo.reshape"(%687) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %689 = chlo.broadcast_add %688, %679 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %690 = chlo.broadcast_multiply %689, %cst_931 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %691 = chlo.broadcast_add %690, %cst_930 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %692 = "mhlo.reshape"(%691) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %693 = "mhlo.dot"(%692, %cst_935) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %694 = chlo.broadcast_add %693, %cst_934 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %695 = "mhlo.reshape"(%694) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %696 = chlo.broadcast_maximum %695, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %697 = "mhlo.reshape"(%696) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %698 = "mhlo.dot"(%697, %cst_939) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %699 = chlo.broadcast_add %698, %cst_938 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %700 = "mhlo.reshape"(%699) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %701 = chlo.broadcast_add %700, %691 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %702 = chlo.broadcast_multiply %701, %cst_937 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %703 = chlo.broadcast_add %702, %cst_936 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %704 = "mhlo.reshape"(%703) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %705 = "mhlo.dot"(%704, %cst_941) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %706 = chlo.broadcast_add %705, %cst_940 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %707 = "mhlo.reshape"(%706) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %708 = chlo.broadcast_maximum %707, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %709 = "mhlo.reshape"(%708) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %710 = "mhlo.dot"(%709, %cst_949) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %711 = chlo.broadcast_add %710, %cst_948 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %712 = "mhlo.reshape"(%711) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %713 = chlo.broadcast_add %712, %703 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %714 = chlo.broadcast_multiply %713, %cst_943 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %715 = chlo.broadcast_add %714, %cst_942 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %716 = "mhlo.reshape"(%715) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %717 = "mhlo.dot"(%716, %cst_947) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %718 = chlo.broadcast_add %717, %cst_946 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %719 = "mhlo.reshape"(%718) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %720 = chlo.broadcast_add %719, %623 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %721 = chlo.broadcast_multiply %720, %cst_945 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %722 = chlo.broadcast_add %721, %cst_944 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %723 = "mhlo.reshape"(%722) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %724 = "mhlo.dot"(%723, %cst_959) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %725 = chlo.broadcast_add %724, %cst_958 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %726 = "mhlo.reshape"(%725) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %727 = "mhlo.transpose"(%726) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %728 = "mhlo.dot"(%723, %cst_963) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %729 = "mhlo.reshape"(%728) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %730 = "mhlo.broadcast_in_dim"(%cst_962) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %731 = mhlo.add %729, %730 : tensor<1x384x128xf32>
  %732 = chlo.broadcast_multiply %731, %cst_961 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %733 = chlo.broadcast_add %732, %cst_960 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %734 = "mhlo.reshape"(%733) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %735 = "mhlo.dot"(%734, %cst_955) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %736 = chlo.broadcast_add %735, %cst_954 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %737 = "mhlo.reshape"(%736) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %738 = "mhlo.transpose"(%737) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %739 = "mhlo.dot"(%734, %cst_957) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %740 = chlo.broadcast_add %739, %cst_956 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %741 = "mhlo.reshape"(%740) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %742 = "mhlo.transpose"(%741) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %743 = "mhlo.dot_general"(%742, %738) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %744 = chlo.broadcast_multiply %743, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %745 = chlo.broadcast_add %744, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %746 = "mhlo.reduce"(%745, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %747 = linalg.tensor_expand_shape %746 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %748 = chlo.broadcast_subtract %745, %747 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %749 = "mhlo.exponential"(%748) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %750 = "mhlo.reduce"(%749, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %751 = linalg.tensor_expand_shape %750 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %752 = chlo.broadcast_divide %749, %751 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %753 = "mhlo.dot_general"(%752, %727) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %754 = "mhlo.transpose"(%753) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %755 = "mhlo.reshape"(%754) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %756 = "mhlo.dot"(%755, %cst_953) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %757 = chlo.broadcast_add %756, %cst_952 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %758 = "mhlo.reshape"(%757) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %759 = "mhlo.dot"(%723, %cst_966) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %760 = chlo.broadcast_add %759, %cst_965 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %761 = "mhlo.reshape"(%760) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %762 = chlo.broadcast_multiply %761, %cst_964 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %763 = chlo.broadcast_add %762, %cst_952 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %764 = chlo.broadcast_add %758, %763 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %765 = chlo.broadcast_multiply %764, %cst_951 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %766 = chlo.broadcast_add %765, %cst_950 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %767 = "mhlo.reshape"(%766) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %768 = "mhlo.dot"(%767, %cst_968) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %769 = chlo.broadcast_add %768, %cst_967 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %770 = "mhlo.reshape"(%769) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %771 = chlo.broadcast_maximum %770, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %772 = "mhlo.reshape"(%771) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %773 = "mhlo.dot"(%772, %cst_972) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %774 = chlo.broadcast_add %773, %cst_971 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %775 = "mhlo.reshape"(%774) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %776 = chlo.broadcast_add %775, %766 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %777 = chlo.broadcast_multiply %776, %cst_970 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %778 = chlo.broadcast_add %777, %cst_969 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %779 = "mhlo.reshape"(%778) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %780 = "mhlo.dot"(%779, %cst_974) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %781 = chlo.broadcast_add %780, %cst_973 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %782 = "mhlo.reshape"(%781) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %783 = chlo.broadcast_maximum %782, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %784 = "mhlo.reshape"(%783) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %785 = "mhlo.dot"(%784, %cst_978) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %786 = chlo.broadcast_add %785, %cst_977 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %787 = "mhlo.reshape"(%786) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %788 = chlo.broadcast_add %787, %778 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %789 = chlo.broadcast_multiply %788, %cst_976 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %790 = chlo.broadcast_add %789, %cst_975 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %791 = "mhlo.reshape"(%790) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %792 = "mhlo.dot"(%791, %cst_980) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %793 = chlo.broadcast_add %792, %cst_979 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %794 = "mhlo.reshape"(%793) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %795 = chlo.broadcast_maximum %794, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %796 = "mhlo.reshape"(%795) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %797 = "mhlo.dot"(%796, %cst_984) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %798 = chlo.broadcast_add %797, %cst_983 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %799 = "mhlo.reshape"(%798) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %800 = chlo.broadcast_add %799, %790 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %801 = chlo.broadcast_multiply %800, %cst_982 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %802 = chlo.broadcast_add %801, %cst_981 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %803 = "mhlo.reshape"(%802) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %804 = "mhlo.dot"(%803, %cst_986) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %805 = chlo.broadcast_add %804, %cst_985 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %806 = "mhlo.reshape"(%805) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %807 = chlo.broadcast_maximum %806, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %808 = "mhlo.reshape"(%807) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %809 = "mhlo.dot"(%808, %cst_994) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %810 = chlo.broadcast_add %809, %cst_993 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %811 = "mhlo.reshape"(%810) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %812 = chlo.broadcast_add %811, %802 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %813 = chlo.broadcast_multiply %812, %cst_988 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %814 = chlo.broadcast_add %813, %cst_987 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %815 = "mhlo.reshape"(%814) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %816 = "mhlo.dot"(%815, %cst_992) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %817 = chlo.broadcast_add %816, %cst_991 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %818 = "mhlo.reshape"(%817) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %819 = chlo.broadcast_add %818, %722 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %820 = chlo.broadcast_multiply %819, %cst_990 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %821 = chlo.broadcast_add %820, %cst_989 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %822 = "mhlo.reshape"(%821) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %823 = "mhlo.dot"(%822, %cst_1004) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %824 = chlo.broadcast_add %823, %cst_1003 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %825 = "mhlo.reshape"(%824) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %826 = "mhlo.transpose"(%825) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %827 = "mhlo.dot"(%822, %cst_1008) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %828 = "mhlo.reshape"(%827) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %829 = "mhlo.broadcast_in_dim"(%cst_1007) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %830 = mhlo.add %828, %829 : tensor<1x384x128xf32>
  %831 = chlo.broadcast_multiply %830, %cst_1006 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %832 = chlo.broadcast_add %831, %cst_1005 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %833 = "mhlo.reshape"(%832) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %834 = "mhlo.dot"(%833, %cst_1000) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %835 = chlo.broadcast_add %834, %cst_999 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %836 = "mhlo.reshape"(%835) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %837 = "mhlo.transpose"(%836) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %838 = "mhlo.dot"(%833, %cst_1002) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %839 = chlo.broadcast_add %838, %cst_1001 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %840 = "mhlo.reshape"(%839) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %841 = "mhlo.transpose"(%840) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %842 = "mhlo.dot_general"(%841, %837) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %843 = chlo.broadcast_multiply %842, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %844 = chlo.broadcast_add %843, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %845 = "mhlo.reduce"(%844, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %846 = linalg.tensor_expand_shape %845 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %847 = chlo.broadcast_subtract %844, %846 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %848 = "mhlo.exponential"(%847) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %849 = "mhlo.reduce"(%848, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %850 = linalg.tensor_expand_shape %849 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %851 = chlo.broadcast_divide %848, %850 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %852 = "mhlo.dot_general"(%851, %826) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %853 = "mhlo.transpose"(%852) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %854 = "mhlo.reshape"(%853) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %855 = "mhlo.dot"(%854, %cst_998) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %856 = chlo.broadcast_add %855, %cst_997 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %857 = "mhlo.reshape"(%856) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %858 = "mhlo.dot"(%822, %cst_1011) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %859 = chlo.broadcast_add %858, %cst_1010 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %860 = "mhlo.reshape"(%859) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %861 = chlo.broadcast_multiply %860, %cst_1009 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %862 = chlo.broadcast_add %861, %cst_997 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %863 = chlo.broadcast_add %857, %862 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %864 = chlo.broadcast_multiply %863, %cst_996 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %865 = chlo.broadcast_add %864, %cst_995 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %866 = "mhlo.reshape"(%865) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %867 = "mhlo.dot"(%866, %cst_1013) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %868 = chlo.broadcast_add %867, %cst_1012 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %869 = "mhlo.reshape"(%868) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %870 = chlo.broadcast_maximum %869, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %871 = "mhlo.reshape"(%870) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %872 = "mhlo.dot"(%871, %cst_1017) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %873 = chlo.broadcast_add %872, %cst_1016 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %874 = "mhlo.reshape"(%873) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %875 = chlo.broadcast_add %874, %865 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %876 = chlo.broadcast_multiply %875, %cst_1015 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %877 = chlo.broadcast_add %876, %cst_1014 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %878 = "mhlo.reshape"(%877) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %879 = "mhlo.dot"(%878, %cst_1019) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %880 = chlo.broadcast_add %879, %cst_1018 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %881 = "mhlo.reshape"(%880) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %882 = chlo.broadcast_maximum %881, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %883 = "mhlo.reshape"(%882) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %884 = "mhlo.dot"(%883, %cst_1023) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %885 = chlo.broadcast_add %884, %cst_1022 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %886 = "mhlo.reshape"(%885) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %887 = chlo.broadcast_add %886, %877 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %888 = chlo.broadcast_multiply %887, %cst_1021 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %889 = chlo.broadcast_add %888, %cst_1020 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %890 = "mhlo.reshape"(%889) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %891 = "mhlo.dot"(%890, %cst_1025) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %892 = chlo.broadcast_add %891, %cst_1024 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %893 = "mhlo.reshape"(%892) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %894 = chlo.broadcast_maximum %893, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %895 = "mhlo.reshape"(%894) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %896 = "mhlo.dot"(%895, %cst_1029) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %897 = chlo.broadcast_add %896, %cst_1028 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %898 = "mhlo.reshape"(%897) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %899 = chlo.broadcast_add %898, %889 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %900 = chlo.broadcast_multiply %899, %cst_1027 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %901 = chlo.broadcast_add %900, %cst_1026 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %902 = "mhlo.reshape"(%901) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %903 = "mhlo.dot"(%902, %cst_1031) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %904 = chlo.broadcast_add %903, %cst_1030 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %905 = "mhlo.reshape"(%904) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %906 = chlo.broadcast_maximum %905, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %907 = "mhlo.reshape"(%906) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %908 = "mhlo.dot"(%907, %cst_1039) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %909 = chlo.broadcast_add %908, %cst_1038 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %910 = "mhlo.reshape"(%909) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %911 = chlo.broadcast_add %910, %901 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %912 = chlo.broadcast_multiply %911, %cst_1033 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %913 = chlo.broadcast_add %912, %cst_1032 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %914 = "mhlo.reshape"(%913) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %915 = "mhlo.dot"(%914, %cst_1037) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %916 = chlo.broadcast_add %915, %cst_1036 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %917 = "mhlo.reshape"(%916) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %918 = chlo.broadcast_add %917, %821 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %919 = chlo.broadcast_multiply %918, %cst_1035 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %920 = chlo.broadcast_add %919, %cst_1034 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %921 = "mhlo.reshape"(%920) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %922 = "mhlo.dot"(%921, %cst_1049) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %923 = chlo.broadcast_add %922, %cst_1048 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %924 = "mhlo.reshape"(%923) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %925 = "mhlo.transpose"(%924) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %926 = "mhlo.dot"(%921, %cst_1053) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %927 = "mhlo.reshape"(%926) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %928 = "mhlo.broadcast_in_dim"(%cst_1052) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %929 = mhlo.add %927, %928 : tensor<1x384x128xf32>
  %930 = chlo.broadcast_multiply %929, %cst_1051 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %931 = chlo.broadcast_add %930, %cst_1050 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %932 = "mhlo.reshape"(%931) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %933 = "mhlo.dot"(%932, %cst_1045) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %934 = chlo.broadcast_add %933, %cst_1044 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %935 = "mhlo.reshape"(%934) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %936 = "mhlo.transpose"(%935) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %937 = "mhlo.dot"(%932, %cst_1047) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %938 = chlo.broadcast_add %937, %cst_1046 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %939 = "mhlo.reshape"(%938) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %940 = "mhlo.transpose"(%939) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %941 = "mhlo.dot_general"(%940, %936) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %942 = chlo.broadcast_multiply %941, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %943 = chlo.broadcast_add %942, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Max-reduction of %943 along dimension 3:
  // tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
  // %4 is the reduction's init value; it is defined outside this view
  // (presumably -inf / lowest f32 -- TODO confirm). The result is expanded
  // to 1x4x384x1 and subtracted from %943 before the exponential below.
  %944 = "mhlo.reduce"(%943, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    // Combiner: maximum of the two scalar operands.
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %945 = linalg.tensor_expand_shape %944 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %946 = chlo.broadcast_subtract %943, %945 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %947 = "mhlo.exponential"(%946) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Add-reduction of %947 (the mhlo.exponential result above) along
  // dimension 3: tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
  // %5 is the reduction's init value; it is defined outside this view
  // (presumably 0.0 -- TODO confirm). The result is expanded to 1x4x384x1
  // and used as the divisor of %947 in the ops that follow.
  %948 = "mhlo.reduce"(%947, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    // Combiner: add the two scalar operands.
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %949 = linalg.tensor_expand_shape %948 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %950 = chlo.broadcast_divide %947, %949 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %951 = "mhlo.dot_general"(%950, %925) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %952 = "mhlo.transpose"(%951) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %953 = "mhlo.reshape"(%952) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %954 = "mhlo.dot"(%953, %cst_1043) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %955 = chlo.broadcast_add %954, %cst_1042 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %956 = "mhlo.reshape"(%955) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %957 = "mhlo.dot"(%921, %cst_1056) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %958 = chlo.broadcast_add %957, %cst_1055 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %959 = "mhlo.reshape"(%958) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %960 = chlo.broadcast_multiply %959, %cst_1054 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %961 = chlo.broadcast_add %960, %cst_1042 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %962 = chlo.broadcast_add %956, %961 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %963 = chlo.broadcast_multiply %962, %cst_1041 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %964 = chlo.broadcast_add %963, %cst_1040 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %965 = "mhlo.reshape"(%964) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %966 = "mhlo.dot"(%965, %cst_1058) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %967 = chlo.broadcast_add %966, %cst_1057 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %968 = "mhlo.reshape"(%967) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %969 = chlo.broadcast_maximum %968, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %970 = "mhlo.reshape"(%969) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %971 = "mhlo.dot"(%970, %cst_1062) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %972 = chlo.broadcast_add %971, %cst_1061 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %973 = "mhlo.reshape"(%972) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %974 = chlo.broadcast_add %973, %964 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %975 = chlo.broadcast_multiply %974, %cst_1060 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %976 = chlo.broadcast_add %975, %cst_1059 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %977 = "mhlo.reshape"(%976) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %978 = "mhlo.dot"(%977, %cst_1064) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %979 = chlo.broadcast_add %978, %cst_1063 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %980 = "mhlo.reshape"(%979) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %981 = chlo.broadcast_maximum %980, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %982 = "mhlo.reshape"(%981) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %983 = "mhlo.dot"(%982, %cst_1068) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %984 = chlo.broadcast_add %983, %cst_1067 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %985 = "mhlo.reshape"(%984) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %986 = chlo.broadcast_add %985, %976 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %987 = chlo.broadcast_multiply %986, %cst_1066 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %988 = chlo.broadcast_add %987, %cst_1065 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %989 = "mhlo.reshape"(%988) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %990 = "mhlo.dot"(%989, %cst_1070) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %991 = chlo.broadcast_add %990, %cst_1069 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %992 = "mhlo.reshape"(%991) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %993 = chlo.broadcast_maximum %992, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %994 = "mhlo.reshape"(%993) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %995 = "mhlo.dot"(%994, %cst_1074) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %996 = chlo.broadcast_add %995, %cst_1073 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %997 = "mhlo.reshape"(%996) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %998 = chlo.broadcast_add %997, %988 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %999 = chlo.broadcast_multiply %998, %cst_1072 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1000 = chlo.broadcast_add %999, %cst_1071 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1001 = "mhlo.reshape"(%1000) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1002 = "mhlo.dot"(%1001, %cst_1076) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1003 = chlo.broadcast_add %1002, %cst_1075 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1004 = "mhlo.reshape"(%1003) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1005 = chlo.broadcast_maximum %1004, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1006 = "mhlo.reshape"(%1005) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1007 = "mhlo.dot"(%1006, %cst_1084) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1008 = chlo.broadcast_add %1007, %cst_1083 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1009 = "mhlo.reshape"(%1008) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1010 = chlo.broadcast_add %1009, %1000 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1011 = chlo.broadcast_multiply %1010, %cst_1078 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1012 = chlo.broadcast_add %1011, %cst_1077 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1013 = "mhlo.reshape"(%1012) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1014 = "mhlo.dot"(%1013, %cst_1082) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1015 = chlo.broadcast_add %1014, %cst_1081 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1016 = "mhlo.reshape"(%1015) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1017 = chlo.broadcast_add %1016, %920 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1018 = chlo.broadcast_multiply %1017, %cst_1080 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1019 = chlo.broadcast_add %1018, %cst_1079 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1020 = "mhlo.reshape"(%1019) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1021 = "mhlo.dot"(%1020, %cst_104) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1022 = chlo.broadcast_add %1021, %cst_103 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1023 = "mhlo.reshape"(%1022) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1024 = "mhlo.transpose"(%1023) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1025 = "mhlo.dot"(%1020, %cst_108) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1026 = "mhlo.reshape"(%1025) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1027 = "mhlo.broadcast_in_dim"(%cst_107) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1028 = mhlo.add %1026, %1027 : tensor<1x384x128xf32>
  %1029 = chlo.broadcast_multiply %1028, %cst_106 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1030 = chlo.broadcast_add %1029, %cst_105 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1031 = "mhlo.reshape"(%1030) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1032 = "mhlo.dot"(%1031, %cst_100) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1033 = chlo.broadcast_add %1032, %cst_99 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1034 = "mhlo.reshape"(%1033) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1035 = "mhlo.transpose"(%1034) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1036 = "mhlo.dot"(%1031, %cst_102) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1037 = chlo.broadcast_add %1036, %cst_101 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1038 = "mhlo.reshape"(%1037) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1039 = "mhlo.transpose"(%1038) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1040 = "mhlo.dot_general"(%1039, %1035) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1041 = chlo.broadcast_multiply %1040, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1042 = chlo.broadcast_add %1041, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Max-reduction of %1042 along dimension 3:
  // tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
  // %4 is the reduction's init value; it is defined outside this view
  // (presumably -inf / lowest f32 -- TODO confirm). The result is expanded
  // to 1x4x384x1 and subtracted from %1042 before the exponential below.
  %1043 = "mhlo.reduce"(%1042, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    // Combiner: maximum of the two scalar operands.
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1044 = linalg.tensor_expand_shape %1043 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1045 = chlo.broadcast_subtract %1042, %1044 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1046 = "mhlo.exponential"(%1045) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  // Add-reduction of %1046 (the mhlo.exponential result above) along
  // dimension 3: tensor<1x4x384x384xf32> -> tensor<1x4x384xf32>.
  // %5 is the reduction's init value; it is defined outside this view
  // (presumably 0.0 -- TODO confirm). The result is expanded to 1x4x384x1
  // and used as the divisor of %1046 in the ops that follow.
  %1047 = "mhlo.reduce"(%1046, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    // Combiner: add the two scalar operands.
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1048 = linalg.tensor_expand_shape %1047 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1049 = chlo.broadcast_divide %1046, %1048 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1050 = "mhlo.dot_general"(%1049, %1024) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1051 = "mhlo.transpose"(%1050) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1052 = "mhlo.reshape"(%1051) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1053 = "mhlo.dot"(%1052, %cst_98) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1054 = chlo.broadcast_add %1053, %cst_97 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1055 = "mhlo.reshape"(%1054) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1056 = "mhlo.dot"(%1020, %cst_111) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1057 = chlo.broadcast_add %1056, %cst_110 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1058 = "mhlo.reshape"(%1057) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1059 = chlo.broadcast_multiply %1058, %cst_109 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1060 = chlo.broadcast_add %1059, %cst_97 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1061 = chlo.broadcast_add %1055, %1060 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1062 = chlo.broadcast_multiply %1061, %cst_96 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1063 = chlo.broadcast_add %1062, %cst_95 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1064 = "mhlo.reshape"(%1063) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1065 = "mhlo.dot"(%1064, %cst_113) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1066 = chlo.broadcast_add %1065, %cst_112 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1067 = "mhlo.reshape"(%1066) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1068 = chlo.broadcast_maximum %1067, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1069 = "mhlo.reshape"(%1068) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1070 = "mhlo.dot"(%1069, %cst_117) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1071 = chlo.broadcast_add %1070, %cst_116 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1072 = "mhlo.reshape"(%1071) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1073 = chlo.broadcast_add %1072, %1063 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1074 = chlo.broadcast_multiply %1073, %cst_115 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1075 = chlo.broadcast_add %1074, %cst_114 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1076 = "mhlo.reshape"(%1075) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1077 = "mhlo.dot"(%1076, %cst_119) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1078 = chlo.broadcast_add %1077, %cst_118 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1079 = "mhlo.reshape"(%1078) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1080 = chlo.broadcast_maximum %1079, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1081 = "mhlo.reshape"(%1080) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1082 = "mhlo.dot"(%1081, %cst_123) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1083 = chlo.broadcast_add %1082, %cst_122 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1084 = "mhlo.reshape"(%1083) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1085 = chlo.broadcast_add %1084, %1075 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1086 = chlo.broadcast_multiply %1085, %cst_121 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1087 = chlo.broadcast_add %1086, %cst_120 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1088 = "mhlo.reshape"(%1087) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1089 = "mhlo.dot"(%1088, %cst_125) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1090 = chlo.broadcast_add %1089, %cst_124 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1091 = "mhlo.reshape"(%1090) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1092 = chlo.broadcast_maximum %1091, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1093 = "mhlo.reshape"(%1092) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1094 = "mhlo.dot"(%1093, %cst_129) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1095 = chlo.broadcast_add %1094, %cst_128 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1096 = "mhlo.reshape"(%1095) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1097 = chlo.broadcast_add %1096, %1087 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1098 = chlo.broadcast_multiply %1097, %cst_127 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1099 = chlo.broadcast_add %1098, %cst_126 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1100 = "mhlo.reshape"(%1099) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1101 = "mhlo.dot"(%1100, %cst_131) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1102 = chlo.broadcast_add %1101, %cst_130 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1103 = "mhlo.reshape"(%1102) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1104 = chlo.broadcast_maximum %1103, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1105 = "mhlo.reshape"(%1104) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1106 = "mhlo.dot"(%1105, %cst_139) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1107 = chlo.broadcast_add %1106, %cst_138 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1108 = "mhlo.reshape"(%1107) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1109 = chlo.broadcast_add %1108, %1099 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1110 = chlo.broadcast_multiply %1109, %cst_133 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1111 = chlo.broadcast_add %1110, %cst_132 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1112 = "mhlo.reshape"(%1111) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1113 = "mhlo.dot"(%1112, %cst_137) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1114 = chlo.broadcast_add %1113, %cst_136 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1115 = "mhlo.reshape"(%1114) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1116 = chlo.broadcast_add %1115, %1019 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1117 = chlo.broadcast_multiply %1116, %cst_135 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1118 = chlo.broadcast_add %1117, %cst_134 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1119 = "mhlo.reshape"(%1118) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1120 = "mhlo.dot"(%1119, %cst_149) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1121 = chlo.broadcast_add %1120, %cst_148 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1122 = "mhlo.reshape"(%1121) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1123 = "mhlo.transpose"(%1122) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1124 = "mhlo.dot"(%1119, %cst_153) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1125 = "mhlo.reshape"(%1124) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1126 = "mhlo.broadcast_in_dim"(%cst_152) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1127 = mhlo.add %1125, %1126 : tensor<1x384x128xf32>
  %1128 = chlo.broadcast_multiply %1127, %cst_151 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1129 = chlo.broadcast_add %1128, %cst_150 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1130 = "mhlo.reshape"(%1129) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1131 = "mhlo.dot"(%1130, %cst_145) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1132 = chlo.broadcast_add %1131, %cst_144 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1133 = "mhlo.reshape"(%1132) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1134 = "mhlo.transpose"(%1133) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1135 = "mhlo.dot"(%1130, %cst_147) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1136 = chlo.broadcast_add %1135, %cst_146 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1137 = "mhlo.reshape"(%1136) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1138 = "mhlo.transpose"(%1137) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1139 = "mhlo.dot_general"(%1138, %1134) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1140 = chlo.broadcast_multiply %1139, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1141 = chlo.broadcast_add %1140, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1142 = "mhlo.reduce"(%1141, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1143 = linalg.tensor_expand_shape %1142 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1144 = chlo.broadcast_subtract %1141, %1143 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1145 = "mhlo.exponential"(%1144) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1146 = "mhlo.reduce"(%1145, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1147 = linalg.tensor_expand_shape %1146 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1148 = chlo.broadcast_divide %1145, %1147 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1149 = "mhlo.dot_general"(%1148, %1123) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1150 = "mhlo.transpose"(%1149) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1151 = "mhlo.reshape"(%1150) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1152 = "mhlo.dot"(%1151, %cst_143) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1153 = chlo.broadcast_add %1152, %cst_142 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1154 = "mhlo.reshape"(%1153) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1155 = "mhlo.dot"(%1119, %cst_156) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1156 = chlo.broadcast_add %1155, %cst_155 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1157 = "mhlo.reshape"(%1156) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1158 = chlo.broadcast_multiply %1157, %cst_154 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1159 = chlo.broadcast_add %1158, %cst_142 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1160 = chlo.broadcast_add %1154, %1159 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1161 = chlo.broadcast_multiply %1160, %cst_141 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1162 = chlo.broadcast_add %1161, %cst_140 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1163 = "mhlo.reshape"(%1162) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1164 = "mhlo.dot"(%1163, %cst_158) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1165 = chlo.broadcast_add %1164, %cst_157 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1166 = "mhlo.reshape"(%1165) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1167 = chlo.broadcast_maximum %1166, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1168 = "mhlo.reshape"(%1167) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1169 = "mhlo.dot"(%1168, %cst_162) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1170 = chlo.broadcast_add %1169, %cst_161 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1171 = "mhlo.reshape"(%1170) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1172 = chlo.broadcast_add %1171, %1162 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1173 = chlo.broadcast_multiply %1172, %cst_160 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1174 = chlo.broadcast_add %1173, %cst_159 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1175 = "mhlo.reshape"(%1174) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1176 = "mhlo.dot"(%1175, %cst_164) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1177 = chlo.broadcast_add %1176, %cst_163 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1178 = "mhlo.reshape"(%1177) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1179 = chlo.broadcast_maximum %1178, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1180 = "mhlo.reshape"(%1179) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1181 = "mhlo.dot"(%1180, %cst_168) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1182 = chlo.broadcast_add %1181, %cst_167 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1183 = "mhlo.reshape"(%1182) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1184 = chlo.broadcast_add %1183, %1174 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1185 = chlo.broadcast_multiply %1184, %cst_166 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1186 = chlo.broadcast_add %1185, %cst_165 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1187 = "mhlo.reshape"(%1186) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1188 = "mhlo.dot"(%1187, %cst_170) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1189 = chlo.broadcast_add %1188, %cst_169 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1190 = "mhlo.reshape"(%1189) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1191 = chlo.broadcast_maximum %1190, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1192 = "mhlo.reshape"(%1191) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1193 = "mhlo.dot"(%1192, %cst_174) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1194 = chlo.broadcast_add %1193, %cst_173 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1195 = "mhlo.reshape"(%1194) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1196 = chlo.broadcast_add %1195, %1186 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1197 = chlo.broadcast_multiply %1196, %cst_172 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1198 = chlo.broadcast_add %1197, %cst_171 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1199 = "mhlo.reshape"(%1198) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1200 = "mhlo.dot"(%1199, %cst_176) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1201 = chlo.broadcast_add %1200, %cst_175 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1202 = "mhlo.reshape"(%1201) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1203 = chlo.broadcast_maximum %1202, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1204 = "mhlo.reshape"(%1203) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1205 = "mhlo.dot"(%1204, %cst_184) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1206 = chlo.broadcast_add %1205, %cst_183 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1207 = "mhlo.reshape"(%1206) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1208 = chlo.broadcast_add %1207, %1198 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1209 = chlo.broadcast_multiply %1208, %cst_178 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1210 = chlo.broadcast_add %1209, %cst_177 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1211 = "mhlo.reshape"(%1210) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1212 = "mhlo.dot"(%1211, %cst_182) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1213 = chlo.broadcast_add %1212, %cst_181 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1214 = "mhlo.reshape"(%1213) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1215 = chlo.broadcast_add %1214, %1118 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1216 = chlo.broadcast_multiply %1215, %cst_180 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1217 = chlo.broadcast_add %1216, %cst_179 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1218 = "mhlo.reshape"(%1217) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1219 = "mhlo.dot"(%1218, %cst_194) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1220 = chlo.broadcast_add %1219, %cst_193 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1221 = "mhlo.reshape"(%1220) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1222 = "mhlo.transpose"(%1221) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1223 = "mhlo.dot"(%1218, %cst_198) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1224 = "mhlo.reshape"(%1223) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1225 = "mhlo.broadcast_in_dim"(%cst_197) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1226 = mhlo.add %1224, %1225 : tensor<1x384x128xf32>
  %1227 = chlo.broadcast_multiply %1226, %cst_196 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1228 = chlo.broadcast_add %1227, %cst_195 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1229 = "mhlo.reshape"(%1228) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1230 = "mhlo.dot"(%1229, %cst_190) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1231 = chlo.broadcast_add %1230, %cst_189 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1232 = "mhlo.reshape"(%1231) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1233 = "mhlo.transpose"(%1232) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1234 = "mhlo.dot"(%1229, %cst_192) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1235 = chlo.broadcast_add %1234, %cst_191 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1236 = "mhlo.reshape"(%1235) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1237 = "mhlo.transpose"(%1236) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1238 = "mhlo.dot_general"(%1237, %1233) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1239 = chlo.broadcast_multiply %1238, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1240 = chlo.broadcast_add %1239, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1241 = "mhlo.reduce"(%1240, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1242 = linalg.tensor_expand_shape %1241 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1243 = chlo.broadcast_subtract %1240, %1242 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1244 = "mhlo.exponential"(%1243) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1245 = "mhlo.reduce"(%1244, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1246 = linalg.tensor_expand_shape %1245 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1247 = chlo.broadcast_divide %1244, %1246 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1248 = "mhlo.dot_general"(%1247, %1222) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1249 = "mhlo.transpose"(%1248) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1250 = "mhlo.reshape"(%1249) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1251 = "mhlo.dot"(%1250, %cst_188) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1252 = chlo.broadcast_add %1251, %cst_187 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1253 = "mhlo.reshape"(%1252) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1254 = "mhlo.dot"(%1218, %cst_201) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1255 = chlo.broadcast_add %1254, %cst_200 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1256 = "mhlo.reshape"(%1255) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1257 = chlo.broadcast_multiply %1256, %cst_199 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1258 = chlo.broadcast_add %1257, %cst_187 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1259 = chlo.broadcast_add %1253, %1258 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1260 = chlo.broadcast_multiply %1259, %cst_186 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1261 = chlo.broadcast_add %1260, %cst_185 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1262 = "mhlo.reshape"(%1261) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1263 = "mhlo.dot"(%1262, %cst_203) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1264 = chlo.broadcast_add %1263, %cst_202 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1265 = "mhlo.reshape"(%1264) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1266 = chlo.broadcast_maximum %1265, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1267 = "mhlo.reshape"(%1266) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1268 = "mhlo.dot"(%1267, %cst_207) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1269 = chlo.broadcast_add %1268, %cst_206 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1270 = "mhlo.reshape"(%1269) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1271 = chlo.broadcast_add %1270, %1261 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1272 = chlo.broadcast_multiply %1271, %cst_205 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1273 = chlo.broadcast_add %1272, %cst_204 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1274 = "mhlo.reshape"(%1273) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1275 = "mhlo.dot"(%1274, %cst_209) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1276 = chlo.broadcast_add %1275, %cst_208 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1277 = "mhlo.reshape"(%1276) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1278 = chlo.broadcast_maximum %1277, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1279 = "mhlo.reshape"(%1278) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1280 = "mhlo.dot"(%1279, %cst_213) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1281 = chlo.broadcast_add %1280, %cst_212 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1282 = "mhlo.reshape"(%1281) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1283 = chlo.broadcast_add %1282, %1273 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1284 = chlo.broadcast_multiply %1283, %cst_211 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1285 = chlo.broadcast_add %1284, %cst_210 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1286 = "mhlo.reshape"(%1285) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1287 = "mhlo.dot"(%1286, %cst_215) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1288 = chlo.broadcast_add %1287, %cst_214 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1289 = "mhlo.reshape"(%1288) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1290 = chlo.broadcast_maximum %1289, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1291 = "mhlo.reshape"(%1290) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1292 = "mhlo.dot"(%1291, %cst_219) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1293 = chlo.broadcast_add %1292, %cst_218 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1294 = "mhlo.reshape"(%1293) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1295 = chlo.broadcast_add %1294, %1285 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1296 = chlo.broadcast_multiply %1295, %cst_217 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1297 = chlo.broadcast_add %1296, %cst_216 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1298 = "mhlo.reshape"(%1297) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1299 = "mhlo.dot"(%1298, %cst_221) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1300 = chlo.broadcast_add %1299, %cst_220 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1301 = "mhlo.reshape"(%1300) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1302 = chlo.broadcast_maximum %1301, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1303 = "mhlo.reshape"(%1302) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1304 = "mhlo.dot"(%1303, %cst_229) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1305 = chlo.broadcast_add %1304, %cst_228 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1306 = "mhlo.reshape"(%1305) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1307 = chlo.broadcast_add %1306, %1297 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1308 = chlo.broadcast_multiply %1307, %cst_223 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1309 = chlo.broadcast_add %1308, %cst_222 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1310 = "mhlo.reshape"(%1309) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1311 = "mhlo.dot"(%1310, %cst_227) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1312 = chlo.broadcast_add %1311, %cst_226 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1313 = "mhlo.reshape"(%1312) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1314 = chlo.broadcast_add %1313, %1217 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1315 = chlo.broadcast_multiply %1314, %cst_225 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1316 = chlo.broadcast_add %1315, %cst_224 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1317 = "mhlo.reshape"(%1316) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1318 = "mhlo.dot"(%1317, %cst_239) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1319 = chlo.broadcast_add %1318, %cst_238 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1320 = "mhlo.reshape"(%1319) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1321 = "mhlo.transpose"(%1320) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1322 = "mhlo.dot"(%1317, %cst_243) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1323 = "mhlo.reshape"(%1322) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1324 = "mhlo.broadcast_in_dim"(%cst_242) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1325 = mhlo.add %1323, %1324 : tensor<1x384x128xf32>
  %1326 = chlo.broadcast_multiply %1325, %cst_241 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1327 = chlo.broadcast_add %1326, %cst_240 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1328 = "mhlo.reshape"(%1327) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1329 = "mhlo.dot"(%1328, %cst_235) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1330 = chlo.broadcast_add %1329, %cst_234 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1331 = "mhlo.reshape"(%1330) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1332 = "mhlo.transpose"(%1331) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1333 = "mhlo.dot"(%1328, %cst_237) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1334 = chlo.broadcast_add %1333, %cst_236 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1335 = "mhlo.reshape"(%1334) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1336 = "mhlo.transpose"(%1335) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1337 = "mhlo.dot_general"(%1336, %1332) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1338 = chlo.broadcast_multiply %1337, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1339 = chlo.broadcast_add %1338, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1340 = "mhlo.reduce"(%1339, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1341 = linalg.tensor_expand_shape %1340 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1342 = chlo.broadcast_subtract %1339, %1341 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1343 = "mhlo.exponential"(%1342) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1344 = "mhlo.reduce"(%1343, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1345 = linalg.tensor_expand_shape %1344 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1346 = chlo.broadcast_divide %1343, %1345 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1347 = "mhlo.dot_general"(%1346, %1321) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1348 = "mhlo.transpose"(%1347) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1349 = "mhlo.reshape"(%1348) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1350 = "mhlo.dot"(%1349, %cst_233) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1351 = chlo.broadcast_add %1350, %cst_232 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1352 = "mhlo.reshape"(%1351) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1353 = "mhlo.dot"(%1317, %cst_246) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1354 = chlo.broadcast_add %1353, %cst_245 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1355 = "mhlo.reshape"(%1354) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1356 = chlo.broadcast_multiply %1355, %cst_244 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1357 = chlo.broadcast_add %1356, %cst_232 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1358 = chlo.broadcast_add %1352, %1357 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1359 = chlo.broadcast_multiply %1358, %cst_231 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1360 = chlo.broadcast_add %1359, %cst_230 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1361 = "mhlo.reshape"(%1360) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1362 = "mhlo.dot"(%1361, %cst_248) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1363 = chlo.broadcast_add %1362, %cst_247 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1364 = "mhlo.reshape"(%1363) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1365 = chlo.broadcast_maximum %1364, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1366 = "mhlo.reshape"(%1365) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1367 = "mhlo.dot"(%1366, %cst_252) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1368 = chlo.broadcast_add %1367, %cst_251 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1369 = "mhlo.reshape"(%1368) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1370 = chlo.broadcast_add %1369, %1360 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1371 = chlo.broadcast_multiply %1370, %cst_250 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1372 = chlo.broadcast_add %1371, %cst_249 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1373 = "mhlo.reshape"(%1372) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1374 = "mhlo.dot"(%1373, %cst_254) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1375 = chlo.broadcast_add %1374, %cst_253 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1376 = "mhlo.reshape"(%1375) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1377 = chlo.broadcast_maximum %1376, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1378 = "mhlo.reshape"(%1377) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1379 = "mhlo.dot"(%1378, %cst_258) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1380 = chlo.broadcast_add %1379, %cst_257 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1381 = "mhlo.reshape"(%1380) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1382 = chlo.broadcast_add %1381, %1372 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1383 = chlo.broadcast_multiply %1382, %cst_256 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1384 = chlo.broadcast_add %1383, %cst_255 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1385 = "mhlo.reshape"(%1384) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1386 = "mhlo.dot"(%1385, %cst_260) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1387 = chlo.broadcast_add %1386, %cst_259 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1388 = "mhlo.reshape"(%1387) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1389 = chlo.broadcast_maximum %1388, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1390 = "mhlo.reshape"(%1389) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1391 = "mhlo.dot"(%1390, %cst_264) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1392 = chlo.broadcast_add %1391, %cst_263 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1393 = "mhlo.reshape"(%1392) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1394 = chlo.broadcast_add %1393, %1384 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1395 = chlo.broadcast_multiply %1394, %cst_262 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1396 = chlo.broadcast_add %1395, %cst_261 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1397 = "mhlo.reshape"(%1396) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1398 = "mhlo.dot"(%1397, %cst_266) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1399 = chlo.broadcast_add %1398, %cst_265 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1400 = "mhlo.reshape"(%1399) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1401 = chlo.broadcast_maximum %1400, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1402 = "mhlo.reshape"(%1401) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1403 = "mhlo.dot"(%1402, %cst_274) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1404 = chlo.broadcast_add %1403, %cst_273 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1405 = "mhlo.reshape"(%1404) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1406 = chlo.broadcast_add %1405, %1396 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1407 = chlo.broadcast_multiply %1406, %cst_268 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1408 = chlo.broadcast_add %1407, %cst_267 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1409 = "mhlo.reshape"(%1408) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1410 = "mhlo.dot"(%1409, %cst_272) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1411 = chlo.broadcast_add %1410, %cst_271 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1412 = "mhlo.reshape"(%1411) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1413 = chlo.broadcast_add %1412, %1316 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1414 = chlo.broadcast_multiply %1413, %cst_270 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1415 = chlo.broadcast_add %1414, %cst_269 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1416 = "mhlo.reshape"(%1415) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1417 = "mhlo.dot"(%1416, %cst_284) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1418 = chlo.broadcast_add %1417, %cst_283 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1419 = "mhlo.reshape"(%1418) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1420 = "mhlo.transpose"(%1419) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1421 = "mhlo.dot"(%1416, %cst_288) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1422 = "mhlo.reshape"(%1421) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1423 = "mhlo.broadcast_in_dim"(%cst_287) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1424 = mhlo.add %1422, %1423 : tensor<1x384x128xf32>
  %1425 = chlo.broadcast_multiply %1424, %cst_286 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1426 = chlo.broadcast_add %1425, %cst_285 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1427 = "mhlo.reshape"(%1426) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1428 = "mhlo.dot"(%1427, %cst_280) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1429 = chlo.broadcast_add %1428, %cst_279 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1430 = "mhlo.reshape"(%1429) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1431 = "mhlo.transpose"(%1430) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1432 = "mhlo.dot"(%1427, %cst_282) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1433 = chlo.broadcast_add %1432, %cst_281 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1434 = "mhlo.reshape"(%1433) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1435 = "mhlo.transpose"(%1434) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1436 = "mhlo.dot_general"(%1435, %1431) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1437 = chlo.broadcast_multiply %1436, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1438 = chlo.broadcast_add %1437, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1439 = "mhlo.reduce"(%1438, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1440 = linalg.tensor_expand_shape %1439 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1441 = chlo.broadcast_subtract %1438, %1440 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1442 = "mhlo.exponential"(%1441) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1443 = "mhlo.reduce"(%1442, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1444 = linalg.tensor_expand_shape %1443 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1445 = chlo.broadcast_divide %1442, %1444 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1446 = "mhlo.dot_general"(%1445, %1420) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1447 = "mhlo.transpose"(%1446) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1448 = "mhlo.reshape"(%1447) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1449 = "mhlo.dot"(%1448, %cst_278) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1450 = chlo.broadcast_add %1449, %cst_277 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1451 = "mhlo.reshape"(%1450) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1452 = "mhlo.dot"(%1416, %cst_291) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1453 = chlo.broadcast_add %1452, %cst_290 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1454 = "mhlo.reshape"(%1453) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1455 = chlo.broadcast_multiply %1454, %cst_289 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1456 = chlo.broadcast_add %1455, %cst_277 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1457 = chlo.broadcast_add %1451, %1456 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1458 = chlo.broadcast_multiply %1457, %cst_276 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1459 = chlo.broadcast_add %1458, %cst_275 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1460 = "mhlo.reshape"(%1459) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1461 = "mhlo.dot"(%1460, %cst_293) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1462 = chlo.broadcast_add %1461, %cst_292 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1463 = "mhlo.reshape"(%1462) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1464 = chlo.broadcast_maximum %1463, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1465 = "mhlo.reshape"(%1464) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1466 = "mhlo.dot"(%1465, %cst_297) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1467 = chlo.broadcast_add %1466, %cst_296 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1468 = "mhlo.reshape"(%1467) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1469 = chlo.broadcast_add %1468, %1459 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1470 = chlo.broadcast_multiply %1469, %cst_295 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1471 = chlo.broadcast_add %1470, %cst_294 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1472 = "mhlo.reshape"(%1471) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1473 = "mhlo.dot"(%1472, %cst_299) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1474 = chlo.broadcast_add %1473, %cst_298 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1475 = "mhlo.reshape"(%1474) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1476 = chlo.broadcast_maximum %1475, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1477 = "mhlo.reshape"(%1476) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1478 = "mhlo.dot"(%1477, %cst_303) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1479 = chlo.broadcast_add %1478, %cst_302 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1480 = "mhlo.reshape"(%1479) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1481 = chlo.broadcast_add %1480, %1471 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1482 = chlo.broadcast_multiply %1481, %cst_301 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1483 = chlo.broadcast_add %1482, %cst_300 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1484 = "mhlo.reshape"(%1483) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1485 = "mhlo.dot"(%1484, %cst_305) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1486 = chlo.broadcast_add %1485, %cst_304 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1487 = "mhlo.reshape"(%1486) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1488 = chlo.broadcast_maximum %1487, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1489 = "mhlo.reshape"(%1488) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1490 = "mhlo.dot"(%1489, %cst_309) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1491 = chlo.broadcast_add %1490, %cst_308 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1492 = "mhlo.reshape"(%1491) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1493 = chlo.broadcast_add %1492, %1483 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1494 = chlo.broadcast_multiply %1493, %cst_307 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1495 = chlo.broadcast_add %1494, %cst_306 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1496 = "mhlo.reshape"(%1495) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1497 = "mhlo.dot"(%1496, %cst_311) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1498 = chlo.broadcast_add %1497, %cst_310 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1499 = "mhlo.reshape"(%1498) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1500 = chlo.broadcast_maximum %1499, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1501 = "mhlo.reshape"(%1500) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1502 = "mhlo.dot"(%1501, %cst_319) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1503 = chlo.broadcast_add %1502, %cst_318 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1504 = "mhlo.reshape"(%1503) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1505 = chlo.broadcast_add %1504, %1495 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1506 = chlo.broadcast_multiply %1505, %cst_313 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1507 = chlo.broadcast_add %1506, %cst_312 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1508 = "mhlo.reshape"(%1507) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1509 = "mhlo.dot"(%1508, %cst_317) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1510 = chlo.broadcast_add %1509, %cst_316 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1511 = "mhlo.reshape"(%1510) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1512 = chlo.broadcast_add %1511, %1415 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1513 = chlo.broadcast_multiply %1512, %cst_315 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1514 = chlo.broadcast_add %1513, %cst_314 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1515 = "mhlo.reshape"(%1514) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1516 = "mhlo.dot"(%1515, %cst_329) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1517 = chlo.broadcast_add %1516, %cst_328 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1518 = "mhlo.reshape"(%1517) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1519 = "mhlo.transpose"(%1518) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1520 = "mhlo.dot"(%1515, %cst_333) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1521 = "mhlo.reshape"(%1520) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1522 = "mhlo.broadcast_in_dim"(%cst_332) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1523 = mhlo.add %1521, %1522 : tensor<1x384x128xf32>
  %1524 = chlo.broadcast_multiply %1523, %cst_331 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1525 = chlo.broadcast_add %1524, %cst_330 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1526 = "mhlo.reshape"(%1525) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1527 = "mhlo.dot"(%1526, %cst_325) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1528 = chlo.broadcast_add %1527, %cst_324 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1529 = "mhlo.reshape"(%1528) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1530 = "mhlo.transpose"(%1529) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1531 = "mhlo.dot"(%1526, %cst_327) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1532 = chlo.broadcast_add %1531, %cst_326 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1533 = "mhlo.reshape"(%1532) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1534 = "mhlo.transpose"(%1533) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1535 = "mhlo.dot_general"(%1534, %1530) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1536 = chlo.broadcast_multiply %1535, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1537 = chlo.broadcast_add %1536, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1538 = "mhlo.reduce"(%1537, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1539 = linalg.tensor_expand_shape %1538 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1540 = chlo.broadcast_subtract %1537, %1539 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1541 = "mhlo.exponential"(%1540) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1542 = "mhlo.reduce"(%1541, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1543 = linalg.tensor_expand_shape %1542 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1544 = chlo.broadcast_divide %1541, %1543 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1545 = "mhlo.dot_general"(%1544, %1519) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1546 = "mhlo.transpose"(%1545) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1547 = "mhlo.reshape"(%1546) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1548 = "mhlo.dot"(%1547, %cst_323) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1549 = chlo.broadcast_add %1548, %cst_322 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1550 = "mhlo.reshape"(%1549) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1551 = "mhlo.dot"(%1515, %cst_336) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1552 = chlo.broadcast_add %1551, %cst_335 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1553 = "mhlo.reshape"(%1552) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1554 = chlo.broadcast_multiply %1553, %cst_334 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1555 = chlo.broadcast_add %1554, %cst_322 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1556 = chlo.broadcast_add %1550, %1555 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1557 = chlo.broadcast_multiply %1556, %cst_321 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1558 = chlo.broadcast_add %1557, %cst_320 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1559 = "mhlo.reshape"(%1558) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1560 = "mhlo.dot"(%1559, %cst_338) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1561 = chlo.broadcast_add %1560, %cst_337 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1562 = "mhlo.reshape"(%1561) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1563 = chlo.broadcast_maximum %1562, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1564 = "mhlo.reshape"(%1563) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1565 = "mhlo.dot"(%1564, %cst_342) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1566 = chlo.broadcast_add %1565, %cst_341 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1567 = "mhlo.reshape"(%1566) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1568 = chlo.broadcast_add %1567, %1558 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1569 = chlo.broadcast_multiply %1568, %cst_340 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1570 = chlo.broadcast_add %1569, %cst_339 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1571 = "mhlo.reshape"(%1570) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1572 = "mhlo.dot"(%1571, %cst_344) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1573 = chlo.broadcast_add %1572, %cst_343 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1574 = "mhlo.reshape"(%1573) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1575 = chlo.broadcast_maximum %1574, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1576 = "mhlo.reshape"(%1575) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1577 = "mhlo.dot"(%1576, %cst_348) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1578 = chlo.broadcast_add %1577, %cst_347 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1579 = "mhlo.reshape"(%1578) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1580 = chlo.broadcast_add %1579, %1570 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1581 = chlo.broadcast_multiply %1580, %cst_346 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1582 = chlo.broadcast_add %1581, %cst_345 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1583 = "mhlo.reshape"(%1582) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1584 = "mhlo.dot"(%1583, %cst_350) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1585 = chlo.broadcast_add %1584, %cst_349 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1586 = "mhlo.reshape"(%1585) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1587 = chlo.broadcast_maximum %1586, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1588 = "mhlo.reshape"(%1587) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1589 = "mhlo.dot"(%1588, %cst_354) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1590 = chlo.broadcast_add %1589, %cst_353 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1591 = "mhlo.reshape"(%1590) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1592 = chlo.broadcast_add %1591, %1582 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1593 = chlo.broadcast_multiply %1592, %cst_352 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1594 = chlo.broadcast_add %1593, %cst_351 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1595 = "mhlo.reshape"(%1594) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1596 = "mhlo.dot"(%1595, %cst_356) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1597 = chlo.broadcast_add %1596, %cst_355 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1598 = "mhlo.reshape"(%1597) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1599 = chlo.broadcast_maximum %1598, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1600 = "mhlo.reshape"(%1599) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1601 = "mhlo.dot"(%1600, %cst_364) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1602 = chlo.broadcast_add %1601, %cst_363 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1603 = "mhlo.reshape"(%1602) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1604 = chlo.broadcast_add %1603, %1594 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1605 = chlo.broadcast_multiply %1604, %cst_358 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1606 = chlo.broadcast_add %1605, %cst_357 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1607 = "mhlo.reshape"(%1606) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1608 = "mhlo.dot"(%1607, %cst_362) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1609 = chlo.broadcast_add %1608, %cst_361 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1610 = "mhlo.reshape"(%1609) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1611 = chlo.broadcast_add %1610, %1514 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1612 = chlo.broadcast_multiply %1611, %cst_360 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1613 = chlo.broadcast_add %1612, %cst_359 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1614 = "mhlo.reshape"(%1613) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1615 = "mhlo.dot"(%1614, %cst_374) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1616 = chlo.broadcast_add %1615, %cst_373 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1617 = "mhlo.reshape"(%1616) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1618 = "mhlo.transpose"(%1617) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1619 = "mhlo.dot"(%1614, %cst_378) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1620 = "mhlo.reshape"(%1619) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1621 = "mhlo.broadcast_in_dim"(%cst_377) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1622 = mhlo.add %1620, %1621 : tensor<1x384x128xf32>
  %1623 = chlo.broadcast_multiply %1622, %cst_376 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1624 = chlo.broadcast_add %1623, %cst_375 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1625 = "mhlo.reshape"(%1624) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1626 = "mhlo.dot"(%1625, %cst_370) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1627 = chlo.broadcast_add %1626, %cst_369 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1628 = "mhlo.reshape"(%1627) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1629 = "mhlo.transpose"(%1628) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1630 = "mhlo.dot"(%1625, %cst_372) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1631 = chlo.broadcast_add %1630, %cst_371 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1632 = "mhlo.reshape"(%1631) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1633 = "mhlo.transpose"(%1632) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1634 = "mhlo.dot_general"(%1633, %1629) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1635 = chlo.broadcast_multiply %1634, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1636 = chlo.broadcast_add %1635, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1637 = "mhlo.reduce"(%1636, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1638 = linalg.tensor_expand_shape %1637 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1639 = chlo.broadcast_subtract %1636, %1638 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1640 = "mhlo.exponential"(%1639) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1641 = "mhlo.reduce"(%1640, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1642 = linalg.tensor_expand_shape %1641 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1643 = chlo.broadcast_divide %1640, %1642 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1644 = "mhlo.dot_general"(%1643, %1618) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1645 = "mhlo.transpose"(%1644) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1646 = "mhlo.reshape"(%1645) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1647 = "mhlo.dot"(%1646, %cst_368) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1648 = chlo.broadcast_add %1647, %cst_367 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1649 = "mhlo.reshape"(%1648) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1650 = "mhlo.dot"(%1614, %cst_381) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1651 = chlo.broadcast_add %1650, %cst_380 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1652 = "mhlo.reshape"(%1651) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1653 = chlo.broadcast_multiply %1652, %cst_379 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1654 = chlo.broadcast_add %1653, %cst_367 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1655 = chlo.broadcast_add %1649, %1654 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1656 = chlo.broadcast_multiply %1655, %cst_366 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1657 = chlo.broadcast_add %1656, %cst_365 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1658 = "mhlo.reshape"(%1657) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1659 = "mhlo.dot"(%1658, %cst_383) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1660 = chlo.broadcast_add %1659, %cst_382 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1661 = "mhlo.reshape"(%1660) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1662 = chlo.broadcast_maximum %1661, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1663 = "mhlo.reshape"(%1662) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1664 = "mhlo.dot"(%1663, %cst_387) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1665 = chlo.broadcast_add %1664, %cst_386 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1666 = "mhlo.reshape"(%1665) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1667 = chlo.broadcast_add %1666, %1657 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1668 = chlo.broadcast_multiply %1667, %cst_385 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1669 = chlo.broadcast_add %1668, %cst_384 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1670 = "mhlo.reshape"(%1669) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1671 = "mhlo.dot"(%1670, %cst_389) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1672 = chlo.broadcast_add %1671, %cst_388 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1673 = "mhlo.reshape"(%1672) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1674 = chlo.broadcast_maximum %1673, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1675 = "mhlo.reshape"(%1674) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1676 = "mhlo.dot"(%1675, %cst_393) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1677 = chlo.broadcast_add %1676, %cst_392 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1678 = "mhlo.reshape"(%1677) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1679 = chlo.broadcast_add %1678, %1669 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1680 = chlo.broadcast_multiply %1679, %cst_391 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1681 = chlo.broadcast_add %1680, %cst_390 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1682 = "mhlo.reshape"(%1681) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1683 = "mhlo.dot"(%1682, %cst_395) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1684 = chlo.broadcast_add %1683, %cst_394 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1685 = "mhlo.reshape"(%1684) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1686 = chlo.broadcast_maximum %1685, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1687 = "mhlo.reshape"(%1686) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1688 = "mhlo.dot"(%1687, %cst_399) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1689 = chlo.broadcast_add %1688, %cst_398 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1690 = "mhlo.reshape"(%1689) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1691 = chlo.broadcast_add %1690, %1681 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1692 = chlo.broadcast_multiply %1691, %cst_397 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1693 = chlo.broadcast_add %1692, %cst_396 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1694 = "mhlo.reshape"(%1693) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1695 = "mhlo.dot"(%1694, %cst_401) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1696 = chlo.broadcast_add %1695, %cst_400 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1697 = "mhlo.reshape"(%1696) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1698 = chlo.broadcast_maximum %1697, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1699 = "mhlo.reshape"(%1698) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1700 = "mhlo.dot"(%1699, %cst_409) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1701 = chlo.broadcast_add %1700, %cst_408 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1702 = "mhlo.reshape"(%1701) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1703 = chlo.broadcast_add %1702, %1693 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1704 = chlo.broadcast_multiply %1703, %cst_403 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1705 = chlo.broadcast_add %1704, %cst_402 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1706 = "mhlo.reshape"(%1705) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1707 = "mhlo.dot"(%1706, %cst_407) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1708 = chlo.broadcast_add %1707, %cst_406 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1709 = "mhlo.reshape"(%1708) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1710 = chlo.broadcast_add %1709, %1613 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1711 = chlo.broadcast_multiply %1710, %cst_405 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1712 = chlo.broadcast_add %1711, %cst_404 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1713 = "mhlo.reshape"(%1712) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1714 = "mhlo.dot"(%1713, %cst_419) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1715 = chlo.broadcast_add %1714, %cst_418 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1716 = "mhlo.reshape"(%1715) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1717 = "mhlo.transpose"(%1716) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1718 = "mhlo.dot"(%1713, %cst_423) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1719 = "mhlo.reshape"(%1718) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1720 = "mhlo.broadcast_in_dim"(%cst_422) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1721 = mhlo.add %1719, %1720 : tensor<1x384x128xf32>
  %1722 = chlo.broadcast_multiply %1721, %cst_421 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1723 = chlo.broadcast_add %1722, %cst_420 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1724 = "mhlo.reshape"(%1723) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1725 = "mhlo.dot"(%1724, %cst_415) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1726 = chlo.broadcast_add %1725, %cst_414 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1727 = "mhlo.reshape"(%1726) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1728 = "mhlo.transpose"(%1727) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1729 = "mhlo.dot"(%1724, %cst_417) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1730 = chlo.broadcast_add %1729, %cst_416 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1731 = "mhlo.reshape"(%1730) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1732 = "mhlo.transpose"(%1731) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1733 = "mhlo.dot_general"(%1732, %1728) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1734 = chlo.broadcast_multiply %1733, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1735 = chlo.broadcast_add %1734, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1736 = "mhlo.reduce"(%1735, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1737 = linalg.tensor_expand_shape %1736 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1738 = chlo.broadcast_subtract %1735, %1737 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1739 = "mhlo.exponential"(%1738) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1740 = "mhlo.reduce"(%1739, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1741 = linalg.tensor_expand_shape %1740 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1742 = chlo.broadcast_divide %1739, %1741 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1743 = "mhlo.dot_general"(%1742, %1717) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1744 = "mhlo.transpose"(%1743) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1745 = "mhlo.reshape"(%1744) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1746 = "mhlo.dot"(%1745, %cst_413) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1747 = chlo.broadcast_add %1746, %cst_412 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1748 = "mhlo.reshape"(%1747) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1749 = "mhlo.dot"(%1713, %cst_426) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1750 = chlo.broadcast_add %1749, %cst_425 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1751 = "mhlo.reshape"(%1750) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1752 = chlo.broadcast_multiply %1751, %cst_424 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1753 = chlo.broadcast_add %1752, %cst_412 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1754 = chlo.broadcast_add %1748, %1753 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1755 = chlo.broadcast_multiply %1754, %cst_411 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1756 = chlo.broadcast_add %1755, %cst_410 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1757 = "mhlo.reshape"(%1756) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1758 = "mhlo.dot"(%1757, %cst_428) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1759 = chlo.broadcast_add %1758, %cst_427 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1760 = "mhlo.reshape"(%1759) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1761 = chlo.broadcast_maximum %1760, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1762 = "mhlo.reshape"(%1761) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1763 = "mhlo.dot"(%1762, %cst_432) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1764 = chlo.broadcast_add %1763, %cst_431 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1765 = "mhlo.reshape"(%1764) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1766 = chlo.broadcast_add %1765, %1756 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1767 = chlo.broadcast_multiply %1766, %cst_430 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1768 = chlo.broadcast_add %1767, %cst_429 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1769 = "mhlo.reshape"(%1768) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1770 = "mhlo.dot"(%1769, %cst_434) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1771 = chlo.broadcast_add %1770, %cst_433 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1772 = "mhlo.reshape"(%1771) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1773 = chlo.broadcast_maximum %1772, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1774 = "mhlo.reshape"(%1773) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1775 = "mhlo.dot"(%1774, %cst_438) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1776 = chlo.broadcast_add %1775, %cst_437 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1777 = "mhlo.reshape"(%1776) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1778 = chlo.broadcast_add %1777, %1768 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1779 = chlo.broadcast_multiply %1778, %cst_436 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1780 = chlo.broadcast_add %1779, %cst_435 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1781 = "mhlo.reshape"(%1780) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1782 = "mhlo.dot"(%1781, %cst_440) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1783 = chlo.broadcast_add %1782, %cst_439 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1784 = "mhlo.reshape"(%1783) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1785 = chlo.broadcast_maximum %1784, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1786 = "mhlo.reshape"(%1785) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1787 = "mhlo.dot"(%1786, %cst_444) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1788 = chlo.broadcast_add %1787, %cst_443 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1789 = "mhlo.reshape"(%1788) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1790 = chlo.broadcast_add %1789, %1780 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1791 = chlo.broadcast_multiply %1790, %cst_442 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1792 = chlo.broadcast_add %1791, %cst_441 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1793 = "mhlo.reshape"(%1792) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1794 = "mhlo.dot"(%1793, %cst_446) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1795 = chlo.broadcast_add %1794, %cst_445 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1796 = "mhlo.reshape"(%1795) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1797 = chlo.broadcast_maximum %1796, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1798 = "mhlo.reshape"(%1797) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1799 = "mhlo.dot"(%1798, %cst_454) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1800 = chlo.broadcast_add %1799, %cst_453 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1801 = "mhlo.reshape"(%1800) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1802 = chlo.broadcast_add %1801, %1792 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1803 = chlo.broadcast_multiply %1802, %cst_448 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1804 = chlo.broadcast_add %1803, %cst_447 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1805 = "mhlo.reshape"(%1804) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1806 = "mhlo.dot"(%1805, %cst_452) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1807 = chlo.broadcast_add %1806, %cst_451 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1808 = "mhlo.reshape"(%1807) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1809 = chlo.broadcast_add %1808, %1712 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1810 = chlo.broadcast_multiply %1809, %cst_450 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1811 = chlo.broadcast_add %1810, %cst_449 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1812 = "mhlo.reshape"(%1811) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1813 = "mhlo.dot"(%1812, %cst_464) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1814 = chlo.broadcast_add %1813, %cst_463 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1815 = "mhlo.reshape"(%1814) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1816 = "mhlo.transpose"(%1815) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1817 = "mhlo.dot"(%1812, %cst_468) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1818 = "mhlo.reshape"(%1817) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1819 = "mhlo.broadcast_in_dim"(%cst_467) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1820 = mhlo.add %1818, %1819 : tensor<1x384x128xf32>
  %1821 = chlo.broadcast_multiply %1820, %cst_466 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1822 = chlo.broadcast_add %1821, %cst_465 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1823 = "mhlo.reshape"(%1822) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1824 = "mhlo.dot"(%1823, %cst_460) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1825 = chlo.broadcast_add %1824, %cst_459 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1826 = "mhlo.reshape"(%1825) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1827 = "mhlo.transpose"(%1826) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1828 = "mhlo.dot"(%1823, %cst_462) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1829 = chlo.broadcast_add %1828, %cst_461 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1830 = "mhlo.reshape"(%1829) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1831 = "mhlo.transpose"(%1830) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1832 = "mhlo.dot_general"(%1831, %1827) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1833 = chlo.broadcast_multiply %1832, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1834 = chlo.broadcast_add %1833, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1835 = "mhlo.reduce"(%1834, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1836 = linalg.tensor_expand_shape %1835 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1837 = chlo.broadcast_subtract %1834, %1836 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1838 = "mhlo.exponential"(%1837) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1839 = "mhlo.reduce"(%1838, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1840 = linalg.tensor_expand_shape %1839 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1841 = chlo.broadcast_divide %1838, %1840 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1842 = "mhlo.dot_general"(%1841, %1816) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1843 = "mhlo.transpose"(%1842) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1844 = "mhlo.reshape"(%1843) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1845 = "mhlo.dot"(%1844, %cst_458) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1846 = chlo.broadcast_add %1845, %cst_457 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1847 = "mhlo.reshape"(%1846) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1848 = "mhlo.dot"(%1812, %cst_471) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1849 = chlo.broadcast_add %1848, %cst_470 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1850 = "mhlo.reshape"(%1849) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1851 = chlo.broadcast_multiply %1850, %cst_469 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1852 = chlo.broadcast_add %1851, %cst_457 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1853 = chlo.broadcast_add %1847, %1852 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1854 = chlo.broadcast_multiply %1853, %cst_456 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1855 = chlo.broadcast_add %1854, %cst_455 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1856 = "mhlo.reshape"(%1855) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1857 = "mhlo.dot"(%1856, %cst_473) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1858 = chlo.broadcast_add %1857, %cst_472 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1859 = "mhlo.reshape"(%1858) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1860 = chlo.broadcast_maximum %1859, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1861 = "mhlo.reshape"(%1860) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1862 = "mhlo.dot"(%1861, %cst_477) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1863 = chlo.broadcast_add %1862, %cst_476 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1864 = "mhlo.reshape"(%1863) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1865 = chlo.broadcast_add %1864, %1855 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1866 = chlo.broadcast_multiply %1865, %cst_475 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1867 = chlo.broadcast_add %1866, %cst_474 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1868 = "mhlo.reshape"(%1867) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1869 = "mhlo.dot"(%1868, %cst_479) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1870 = chlo.broadcast_add %1869, %cst_478 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1871 = "mhlo.reshape"(%1870) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1872 = chlo.broadcast_maximum %1871, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1873 = "mhlo.reshape"(%1872) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1874 = "mhlo.dot"(%1873, %cst_483) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1875 = chlo.broadcast_add %1874, %cst_482 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1876 = "mhlo.reshape"(%1875) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1877 = chlo.broadcast_add %1876, %1867 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1878 = chlo.broadcast_multiply %1877, %cst_481 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1879 = chlo.broadcast_add %1878, %cst_480 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1880 = "mhlo.reshape"(%1879) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1881 = "mhlo.dot"(%1880, %cst_485) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1882 = chlo.broadcast_add %1881, %cst_484 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1883 = "mhlo.reshape"(%1882) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1884 = chlo.broadcast_maximum %1883, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1885 = "mhlo.reshape"(%1884) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1886 = "mhlo.dot"(%1885, %cst_489) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1887 = chlo.broadcast_add %1886, %cst_488 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1888 = "mhlo.reshape"(%1887) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1889 = chlo.broadcast_add %1888, %1879 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1890 = chlo.broadcast_multiply %1889, %cst_487 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1891 = chlo.broadcast_add %1890, %cst_486 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1892 = "mhlo.reshape"(%1891) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1893 = "mhlo.dot"(%1892, %cst_491) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1894 = chlo.broadcast_add %1893, %cst_490 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1895 = "mhlo.reshape"(%1894) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1896 = chlo.broadcast_maximum %1895, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1897 = "mhlo.reshape"(%1896) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1898 = "mhlo.dot"(%1897, %cst_499) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1899 = chlo.broadcast_add %1898, %cst_498 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1900 = "mhlo.reshape"(%1899) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1901 = chlo.broadcast_add %1900, %1891 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1902 = chlo.broadcast_multiply %1901, %cst_493 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1903 = chlo.broadcast_add %1902, %cst_492 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1904 = "mhlo.reshape"(%1903) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1905 = "mhlo.dot"(%1904, %cst_497) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1906 = chlo.broadcast_add %1905, %cst_496 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1907 = "mhlo.reshape"(%1906) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1908 = chlo.broadcast_add %1907, %1811 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %1909 = chlo.broadcast_multiply %1908, %cst_495 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1910 = chlo.broadcast_add %1909, %cst_494 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %1911 = "mhlo.reshape"(%1910) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1912 = "mhlo.dot"(%1911, %cst_509) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1913 = chlo.broadcast_add %1912, %cst_508 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1914 = "mhlo.reshape"(%1913) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1915 = "mhlo.transpose"(%1914) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1916 = "mhlo.dot"(%1911, %cst_513) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1917 = "mhlo.reshape"(%1916) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1918 = "mhlo.broadcast_in_dim"(%cst_512) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %1919 = mhlo.add %1917, %1918 : tensor<1x384x128xf32>
  %1920 = chlo.broadcast_multiply %1919, %cst_511 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1921 = chlo.broadcast_add %1920, %cst_510 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1922 = "mhlo.reshape"(%1921) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1923 = "mhlo.dot"(%1922, %cst_505) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1924 = chlo.broadcast_add %1923, %cst_504 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1925 = "mhlo.reshape"(%1924) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1926 = "mhlo.transpose"(%1925) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1927 = "mhlo.dot"(%1922, %cst_507) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1928 = chlo.broadcast_add %1927, %cst_506 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1929 = "mhlo.reshape"(%1928) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %1930 = "mhlo.transpose"(%1929) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %1931 = "mhlo.dot_general"(%1930, %1926) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<3> : tensor<1xi64>}} : (tensor<1x4x384x32xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x384xf32>
  %1932 = chlo.broadcast_multiply %1931, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384x384xf32>
  %1933 = chlo.broadcast_add %1932, %24 : (tensor<1x4x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1934 = "mhlo.reduce"(%1933, %4) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.maximum %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1935 = linalg.tensor_expand_shape %1934 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1936 = chlo.broadcast_subtract %1933, %1935 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1937 = "mhlo.exponential"(%1936) : (tensor<1x4x384x384xf32>) -> tensor<1x4x384x384xf32>
  %1938 = "mhlo.reduce"(%1937, %5) ( {
  ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):  // no predecessors
    %2417 = mhlo.add %arg3, %arg4 : tensor<f32>
    "mhlo.return"(%2417) : (tensor<f32>) -> ()
  }) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x4x384x384xf32>, tensor<f32>) -> tensor<1x4x384xf32>
  %1939 = linalg.tensor_expand_shape %1938 [[0], [1], [2, 3]] : tensor<1x4x384xf32> into tensor<1x4x384x1xf32>
  %1940 = chlo.broadcast_divide %1937, %1939 : (tensor<1x4x384x384xf32>, tensor<1x4x384x1xf32>) -> tensor<1x4x384x384xf32>
  %1941 = "mhlo.dot_general"(%1940, %1915) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contracting_dimensions = dense<3> : tensor<1xi64>, rhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}} : (tensor<1x4x384x384xf32>, tensor<1x4x384x32xf32>) -> tensor<1x4x384x32xf32>
  %1942 = "mhlo.transpose"(%1941) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x4x384x32xf32>) -> tensor<1x384x4x32xf32>
  %1943 = "mhlo.reshape"(%1942) : (tensor<1x384x4x32xf32>) -> tensor<384x128xf32>
  %1944 = "mhlo.dot"(%1943, %cst_503) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %1945 = chlo.broadcast_add %1944, %cst_502 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1946 = "mhlo.reshape"(%1945) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1947 = "mhlo.dot"(%1911, %cst_516) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1948 = chlo.broadcast_add %1947, %cst_515 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1949 = "mhlo.reshape"(%1948) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1950 = chlo.broadcast_multiply %1949, %cst_514 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1951 = chlo.broadcast_add %1950, %cst_502 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1952 = chlo.broadcast_add %1946, %1951 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1953 = chlo.broadcast_multiply %1952, %cst_501 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1954 = chlo.broadcast_add %1953, %cst_500 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1955 = "mhlo.reshape"(%1954) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1956 = "mhlo.dot"(%1955, %cst_518) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1957 = chlo.broadcast_add %1956, %cst_517 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1958 = "mhlo.reshape"(%1957) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1959 = chlo.broadcast_maximum %1958, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1960 = "mhlo.reshape"(%1959) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1961 = "mhlo.dot"(%1960, %cst_522) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1962 = chlo.broadcast_add %1961, %cst_521 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1963 = "mhlo.reshape"(%1962) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1964 = chlo.broadcast_add %1963, %1954 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1965 = chlo.broadcast_multiply %1964, %cst_520 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1966 = chlo.broadcast_add %1965, %cst_519 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1967 = "mhlo.reshape"(%1966) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1968 = "mhlo.dot"(%1967, %cst_524) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1969 = chlo.broadcast_add %1968, %cst_523 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1970 = "mhlo.reshape"(%1969) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1971 = chlo.broadcast_maximum %1970, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1972 = "mhlo.reshape"(%1971) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1973 = "mhlo.dot"(%1972, %cst_528) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1974 = chlo.broadcast_add %1973, %cst_527 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1975 = "mhlo.reshape"(%1974) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1976 = chlo.broadcast_add %1975, %1966 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1977 = chlo.broadcast_multiply %1976, %cst_526 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1978 = chlo.broadcast_add %1977, %cst_525 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1979 = "mhlo.reshape"(%1978) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1980 = "mhlo.dot"(%1979, %cst_530) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1981 = chlo.broadcast_add %1980, %cst_529 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1982 = "mhlo.reshape"(%1981) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1983 = chlo.broadcast_maximum %1982, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1984 = "mhlo.reshape"(%1983) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1985 = "mhlo.dot"(%1984, %cst_534) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1986 = chlo.broadcast_add %1985, %cst_533 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1987 = "mhlo.reshape"(%1986) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %1988 = chlo.broadcast_add %1987, %1978 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %1989 = chlo.broadcast_multiply %1988, %cst_532 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1990 = chlo.broadcast_add %1989, %cst_531 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %1991 = "mhlo.reshape"(%1990) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %1992 = "mhlo.dot"(%1991, %cst_536) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %1993 = chlo.broadcast_add %1992, %cst_535 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %1994 = "mhlo.reshape"(%1993) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %1995 = chlo.broadcast_maximum %1994, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x384x512xf32>, tensor<f32>) -> tensor<1x384x512xf32>
  %1996 = "mhlo.reshape"(%1995) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %1997 = "mhlo.dot"(%1996, %cst_544) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %1998 = chlo.broadcast_add %1997, %cst_543 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %1999 = "mhlo.reshape"(%1998) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %2000 = chlo.broadcast_add %1999, %1990 : (tensor<1x384x128xf32>, tensor<1x384x128xf32>) -> tensor<1x384x128xf32>
  %2001 = chlo.broadcast_multiply %2000, %cst_538 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %2002 = chlo.broadcast_add %2001, %cst_537 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %2003 = "mhlo.reshape"(%2002) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %2004 = "mhlo.dot"(%2003, %cst_542) : (tensor<384x128xf32>, tensor<128x512xf32>) -> tensor<384x512xf32>
  %2005 = chlo.broadcast_add %2004, %cst_541 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x512xf32>, tensor<512xf32>) -> tensor<384x512xf32>
  %2006 = "mhlo.reshape"(%2005) : (tensor<384x512xf32>) -> tensor<1x384x512xf32>
  %2007 = chlo.broadcast_add %2006, %1910 : (tensor<1x384x512xf32>, tensor<1x384x512xf32>) -> tensor<1x384x512xf32>
  %2008 = chlo.broadcast_multiply %2007, %cst_540 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %2009 = chlo.broadcast_add %2008, %cst_539 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x512xf32>, tensor<512xf32>) -> tensor<1x384x512xf32>
  %2010 = "mhlo.reshape"(%2009) : (tensor<1x384x512xf32>) -> tensor<384x512xf32>
  %2011 = "mhlo.dot"(%2010, %cst_599) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %2012 = chlo.broadcast_add %2011, %cst_598 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %2013 = "mhlo.reshape"(%2012) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %2014 = "mhlo.transpose"(%2013) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %2015 = "mhlo.dot"(%2010, %cst_603) : (tensor<384x512xf32>, tensor<512x128xf32>) -> tensor<384x128xf32>
  %2016 = "mhlo.reshape"(%2015) : (tensor<384x128xf32>) -> tensor<1x384x128xf32>
  %2017 = "mhlo.broadcast_in_dim"(%cst_602) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x384x128xf32>
  %2018 = mhlo.add %2016, %2017 : tensor<1x384x128xf32>
  %2019 = chlo.broadcast_multiply %2018, %cst_601 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %2020 = chlo.broadcast_add %2019, %cst_600 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32>
  %2021 = "mhlo.reshape"(%2020) : (tensor<1x384x128xf32>) -> tensor<384x128xf32>
  %2022 = "mhlo.dot"(%2021, %cst_595) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %2023 = chlo.broadcast_add %2022, %cst_594 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %2024 = "mhlo.reshape"(%2023) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %2025 = "mhlo.transpose"(%2024) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %2026 = "mhlo.dot"(%2021, %cst_597) : (tensor<384x128xf32>, tensor<128x128xf32>) -> tensor<384x128xf32>
  %2027 = chlo.broadcast_add %2026, %cst_596 {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<384x128xf32>, tensor<128xf32>) -> tensor<384x128xf32>
  %2028 = "mhlo.reshape"(%2027) : (tensor<384x128xf32>) -> tensor<1x384x4x32xf32>
  %2029 = "mhlo.transpose"(%2028) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x384x4x32xf32>) -> tensor<1x4x384x32xf32>
  %2030 = "mhlo.dot_general"(%2029, %2025) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[0, 1]> : tensor<2xi64>, lhs_contract