AmosLewis · November 15, 2022 17:42 · AmosLewis · Nov 15, 2022
diff --git a/debert_torch_ir_print.txt b/debert_torch_ir_print.txt
 ➜  SHARK git:(main) ✗ torch-mlir-opt -pass-pipeline='builtin.module(torch-backend-to-tosa-backend-pipeline)' /tmp/_lambda.mlir -mlir-print-ir-after-all -mlir-disable-threading --debug
 Args: /home/chi/src/ubuntu20/shark/torch-mlir/build/bin/torch-mlir-opt -pass-pipeline=builtin.module(torch-backend-to-tosa-backend-pipeline) /tmp/_lambda.mlir -mlir-print-ir-after-all -mlir-disable-threading --debug 
 Load new dialect in Context builtin
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::SubElementTypeInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::SubElementAttrInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::RegionKindInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::CastOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ConditionallySpeculatable)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemoryEffectOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ResourceBlobManagerDialectInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmDialectInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeDialectInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::AffineBinaryOpExprStorage)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::AffineConstantExprStorage)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::AffineDimExprStorage)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::AffineMapStorage)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::IntegerSetStorage)
 Load new dialect in Context builtin
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DebugActionManager::GenericHandler)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ZeroOperands<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneRegion<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ZeroResults<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ZeroSuccessors<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NoRegionArguments<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NoTerminator<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::SingleBlock<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OpInvariants<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::AffineScope<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::IsIsolatedFromAbove<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::SymbolTable<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::RegionKindInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::HasOnlyGraphRegion<Empty>)
 Load new dialect in Context torch
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::InferTypeOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolUserOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::RegionBranchOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::RegionBranchTerminatorOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DialectInlinerInterface)
 Load new dialect in Context func
 Load new dialect in Context cf
 Load new dialect in Context arith
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::arith::ArithFastMathInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::VectorUnrollOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::InferIntRangeInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::bufferization::BufferizableOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::BranchOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::CallOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::CallableOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::FunctionOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::AutomaticAllocationScope<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::CallableOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::FunctionOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ZeroRegions<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneResult<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::IntType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ConstantLike<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ConditionallySpeculatable::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::AlwaysSpeculatableImplTrait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemoryEffectOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::torch::Torch::OpTrait::AllowedInModuleInitializer<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::InferTypeOpInterface::Trait<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::FloatType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::ValueTensorType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DialectResourceBlobHandle<mlir::BuiltinDialect>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::BoolType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::NoneType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::StringType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::DeviceType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::torch::Torch::ListType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::VariadicOperands<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::torch::Torch::OpTrait::AllowsTypeRefinement<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::Type>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NOperands<5>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::torch::Torch::OpTrait::HasValueSemantics<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::torch::Torch::OpTrait::ReadOnly<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NOperands<3>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NOperands<4>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NOperands<2>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneOperand<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::NResults<2>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::HasParent<mlir::func::FuncOp>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::MemRefsNormalizable<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::ReturnLike<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::IsTerminator<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DataLayoutSpecInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::detail::OpToOpPassAdaptor)
 Load new dialect in Context tensor
 Load new dialect in Context affine
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::AffineDmaStartOp)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::AffineMapAccessInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::AffineDmaWaitOp)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::LoopLikeOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::AffineReadOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::AffineWriteOpInterface)
 Load new dialect in Context complex
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedDimOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ReifyRankedShapedTypeOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OffsetSizeAndStrideOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DestinationStyleOpInterface)
 Load new dialect in Context linalg
 Load new dialect in Context math
 Load new dialect in Context memref
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::CopyOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::ViewLikeOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::linalg::LinalgOp)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::linalg::ContractionOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::linalg::ConvolutionOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::linalg::FillOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::TilingInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::PartialReductionOpInterface)
 Ignoring repeated interface registrationIgnoring repeated interface registrationLoad new dialect in Context torch_c
 Load new dialect in Context tosa
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::InferShapedTypeOpInterface)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::tosa::TosaOp)

 //===-------------------------------------------===//
 Legalizing operation : 'func.func'(0x8f3d190) {
  * Fold {
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::DialectFoldInterface)
  } -> FAILURE : unable to fold
 } -> FAILURE : no matched legalization pattern
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.constant.int'(0x8ec3280) {
  %0 = "torch.constant.int"() {value = 1 : i64} : () -> !torch.int

  * Fold {
  } -> FAILURE : unable to fold
 } -> FAILURE : no matched legalization pattern
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.constant.int'(0x8edffa0) {
  %1 = "torch.constant.int"() {value = 32 : i64} : () -> !torch.int

  * Fold {
  } -> FAILURE : unable to fold
 } -> FAILURE : no matched legalization pattern
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.constant.int'(0x8ee1570) {
  %2 = "torch.constant.int"() {value = 128 : i64} : () -> !torch.int

  * Fold {
  } -> FAILURE : unable to fold
 } -> FAILURE : no matched legalization pattern
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.constant.float'(0x8ee4440) {
  %3 = "torch.constant.float"() {value = 1.000000e+00 : f64} : () -> !torch.float

  * Fold {
  } -> FAILURE : unable to fold
 } -> FAILURE : no matched legalization pattern
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8eea1e0) {
  %4 = "torch.vtensor.literal"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> !torch.vtensor<[2],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpTrait::OneTypedResult<mlir::TensorType>::Impl<Empty>)
 ImplicitTypeIDRegistry::lookupOrInsert(mlir::tosa::TosaOp::Trait<Empty>)
    ** Insert  : 'tosa.const'(0x8f54030)
    ** Replace : 'torch.vtensor.literal'(0x8eea1e0)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f54030) {
      %4 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %4 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %6 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %8 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %21 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %24 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %26 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %27 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %32 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %33 = torch.aten.ones %32, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %34 = torch.aten.zeros %32, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %35 = torch.aten.slice.Tensor %31, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %36 = torch.aten.slice.Tensor %35, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %37 = torch.aten.embedding %30, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %38 = torch.aten.embedding %29, %36, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %39 = torch.aten.add.Tensor %37, %38, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %40 = torch.aten.embedding %28, %34, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %41 = torch.aten.add.Tensor %39, %40, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %42 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %43 = torch.aten.sum.dim_IntList %41, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %44 = torch.aten.div.Scalar %43, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %45 = torch.aten.sub.Tensor %41, %44, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.aten.pow.Tensor_Scalar %45, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.sum.dim_IntList %46, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %48 = torch.aten.div.Scalar %47, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %49 = torch.aten.sub.Tensor %41, %44, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.add.Scalar %48, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %51 = torch.aten.sqrt %50 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.div.Tensor %49, %51 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %53 = torch.aten.mul.Tensor %27, %52 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %54 = torch.aten.add.Tensor %53, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.unsqueeze %33, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.mul.Tensor %54, %55 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.unsqueeze %33, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %58 = torch.aten.unsqueeze %57, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %59 = torch.aten.squeeze.dim %58, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %60 = torch.aten.unsqueeze %59, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %61 = torch.aten.mul.Tensor %58, %60 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %62 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %63 = torch.aten.to.dtype %62, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %64 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %65 = torch.aten.broadcast_to %63, %64 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %66 = torch.aten.copy %65, %61, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %67 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %68 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %69 = torch.aten.view %56, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %70 = torch.aten.mm %69, %67 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %71 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %72 = torch.aten.view %70, %71 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %73 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %74 = torch.aten.view %72, %73 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %75 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %76 = torch.aten.permute %74, %75 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %77 = torch.aten.slice.Tensor %76, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %78 = torch.aten.slice.Tensor %76, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %79 = torch.aten.slice.Tensor %76, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %80 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %81 = torch.aten.unsqueeze %80, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %82 = torch.aten.slice.Tensor %81, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %83 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %84 = torch.aten.view %82, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %85 = torch.aten.permute %84, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %86 = torch.aten.add.Tensor %77, %85, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %87 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %88 = torch.aten.unsqueeze %87, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %89 = torch.aten.slice.Tensor %88, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %90 = torch.aten.view %89, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %91 = torch.aten.permute %90, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %92 = torch.aten.add.Tensor %79, %91, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %93 = torch.aten.div.Scalar %86, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %94 = torch.aten.transpose.int %78, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %95 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %96 = torch.aten.broadcast_to %93, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %97 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %98 = torch.aten.view %96, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %99 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %100 = torch.aten.broadcast_to %94, %99 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %101 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %102 = torch.aten.view %100, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %103 = torch.aten.bmm %98, %102 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %104 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %105 = torch.aten.view %103, %104 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %106 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %107 = torch.aten.to.dtype %106, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %108 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %109 = torch.aten.broadcast_to %107, %108 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %110 = torch.aten.copy %109, %66, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %111 = torch.aten.bitwise_not %110 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %112 = torch.aten.clone %24, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %113 = torch.aten.masked_fill.Tensor %105, %111, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %113, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %114 = torch.aten.sub.Tensor %113, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %115 = torch.aten.exp %114 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %116 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %117 = torch.aten.sum.dim_IntList %115, %116, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %118 = torch.aten.div.Tensor %115, %117 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %119 = torch.aten.masked_fill.Scalar %118, %111, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %120 = torch.aten.broadcast_to %119, %104 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %121 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %122 = torch.aten.view %120, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %123 = torch.aten.broadcast_to %92, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %124 = torch.aten.view %123, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %125 = torch.aten.bmm %122, %124 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %126 = torch.aten.view %125, %95 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %127 = torch.aten.permute %126, %75 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %128 = torch.aten.clone %127, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %129 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %130 = torch.aten.view %128, %129 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %131 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %132 = torch.aten.view %130, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %133 = torch.aten.mm %132, %131 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %134 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %135 = torch.aten.add.Tensor %134, %133, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %136 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %137 = torch.aten.view %135, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %138 = torch.aten.add.Tensor %137, %56, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %139 = torch.aten.sum.dim_IntList %138, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %140 = torch.aten.div.Scalar %139, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %141 = torch.aten.sub.Tensor %138, %140, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %142 = torch.aten.pow.Tensor_Scalar %141, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %143 = torch.aten.sum.dim_IntList %142, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %144 = torch.aten.div.Scalar %143, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %145 = torch.aten.sub.Tensor %138, %140, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %146 = torch.aten.add.Scalar %144, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %147 = torch.aten.sqrt %146 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.div.Tensor %145, %147 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %149 = torch.aten.mul.Tensor %27, %148 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %150 = torch.aten.add.Tensor %149, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %152 = torch.aten.view %150, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %153 = torch.aten.mm %152, %151 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %154 = torch.aten.mul.Scalar %21, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %155 = torch.aten.add.Tensor %154, %153, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %156 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %157 = torch.aten.view %155, %156 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %158 = torch.aten.gelu %157, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %159 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %160 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %161 = torch.aten.view %158, %160 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %162 = torch.aten.mm %161, %159 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %163 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %164 = torch.aten.add.Tensor %163, %162, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %165 = torch.aten.view %164, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %166 = torch.aten.add.Tensor %165, %150, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %167 = torch.aten.sum.dim_IntList %166, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %168 = torch.aten.div.Scalar %167, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %169 = torch.aten.sub.Tensor %166, %168, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %170 = torch.aten.pow.Tensor_Scalar %169, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %171 = torch.aten.sum.dim_IntList %170, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %172 = torch.aten.div.Scalar %171, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %173 = torch.aten.sub.Tensor %166, %168, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %174 = torch.aten.add.Scalar %172, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %175 = torch.aten.sqrt %174 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.div.Tensor %173, %175 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %177 = torch.aten.mul.Tensor %27, %176 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %178 = torch.aten.add.Tensor %177, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %180 = torch.aten.view %178, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %181 = torch.aten.mm %180, %179 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %182 = torch.aten.view %181, %71 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %183 = torch.aten.view %182, %73 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %184 = torch.aten.permute %183, %75 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %185 = torch.aten.slice.Tensor %184, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %186 = torch.aten.slice.Tensor %184, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %187 = torch.aten.slice.Tensor %184, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %188 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %189 = torch.aten.unsqueeze %188, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %190 = torch.aten.slice.Tensor %189, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %191 = torch.aten.view %190, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %192 = torch.aten.permute %191, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %193 = torch.aten.add.Tensor %185, %192, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %194 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %195 = torch.aten.unsqueeze %194, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %196 = torch.aten.slice.Tensor %195, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %197 = torch.aten.view %196, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %198 = torch.aten.permute %197, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %199 = torch.aten.add.Tensor %187, %198, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %200 = torch.aten.div.Scalar %193, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %201 = torch.aten.transpose.int %186, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %202 = torch.aten.broadcast_to %200, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %203 = torch.aten.view %202, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %204 = torch.aten.broadcast_to %201, %99 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %205 = torch.aten.view %204, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %206 = torch.aten.bmm %203, %205 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %207 = torch.aten.view %206, %104 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %208 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %209 = torch.aten.to.dtype %208, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %210 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %211 = torch.aten.broadcast_to %209, %210 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %212 = torch.aten.copy %211, %66, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %213 = torch.aten.bitwise_not %212 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %214 = torch.aten.clone %24, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %215 = torch.aten.masked_fill.Tensor %207, %213, %214 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %215, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %216 = torch.aten.sub.Tensor %215, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %217 = torch.aten.exp %216 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %218 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %219 = torch.aten.sum.dim_IntList %217, %218, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %220 = torch.aten.div.Tensor %217, %219 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %221 = torch.aten.masked_fill.Scalar %220, %213, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %222 = torch.aten.broadcast_to %221, %104 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %223 = torch.aten.view %222, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %224 = torch.aten.broadcast_to %199, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %225 = torch.aten.view %224, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %226 = torch.aten.bmm %223, %225 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %227 = torch.aten.view %226, %95 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %228 = torch.aten.permute %227, %75 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %229 = torch.aten.clone %228, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %230 = torch.aten.view %229, %129 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %231 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %232 = torch.aten.view %230, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %233 = torch.aten.mm %232, %231 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %234 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %235 = torch.aten.add.Tensor %234, %233, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %236 = torch.aten.view %235, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %237 = torch.aten.add.Tensor %236, %178, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %238 = torch.aten.sum.dim_IntList %237, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %239 = torch.aten.div.Scalar %238, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %240 = torch.aten.sub.Tensor %237, %239, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %241 = torch.aten.pow.Tensor_Scalar %240, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %242 = torch.aten.sum.dim_IntList %241, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %243 = torch.aten.div.Scalar %242, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %244 = torch.aten.sub.Tensor %237, %239, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %245 = torch.aten.add.Scalar %243, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %246 = torch.aten.sqrt %245 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.div.Tensor %244, %246 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %248 = torch.aten.mul.Tensor %27, %247 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %249 = torch.aten.add.Tensor %248, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %251 = torch.aten.view %249, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %252 = torch.aten.mm %251, %250 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %253 = torch.aten.mul.Scalar %21, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %254 = torch.aten.add.Tensor %253, %252, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %255 = torch.aten.view %254, %156 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %256 = torch.aten.gelu %255, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %257 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %258 = torch.aten.view %256, %160 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %259 = torch.aten.mm %258, %257 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %260 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %261 = torch.aten.add.Tensor %260, %259, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %262 = torch.aten.view %261, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %263 = torch.aten.add.Tensor %262, %249, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %264 = torch.aten.sum.dim_IntList %263, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %265 = torch.aten.div.Scalar %264, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %266 = torch.aten.sub.Tensor %263, %265, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %267 = torch.aten.pow.Tensor_Scalar %266, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %268 = torch.aten.sum.dim_IntList %267, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %269 = torch.aten.div.Scalar %268, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %270 = torch.aten.sub.Tensor %263, %265, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %271 = torch.aten.add.Scalar %269, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %272 = torch.aten.sqrt %271 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.div.Tensor %270, %272 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %274 = torch.aten.mul.Tensor %27, %273 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %275 = torch.aten.add.Tensor %274, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %277 = torch.aten.view %275, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %278 = torch.aten.mm %277, %276 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %279 = torch.aten.view %278, %71 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %280 = torch.aten.view %279, %73 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %281 = torch.aten.permute %280, %75 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %282 = torch.aten.slice.Tensor %281, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %283 = torch.aten.slice.Tensor %281, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %284 = torch.aten.slice.Tensor %281, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %285 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %286 = torch.aten.unsqueeze %285, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %287 = torch.aten.slice.Tensor %286, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %288 = torch.aten.view %287, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %289 = torch.aten.permute %288, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %290 = torch.aten.add.Tensor %282, %289, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %291 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %292 = torch.aten.unsqueeze %291, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %293 = torch.aten.slice.Tensor %292, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %294 = torch.aten.view %293, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %295 = torch.aten.permute %294, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %296 = torch.aten.add.Tensor %284, %295, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %297 = torch.aten.div.Scalar %290, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %298 = torch.aten.transpose.int %283, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %299 = torch.aten.broadcast_to %297, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %300 = torch.aten.view %299, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %301 = torch.aten.broadcast_to %298, %99 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %302 = torch.aten.view %301, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %303 = torch.aten.bmm %300, %302 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %304 = torch.aten.view %303, %104 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %305 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %306 = torch.aten.to.dtype %305, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %307 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %308 = torch.aten.broadcast_to %306, %307 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %309 = torch.aten.copy %308, %66, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %310 = torch.aten.bitwise_not %309 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %311 = torch.aten.clone %24, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %312 = torch.aten.masked_fill.Tensor %304, %310, %311 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %312, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %313 = torch.aten.sub.Tensor %312, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %314 = torch.aten.exp %313 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %315 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %316 = torch.aten.sum.dim_IntList %314, %315, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %317 = torch.aten.div.Tensor %314, %316 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %318 = torch.aten.masked_fill.Scalar %317, %310, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %319 = torch.aten.broadcast_to %318, %104 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %320 = torch.aten.view %319, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %321 = torch.aten.broadcast_to %296, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %322 = torch.aten.view %321, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %323 = torch.aten.bmm %320, %322 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %324 = torch.aten.view %323, %95 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %325 = torch.aten.permute %324, %75 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %326 = torch.aten.clone %325, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %327 = torch.aten.view %326, %129 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %328 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %329 = torch.aten.view %327, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %330 = torch.aten.mm %329, %328 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %331 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %332 = torch.aten.add.Tensor %331, %330, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %333 = torch.aten.view %332, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %334 = torch.aten.add.Tensor %333, %275, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %335 = torch.aten.sum.dim_IntList %334, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %336 = torch.aten.div.Scalar %335, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %337 = torch.aten.sub.Tensor %334, %336, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %338 = torch.aten.pow.Tensor_Scalar %337, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %339 = torch.aten.sum.dim_IntList %338, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %340 = torch.aten.div.Scalar %339, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %341 = torch.aten.sub.Tensor %334, %336, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %342 = torch.aten.add.Scalar %340, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %343 = torch.aten.sqrt %342 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.div.Tensor %341, %343 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %345 = torch.aten.mul.Tensor %27, %344 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %346 = torch.aten.add.Tensor %345, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %348 = torch.aten.view %346, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %349 = torch.aten.mm %348, %347 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %350 = torch.aten.mul.Scalar %21, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %351 = torch.aten.add.Tensor %350, %349, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %352 = torch.aten.view %351, %156 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %353 = torch.aten.gelu %352, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %354 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %355 = torch.aten.view %353, %160 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %356 = torch.aten.mm %355, %354 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %357 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %358 = torch.aten.add.Tensor %357, %356, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %359 = torch.aten.view %358, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %360 = torch.aten.add.Tensor %359, %346, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %361 = torch.aten.sum.dim_IntList %360, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %362 = torch.aten.div.Scalar %361, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %363 = torch.aten.sub.Tensor %360, %362, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %364 = torch.aten.pow.Tensor_Scalar %363, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %365 = torch.aten.sum.dim_IntList %364, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %366 = torch.aten.div.Scalar %365, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %367 = torch.aten.sub.Tensor %360, %362, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %368 = torch.aten.add.Scalar %366, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %369 = torch.aten.sqrt %368 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.div.Tensor %367, %369 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %371 = torch.aten.mul.Tensor %27, %370 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %372 = torch.aten.add.Tensor %371, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %374 = torch.aten.view %372, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %375 = torch.aten.mm %374, %373 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %376 = torch.aten.view %375, %71 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %377 = torch.aten.view %376, %73 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %378 = torch.aten.permute %377, %75 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %379 = torch.aten.slice.Tensor %378, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %380 = torch.aten.slice.Tensor %378, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %381 = torch.aten.slice.Tensor %378, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %382 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %383 = torch.aten.unsqueeze %382, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %384 = torch.aten.slice.Tensor %383, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %385 = torch.aten.view %384, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %386 = torch.aten.permute %385, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %387 = torch.aten.add.Tensor %379, %386, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %388 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %389 = torch.aten.unsqueeze %388, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %390 = torch.aten.slice.Tensor %389, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %391 = torch.aten.view %390, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %392 = torch.aten.permute %391, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %393 = torch.aten.add.Tensor %381, %392, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %394 = torch.aten.div.Scalar %387, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %395 = torch.aten.transpose.int %380, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %396 = torch.aten.broadcast_to %394, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %397 = torch.aten.view %396, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %398 = torch.aten.broadcast_to %395, %99 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %399 = torch.aten.view %398, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %400 = torch.aten.bmm %397, %399 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %401 = torch.aten.view %400, %104 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %402 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %403 = torch.aten.to.dtype %402, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %404 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %405 = torch.aten.broadcast_to %403, %404 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %406 = torch.aten.copy %405, %66, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %407 = torch.aten.bitwise_not %406 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %408 = torch.aten.clone %24, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %409 = torch.aten.masked_fill.Tensor %401, %407, %408 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %409, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %410 = torch.aten.sub.Tensor %409, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %411 = torch.aten.exp %410 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %412 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %413 = torch.aten.sum.dim_IntList %411, %412, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %414 = torch.aten.div.Tensor %411, %413 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %415 = torch.aten.masked_fill.Scalar %414, %407, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %416 = torch.aten.broadcast_to %415, %104 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %417 = torch.aten.view %416, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %418 = torch.aten.broadcast_to %393, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %419 = torch.aten.view %418, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %420 = torch.aten.bmm %417, %419 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %421 = torch.aten.view %420, %95 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %422 = torch.aten.permute %421, %75 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %423 = torch.aten.clone %422, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %424 = torch.aten.view %423, %129 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %425 = torch.aten.transpose.int %10, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %426 = torch.aten.view %424, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %427 = torch.aten.mm %426, %425 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %428 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %429 = torch.aten.add.Tensor %428, %427, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %430 = torch.aten.view %429, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %431 = torch.aten.add.Tensor %430, %372, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %432 = torch.aten.sum.dim_IntList %431, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %433 = torch.aten.div.Scalar %432, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %434 = torch.aten.sub.Tensor %431, %433, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %435 = torch.aten.pow.Tensor_Scalar %434, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %436 = torch.aten.sum.dim_IntList %435, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %437 = torch.aten.div.Scalar %436, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %438 = torch.aten.sub.Tensor %431, %433, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %439 = torch.aten.add.Scalar %437, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %440 = torch.aten.sqrt %439 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.div.Tensor %438, %440 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %442 = torch.aten.mul.Tensor %27, %441 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %443 = torch.aten.add.Tensor %442, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %445 = torch.aten.view %443, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %446 = torch.aten.mm %445, %444 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %447 = torch.aten.mul.Scalar %21, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %448 = torch.aten.add.Tensor %447, %446, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %449 = torch.aten.view %448, %156 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %450 = torch.aten.gelu %449, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %451 = torch.aten.transpose.int %8, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %452 = torch.aten.view %450, %160 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %453 = torch.aten.mm %452, %451 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %454 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %455 = torch.aten.add.Tensor %454, %453, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %456 = torch.aten.view %455, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %457 = torch.aten.add.Tensor %456, %443, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %458 = torch.aten.sum.dim_IntList %457, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %459 = torch.aten.div.Scalar %458, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %460 = torch.aten.sub.Tensor %457, %459, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %461 = torch.aten.pow.Tensor_Scalar %460, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %462 = torch.aten.sum.dim_IntList %461, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %463 = torch.aten.div.Scalar %462, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %464 = torch.aten.sub.Tensor %457, %459, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %465 = torch.aten.add.Scalar %463, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %466 = torch.aten.sqrt %465 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.div.Tensor %464, %466 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %468 = torch.aten.mul.Tensor %27, %467 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %469 = torch.aten.add.Tensor %468, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %471 = torch.aten.view %469, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %472 = torch.aten.mm %471, %470 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %473 = torch.aten.view %472, %71 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %474 = torch.aten.view %473, %73 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %475 = torch.aten.permute %474, %75 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %476 = torch.aten.slice.Tensor %475, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %477 = torch.aten.slice.Tensor %475, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %478 = torch.aten.slice.Tensor %475, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %479 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %480 = torch.aten.unsqueeze %479, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %481 = torch.aten.slice.Tensor %480, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %482 = torch.aten.view %481, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %483 = torch.aten.permute %482, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %484 = torch.aten.add.Tensor %476, %483, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %485 = torch.aten.unsqueeze %26, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %486 = torch.aten.unsqueeze %485, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %487 = torch.aten.slice.Tensor %486, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %488 = torch.aten.view %487, %83 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %489 = torch.aten.permute %488, %75 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %490 = torch.aten.add.Tensor %478, %489, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %491 = torch.aten.div.Scalar %484, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %492 = torch.aten.transpose.int %477, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %493 = torch.aten.broadcast_to %491, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %494 = torch.aten.view %493, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %495 = torch.aten.broadcast_to %492, %99 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %496 = torch.aten.view %495, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %497 = torch.aten.bmm %494, %496 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %498 = torch.aten.view %497, %104 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %499 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %500 = torch.aten.to.dtype %499, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %501 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %502 = torch.aten.broadcast_to %500, %501 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %503 = torch.aten.copy %502, %66, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %504 = torch.aten.bitwise_not %503 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %505 = torch.aten.clone %24, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %506 = torch.aten.masked_fill.Tensor %498, %504, %505 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %506, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %507 = torch.aten.sub.Tensor %506, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %508 = torch.aten.exp %507 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %509 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %510 = torch.aten.sum.dim_IntList %508, %509, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %511 = torch.aten.div.Tensor %508, %510 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %512 = torch.aten.masked_fill.Scalar %511, %504, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %513 = torch.aten.broadcast_to %512, %104 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %514 = torch.aten.view %513, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %515 = torch.aten.broadcast_to %490, %95 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %516 = torch.aten.view %515, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %517 = torch.aten.bmm %514, %516 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %518 = torch.aten.view %517, %95 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %519 = torch.aten.permute %518, %75 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %520 = torch.aten.clone %519, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %521 = torch.aten.view %520, %129 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %522 = torch.aten.transpose.int %6, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %523 = torch.aten.view %521, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %524 = torch.aten.mm %523, %522 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %525 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %526 = torch.aten.add.Tensor %525, %524, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %527 = torch.aten.view %526, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %528 = torch.aten.add.Tensor %527, %469, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %529 = torch.aten.sum.dim_IntList %528, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %530 = torch.aten.div.Scalar %529, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %531 = torch.aten.sub.Tensor %528, %530, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %532 = torch.aten.pow.Tensor_Scalar %531, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %533 = torch.aten.sum.dim_IntList %532, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %534 = torch.aten.div.Scalar %533, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %535 = torch.aten.sub.Tensor %528, %530, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %536 = torch.aten.add.Scalar %534, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %537 = torch.aten.sqrt %536 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.div.Tensor %535, %537 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %539 = torch.aten.mul.Tensor %27, %538 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %540 = torch.aten.add.Tensor %539, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %542 = torch.aten.view %540, %68 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %543 = torch.aten.mm %542, %541 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %544 = torch.aten.mul.Scalar %21, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %545 = torch.aten.add.Tensor %544, %543, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %546 = torch.aten.view %545, %156 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %547 = torch.aten.gelu %546, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %548 = torch.aten.transpose.int %4, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %549 = torch.aten.view %547, %160 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %550 = torch.aten.mm %549, %548 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %551 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %552 = torch.aten.add.Tensor %551, %550, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %553 = torch.aten.view %552, %136 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %554 = torch.aten.add.Tensor %553, %540, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %555 = torch.aten.sum.dim_IntList %554, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %556 = torch.aten.div.Scalar %555, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %557 = torch.aten.sub.Tensor %554, %556, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %558 = torch.aten.pow.Tensor_Scalar %557, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %559 = torch.aten.sum.dim_IntList %558, %42, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %560 = torch.aten.div.Scalar %559, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %561 = torch.aten.sub.Tensor %554, %556, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %562 = torch.aten.add.Scalar %560, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %563 = torch.aten.sqrt %562 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.div.Tensor %561, %563 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %565 = torch.aten.mul.Tensor %27, %564 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %566 = torch.aten.add.Tensor %565, %26, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.slice.Tensor %566, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.slice.Tensor %567, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %569 = torch.aten.squeeze.dim %568, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %570 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %571 = torch.aten.mm %569, %570 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %572 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %573 = torch.aten.add.Tensor %572, %571, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %574 = torch.aten.gelu %573, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %575 = torch.aten.transpose.int %2, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %576 = torch.aten.mm %574, %575 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %577 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %578 = torch.aten.add.Tensor %577, %576, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %578 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8eef050) {
  %6 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> !torch.vtensor<[2,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8e6b020)
    ** Replace : 'torch.vtensor.literal'(0x8eef050)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8e6b020) {
      %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %6 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %8 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %10 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %22 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %25 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %27 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %28 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %33 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %34 = torch.aten.ones %33, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %35 = torch.aten.zeros %33, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %36 = torch.aten.slice.Tensor %32, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %37 = torch.aten.slice.Tensor %36, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %38 = torch.aten.embedding %31, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %39 = torch.aten.embedding %30, %37, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %40 = torch.aten.add.Tensor %38, %39, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %41 = torch.aten.embedding %29, %35, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %42 = torch.aten.add.Tensor %40, %41, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %43 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %44 = torch.aten.sum.dim_IntList %42, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %45 = torch.aten.div.Scalar %44, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %46 = torch.aten.sub.Tensor %42, %45, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.pow.Tensor_Scalar %46, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.sum.dim_IntList %47, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %49 = torch.aten.div.Scalar %48, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %50 = torch.aten.sub.Tensor %42, %45, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.aten.add.Scalar %49, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.sqrt %51 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.div.Tensor %50, %52 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %54 = torch.aten.mul.Tensor %28, %53 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.add.Tensor %54, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.unsqueeze %34, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %57 = torch.aten.mul.Tensor %55, %56 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.unsqueeze %34, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %59 = torch.aten.unsqueeze %58, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %60 = torch.aten.squeeze.dim %59, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %61 = torch.aten.unsqueeze %60, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %62 = torch.aten.mul.Tensor %59, %61 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %63 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %64 = torch.aten.to.dtype %63, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %65 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %66 = torch.aten.broadcast_to %64, %65 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %67 = torch.aten.copy %66, %62, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %68 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %69 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %70 = torch.aten.view %57, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %71 = torch.aten.mm %70, %68 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %72 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %73 = torch.aten.view %71, %72 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %74 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %75 = torch.aten.view %73, %74 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %76 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %77 = torch.aten.permute %75, %76 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %78 = torch.aten.slice.Tensor %77, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %79 = torch.aten.slice.Tensor %77, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %80 = torch.aten.slice.Tensor %77, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %81 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %82 = torch.aten.unsqueeze %81, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %83 = torch.aten.slice.Tensor %82, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %84 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %85 = torch.aten.view %83, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %86 = torch.aten.permute %85, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %87 = torch.aten.add.Tensor %78, %86, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %88 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %89 = torch.aten.unsqueeze %88, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %90 = torch.aten.slice.Tensor %89, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %91 = torch.aten.view %90, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %92 = torch.aten.permute %91, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %93 = torch.aten.add.Tensor %80, %92, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %94 = torch.aten.div.Scalar %87, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %95 = torch.aten.transpose.int %79, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %96 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %97 = torch.aten.broadcast_to %94, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %98 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %99 = torch.aten.view %97, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %100 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %101 = torch.aten.broadcast_to %95, %100 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %102 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %103 = torch.aten.view %101, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %104 = torch.aten.bmm %99, %103 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %105 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %106 = torch.aten.view %104, %105 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %107 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %108 = torch.aten.to.dtype %107, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %109 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %110 = torch.aten.broadcast_to %108, %109 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %111 = torch.aten.copy %110, %67, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %112 = torch.aten.bitwise_not %111 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %113 = torch.aten.clone %25, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %114 = torch.aten.masked_fill.Tensor %106, %112, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %114, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %115 = torch.aten.sub.Tensor %114, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %116 = torch.aten.exp %115 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %117 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %118 = torch.aten.sum.dim_IntList %116, %117, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %119 = torch.aten.div.Tensor %116, %118 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %120 = torch.aten.masked_fill.Scalar %119, %112, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %121 = torch.aten.broadcast_to %120, %105 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %122 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %123 = torch.aten.view %121, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %124 = torch.aten.broadcast_to %93, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %125 = torch.aten.view %124, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %126 = torch.aten.bmm %123, %125 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %127 = torch.aten.view %126, %96 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %128 = torch.aten.permute %127, %76 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %129 = torch.aten.clone %128, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %130 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %131 = torch.aten.view %129, %130 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %132 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %133 = torch.aten.view %131, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %134 = torch.aten.mm %133, %132 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %135 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %136 = torch.aten.add.Tensor %135, %134, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %137 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %138 = torch.aten.view %136, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %139 = torch.aten.add.Tensor %138, %57, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %140 = torch.aten.sum.dim_IntList %139, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %141 = torch.aten.div.Scalar %140, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %142 = torch.aten.sub.Tensor %139, %141, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %143 = torch.aten.pow.Tensor_Scalar %142, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %144 = torch.aten.sum.dim_IntList %143, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %145 = torch.aten.div.Scalar %144, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %146 = torch.aten.sub.Tensor %139, %141, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %147 = torch.aten.add.Scalar %145, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.sqrt %147 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.div.Tensor %146, %148 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %150 = torch.aten.mul.Tensor %28, %149 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.add.Tensor %150, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %153 = torch.aten.view %151, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %154 = torch.aten.mm %153, %152 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %155 = torch.aten.mul.Scalar %22, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %156 = torch.aten.add.Tensor %155, %154, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %157 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %158 = torch.aten.view %156, %157 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %159 = torch.aten.gelu %158, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %160 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %161 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %162 = torch.aten.view %159, %161 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %163 = torch.aten.mm %162, %160 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %164 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %165 = torch.aten.add.Tensor %164, %163, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %166 = torch.aten.view %165, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %167 = torch.aten.add.Tensor %166, %151, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %168 = torch.aten.sum.dim_IntList %167, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %169 = torch.aten.div.Scalar %168, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %170 = torch.aten.sub.Tensor %167, %169, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %171 = torch.aten.pow.Tensor_Scalar %170, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %172 = torch.aten.sum.dim_IntList %171, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %173 = torch.aten.div.Scalar %172, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %174 = torch.aten.sub.Tensor %167, %169, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %175 = torch.aten.add.Scalar %173, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.sqrt %175 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.div.Tensor %174, %176 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %178 = torch.aten.mul.Tensor %28, %177 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.add.Tensor %178, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %181 = torch.aten.view %179, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %182 = torch.aten.mm %181, %180 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %183 = torch.aten.view %182, %72 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %184 = torch.aten.view %183, %74 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %185 = torch.aten.permute %184, %76 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %186 = torch.aten.slice.Tensor %185, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %187 = torch.aten.slice.Tensor %185, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %188 = torch.aten.slice.Tensor %185, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %189 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %190 = torch.aten.unsqueeze %189, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %191 = torch.aten.slice.Tensor %190, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %192 = torch.aten.view %191, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %193 = torch.aten.permute %192, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %194 = torch.aten.add.Tensor %186, %193, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %195 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %196 = torch.aten.unsqueeze %195, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %197 = torch.aten.slice.Tensor %196, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %198 = torch.aten.view %197, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %199 = torch.aten.permute %198, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %200 = torch.aten.add.Tensor %188, %199, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %201 = torch.aten.div.Scalar %194, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %202 = torch.aten.transpose.int %187, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %203 = torch.aten.broadcast_to %201, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %204 = torch.aten.view %203, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %205 = torch.aten.broadcast_to %202, %100 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %206 = torch.aten.view %205, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %207 = torch.aten.bmm %204, %206 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %208 = torch.aten.view %207, %105 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %209 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %210 = torch.aten.to.dtype %209, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %211 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %212 = torch.aten.broadcast_to %210, %211 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %213 = torch.aten.copy %212, %67, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %214 = torch.aten.bitwise_not %213 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %215 = torch.aten.clone %25, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %216 = torch.aten.masked_fill.Tensor %208, %214, %215 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %216, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %217 = torch.aten.sub.Tensor %216, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %218 = torch.aten.exp %217 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %219 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %220 = torch.aten.sum.dim_IntList %218, %219, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %221 = torch.aten.div.Tensor %218, %220 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %222 = torch.aten.masked_fill.Scalar %221, %214, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %223 = torch.aten.broadcast_to %222, %105 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %224 = torch.aten.view %223, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %225 = torch.aten.broadcast_to %200, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %226 = torch.aten.view %225, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %227 = torch.aten.bmm %224, %226 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %228 = torch.aten.view %227, %96 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %229 = torch.aten.permute %228, %76 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %230 = torch.aten.clone %229, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %231 = torch.aten.view %230, %130 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %232 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %233 = torch.aten.view %231, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %234 = torch.aten.mm %233, %232 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %235 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %236 = torch.aten.add.Tensor %235, %234, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %237 = torch.aten.view %236, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %238 = torch.aten.add.Tensor %237, %179, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %239 = torch.aten.sum.dim_IntList %238, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %240 = torch.aten.div.Scalar %239, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %241 = torch.aten.sub.Tensor %238, %240, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %242 = torch.aten.pow.Tensor_Scalar %241, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %243 = torch.aten.sum.dim_IntList %242, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %244 = torch.aten.div.Scalar %243, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %245 = torch.aten.sub.Tensor %238, %240, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %246 = torch.aten.add.Scalar %244, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.sqrt %246 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.div.Tensor %245, %247 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %249 = torch.aten.mul.Tensor %28, %248 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.add.Tensor %249, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %252 = torch.aten.view %250, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %253 = torch.aten.mm %252, %251 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %254 = torch.aten.mul.Scalar %22, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %255 = torch.aten.add.Tensor %254, %253, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %256 = torch.aten.view %255, %157 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %257 = torch.aten.gelu %256, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %258 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %259 = torch.aten.view %257, %161 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %260 = torch.aten.mm %259, %258 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %261 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %262 = torch.aten.add.Tensor %261, %260, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %263 = torch.aten.view %262, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %264 = torch.aten.add.Tensor %263, %250, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %265 = torch.aten.sum.dim_IntList %264, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %266 = torch.aten.div.Scalar %265, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %267 = torch.aten.sub.Tensor %264, %266, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %268 = torch.aten.pow.Tensor_Scalar %267, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %269 = torch.aten.sum.dim_IntList %268, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %270 = torch.aten.div.Scalar %269, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %271 = torch.aten.sub.Tensor %264, %266, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %272 = torch.aten.add.Scalar %270, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.sqrt %272 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.div.Tensor %271, %273 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %275 = torch.aten.mul.Tensor %28, %274 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.add.Tensor %275, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %278 = torch.aten.view %276, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %279 = torch.aten.mm %278, %277 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %280 = torch.aten.view %279, %72 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %281 = torch.aten.view %280, %74 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %282 = torch.aten.permute %281, %76 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %283 = torch.aten.slice.Tensor %282, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %284 = torch.aten.slice.Tensor %282, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %285 = torch.aten.slice.Tensor %282, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %286 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %287 = torch.aten.unsqueeze %286, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %288 = torch.aten.slice.Tensor %287, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %289 = torch.aten.view %288, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %290 = torch.aten.permute %289, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %291 = torch.aten.add.Tensor %283, %290, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %292 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %293 = torch.aten.unsqueeze %292, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %294 = torch.aten.slice.Tensor %293, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %295 = torch.aten.view %294, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %296 = torch.aten.permute %295, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %297 = torch.aten.add.Tensor %285, %296, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %298 = torch.aten.div.Scalar %291, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %299 = torch.aten.transpose.int %284, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %300 = torch.aten.broadcast_to %298, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %301 = torch.aten.view %300, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %302 = torch.aten.broadcast_to %299, %100 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %303 = torch.aten.view %302, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %304 = torch.aten.bmm %301, %303 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %305 = torch.aten.view %304, %105 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %306 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %307 = torch.aten.to.dtype %306, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %308 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %309 = torch.aten.broadcast_to %307, %308 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %310 = torch.aten.copy %309, %67, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %311 = torch.aten.bitwise_not %310 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %312 = torch.aten.clone %25, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %313 = torch.aten.masked_fill.Tensor %305, %311, %312 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %313, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %314 = torch.aten.sub.Tensor %313, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %315 = torch.aten.exp %314 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %316 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %317 = torch.aten.sum.dim_IntList %315, %316, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %318 = torch.aten.div.Tensor %315, %317 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %319 = torch.aten.masked_fill.Scalar %318, %311, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %320 = torch.aten.broadcast_to %319, %105 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %321 = torch.aten.view %320, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %322 = torch.aten.broadcast_to %297, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %323 = torch.aten.view %322, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %324 = torch.aten.bmm %321, %323 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %325 = torch.aten.view %324, %96 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %326 = torch.aten.permute %325, %76 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %327 = torch.aten.clone %326, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %328 = torch.aten.view %327, %130 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %329 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %330 = torch.aten.view %328, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %331 = torch.aten.mm %330, %329 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %332 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %333 = torch.aten.add.Tensor %332, %331, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %334 = torch.aten.view %333, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %335 = torch.aten.add.Tensor %334, %276, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %336 = torch.aten.sum.dim_IntList %335, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %337 = torch.aten.div.Scalar %336, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %338 = torch.aten.sub.Tensor %335, %337, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %339 = torch.aten.pow.Tensor_Scalar %338, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %340 = torch.aten.sum.dim_IntList %339, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %341 = torch.aten.div.Scalar %340, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %342 = torch.aten.sub.Tensor %335, %337, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %343 = torch.aten.add.Scalar %341, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.sqrt %343 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.div.Tensor %342, %344 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %346 = torch.aten.mul.Tensor %28, %345 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.add.Tensor %346, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %349 = torch.aten.view %347, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %350 = torch.aten.mm %349, %348 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %351 = torch.aten.mul.Scalar %22, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %352 = torch.aten.add.Tensor %351, %350, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %353 = torch.aten.view %352, %157 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %354 = torch.aten.gelu %353, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %355 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %356 = torch.aten.view %354, %161 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %357 = torch.aten.mm %356, %355 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %358 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %359 = torch.aten.add.Tensor %358, %357, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %360 = torch.aten.view %359, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %361 = torch.aten.add.Tensor %360, %347, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %362 = torch.aten.sum.dim_IntList %361, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %363 = torch.aten.div.Scalar %362, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %364 = torch.aten.sub.Tensor %361, %363, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %365 = torch.aten.pow.Tensor_Scalar %364, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %366 = torch.aten.sum.dim_IntList %365, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %367 = torch.aten.div.Scalar %366, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %368 = torch.aten.sub.Tensor %361, %363, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %369 = torch.aten.add.Scalar %367, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.sqrt %369 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.div.Tensor %368, %370 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %372 = torch.aten.mul.Tensor %28, %371 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.add.Tensor %372, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %375 = torch.aten.view %373, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %376 = torch.aten.mm %375, %374 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %377 = torch.aten.view %376, %72 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %378 = torch.aten.view %377, %74 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %379 = torch.aten.permute %378, %76 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %380 = torch.aten.slice.Tensor %379, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %381 = torch.aten.slice.Tensor %379, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %382 = torch.aten.slice.Tensor %379, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %383 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %384 = torch.aten.unsqueeze %383, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %385 = torch.aten.slice.Tensor %384, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %386 = torch.aten.view %385, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %387 = torch.aten.permute %386, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %388 = torch.aten.add.Tensor %380, %387, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %389 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %390 = torch.aten.unsqueeze %389, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %391 = torch.aten.slice.Tensor %390, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %392 = torch.aten.view %391, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %393 = torch.aten.permute %392, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %394 = torch.aten.add.Tensor %382, %393, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %395 = torch.aten.div.Scalar %388, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %396 = torch.aten.transpose.int %381, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %397 = torch.aten.broadcast_to %395, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %398 = torch.aten.view %397, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %399 = torch.aten.broadcast_to %396, %100 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %400 = torch.aten.view %399, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %401 = torch.aten.bmm %398, %400 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %402 = torch.aten.view %401, %105 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %403 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %404 = torch.aten.to.dtype %403, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %405 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %406 = torch.aten.broadcast_to %404, %405 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %407 = torch.aten.copy %406, %67, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %408 = torch.aten.bitwise_not %407 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %409 = torch.aten.clone %25, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %410 = torch.aten.masked_fill.Tensor %402, %408, %409 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %410, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %411 = torch.aten.sub.Tensor %410, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %412 = torch.aten.exp %411 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %413 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %414 = torch.aten.sum.dim_IntList %412, %413, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %415 = torch.aten.div.Tensor %412, %414 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %416 = torch.aten.masked_fill.Scalar %415, %408, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %417 = torch.aten.broadcast_to %416, %105 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %418 = torch.aten.view %417, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %419 = torch.aten.broadcast_to %394, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %420 = torch.aten.view %419, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %421 = torch.aten.bmm %418, %420 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %422 = torch.aten.view %421, %96 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %423 = torch.aten.permute %422, %76 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %424 = torch.aten.clone %423, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %425 = torch.aten.view %424, %130 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %426 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %427 = torch.aten.view %425, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %428 = torch.aten.mm %427, %426 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %429 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %430 = torch.aten.add.Tensor %429, %428, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %431 = torch.aten.view %430, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %432 = torch.aten.add.Tensor %431, %373, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %433 = torch.aten.sum.dim_IntList %432, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %434 = torch.aten.div.Scalar %433, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %435 = torch.aten.sub.Tensor %432, %434, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %436 = torch.aten.pow.Tensor_Scalar %435, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %437 = torch.aten.sum.dim_IntList %436, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %438 = torch.aten.div.Scalar %437, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %439 = torch.aten.sub.Tensor %432, %434, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %440 = torch.aten.add.Scalar %438, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.sqrt %440 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.div.Tensor %439, %441 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %443 = torch.aten.mul.Tensor %28, %442 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.add.Tensor %443, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.transpose.int %10, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %446 = torch.aten.view %444, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %447 = torch.aten.mm %446, %445 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %448 = torch.aten.mul.Scalar %22, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %449 = torch.aten.add.Tensor %448, %447, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %450 = torch.aten.view %449, %157 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %451 = torch.aten.gelu %450, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %452 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %453 = torch.aten.view %451, %161 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %454 = torch.aten.mm %453, %452 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %455 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %456 = torch.aten.add.Tensor %455, %454, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %457 = torch.aten.view %456, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %458 = torch.aten.add.Tensor %457, %444, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %459 = torch.aten.sum.dim_IntList %458, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %460 = torch.aten.div.Scalar %459, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %461 = torch.aten.sub.Tensor %458, %460, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %462 = torch.aten.pow.Tensor_Scalar %461, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %463 = torch.aten.sum.dim_IntList %462, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %464 = torch.aten.div.Scalar %463, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %465 = torch.aten.sub.Tensor %458, %460, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %466 = torch.aten.add.Scalar %464, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.sqrt %466 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.div.Tensor %465, %467 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %469 = torch.aten.mul.Tensor %28, %468 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.add.Tensor %469, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.transpose.int %8, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %472 = torch.aten.view %470, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %473 = torch.aten.mm %472, %471 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %474 = torch.aten.view %473, %72 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %475 = torch.aten.view %474, %74 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %476 = torch.aten.permute %475, %76 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %477 = torch.aten.slice.Tensor %476, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %478 = torch.aten.slice.Tensor %476, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %479 = torch.aten.slice.Tensor %476, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %480 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %481 = torch.aten.unsqueeze %480, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %482 = torch.aten.slice.Tensor %481, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %483 = torch.aten.view %482, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %484 = torch.aten.permute %483, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %485 = torch.aten.add.Tensor %477, %484, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %486 = torch.aten.unsqueeze %27, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %487 = torch.aten.unsqueeze %486, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %488 = torch.aten.slice.Tensor %487, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %489 = torch.aten.view %488, %84 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %490 = torch.aten.permute %489, %76 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %491 = torch.aten.add.Tensor %479, %490, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %492 = torch.aten.div.Scalar %485, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %493 = torch.aten.transpose.int %478, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %494 = torch.aten.broadcast_to %492, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %495 = torch.aten.view %494, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %496 = torch.aten.broadcast_to %493, %100 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %497 = torch.aten.view %496, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %498 = torch.aten.bmm %495, %497 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %499 = torch.aten.view %498, %105 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %500 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %501 = torch.aten.to.dtype %500, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %502 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %503 = torch.aten.broadcast_to %501, %502 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %504 = torch.aten.copy %503, %67, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %505 = torch.aten.bitwise_not %504 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %506 = torch.aten.clone %25, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %507 = torch.aten.masked_fill.Tensor %499, %505, %506 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %507, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %508 = torch.aten.sub.Tensor %507, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %509 = torch.aten.exp %508 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %510 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %511 = torch.aten.sum.dim_IntList %509, %510, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %512 = torch.aten.div.Tensor %509, %511 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %513 = torch.aten.masked_fill.Scalar %512, %505, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %514 = torch.aten.broadcast_to %513, %105 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %515 = torch.aten.view %514, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %516 = torch.aten.broadcast_to %491, %96 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %517 = torch.aten.view %516, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %518 = torch.aten.bmm %515, %517 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %519 = torch.aten.view %518, %96 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %520 = torch.aten.permute %519, %76 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %521 = torch.aten.clone %520, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %522 = torch.aten.view %521, %130 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %523 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %524 = torch.aten.view %522, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %525 = torch.aten.mm %524, %523 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %526 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %527 = torch.aten.add.Tensor %526, %525, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %528 = torch.aten.view %527, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %529 = torch.aten.add.Tensor %528, %470, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %530 = torch.aten.sum.dim_IntList %529, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %531 = torch.aten.div.Scalar %530, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %532 = torch.aten.sub.Tensor %529, %531, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %533 = torch.aten.pow.Tensor_Scalar %532, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %534 = torch.aten.sum.dim_IntList %533, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %535 = torch.aten.div.Scalar %534, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %536 = torch.aten.sub.Tensor %529, %531, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %537 = torch.aten.add.Scalar %535, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.sqrt %537 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.div.Tensor %536, %538 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %540 = torch.aten.mul.Tensor %28, %539 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.add.Tensor %540, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.transpose.int %6, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %543 = torch.aten.view %541, %69 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %544 = torch.aten.mm %543, %542 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %545 = torch.aten.mul.Scalar %22, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %546 = torch.aten.add.Tensor %545, %544, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %547 = torch.aten.view %546, %157 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %548 = torch.aten.gelu %547, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %549 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %550 = torch.aten.view %548, %161 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %551 = torch.aten.mm %550, %549 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %552 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %553 = torch.aten.add.Tensor %552, %551, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %554 = torch.aten.view %553, %137 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %555 = torch.aten.add.Tensor %554, %541, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %556 = torch.aten.sum.dim_IntList %555, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %557 = torch.aten.div.Scalar %556, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %558 = torch.aten.sub.Tensor %555, %557, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %559 = torch.aten.pow.Tensor_Scalar %558, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %560 = torch.aten.sum.dim_IntList %559, %43, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %561 = torch.aten.div.Scalar %560, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %562 = torch.aten.sub.Tensor %555, %557, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %563 = torch.aten.add.Scalar %561, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.sqrt %563 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.div.Tensor %562, %564 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %566 = torch.aten.mul.Tensor %28, %565 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.add.Tensor %566, %27, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.slice.Tensor %567, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.slice.Tensor %568, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %570 = torch.aten.squeeze.dim %569, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %571 = torch.aten.transpose.int %4, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %572 = torch.aten.mm %570, %571 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %573 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %574 = torch.aten.add.Tensor %573, %572, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %575 = torch.aten.gelu %574, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %576 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %577 = torch.aten.mm %575, %576 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %578 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %579 = torch.aten.add.Tensor %578, %577, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %579 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ef2fe0) {
  %8 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> !torch.vtensor<[32,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8e6af10)
    ** Replace : 'torch.vtensor.literal'(0x8ef2fe0)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8e6af10) {
      %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %8 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %10 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %23 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %26 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %28 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %29 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %34 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %35 = torch.aten.ones %34, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %36 = torch.aten.zeros %34, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %37 = torch.aten.slice.Tensor %33, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %38 = torch.aten.slice.Tensor %37, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %39 = torch.aten.embedding %32, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %40 = torch.aten.embedding %31, %38, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %41 = torch.aten.add.Tensor %39, %40, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %42 = torch.aten.embedding %30, %36, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %43 = torch.aten.add.Tensor %41, %42, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %44 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %45 = torch.aten.sum.dim_IntList %43, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %46 = torch.aten.div.Scalar %45, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %47 = torch.aten.sub.Tensor %43, %46, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.pow.Tensor_Scalar %47, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.sum.dim_IntList %48, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %50 = torch.aten.div.Scalar %49, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %51 = torch.aten.sub.Tensor %43, %46, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %52 = torch.aten.add.Scalar %50, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.sqrt %52 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.div.Tensor %51, %53 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.mul.Tensor %29, %54 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.add.Tensor %55, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.unsqueeze %35, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %58 = torch.aten.mul.Tensor %56, %57 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %59 = torch.aten.unsqueeze %35, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %60 = torch.aten.unsqueeze %59, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %61 = torch.aten.squeeze.dim %60, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %62 = torch.aten.unsqueeze %61, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %63 = torch.aten.mul.Tensor %60, %62 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %64 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %65 = torch.aten.to.dtype %64, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %66 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %67 = torch.aten.broadcast_to %65, %66 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %68 = torch.aten.copy %67, %63, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %69 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %70 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %71 = torch.aten.view %58, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %72 = torch.aten.mm %71, %69 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %73 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %74 = torch.aten.view %72, %73 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %75 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %76 = torch.aten.view %74, %75 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %77 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %78 = torch.aten.permute %76, %77 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %79 = torch.aten.slice.Tensor %78, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %80 = torch.aten.slice.Tensor %78, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %81 = torch.aten.slice.Tensor %78, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %82 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %83 = torch.aten.unsqueeze %82, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %84 = torch.aten.slice.Tensor %83, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %85 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %86 = torch.aten.view %84, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %87 = torch.aten.permute %86, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %88 = torch.aten.add.Tensor %79, %87, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %89 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %90 = torch.aten.unsqueeze %89, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %91 = torch.aten.slice.Tensor %90, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %92 = torch.aten.view %91, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %93 = torch.aten.permute %92, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %94 = torch.aten.add.Tensor %81, %93, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %95 = torch.aten.div.Scalar %88, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %96 = torch.aten.transpose.int %80, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %97 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %98 = torch.aten.broadcast_to %95, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %99 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %100 = torch.aten.view %98, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %101 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %102 = torch.aten.broadcast_to %96, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %103 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %104 = torch.aten.view %102, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %105 = torch.aten.bmm %100, %104 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %106 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %107 = torch.aten.view %105, %106 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %108 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %109 = torch.aten.to.dtype %108, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %110 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %111 = torch.aten.broadcast_to %109, %110 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %112 = torch.aten.copy %111, %68, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %113 = torch.aten.bitwise_not %112 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %114 = torch.aten.clone %26, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %115 = torch.aten.masked_fill.Tensor %107, %113, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %115, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %116 = torch.aten.sub.Tensor %115, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %117 = torch.aten.exp %116 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %118 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %119 = torch.aten.sum.dim_IntList %117, %118, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %120 = torch.aten.div.Tensor %117, %119 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %121 = torch.aten.masked_fill.Scalar %120, %113, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %122 = torch.aten.broadcast_to %121, %106 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %123 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %124 = torch.aten.view %122, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %125 = torch.aten.broadcast_to %94, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %126 = torch.aten.view %125, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %127 = torch.aten.bmm %124, %126 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %128 = torch.aten.view %127, %97 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %129 = torch.aten.permute %128, %77 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %130 = torch.aten.clone %129, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %131 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %132 = torch.aten.view %130, %131 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %133 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %134 = torch.aten.view %132, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %135 = torch.aten.mm %134, %133 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %136 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %137 = torch.aten.add.Tensor %136, %135, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %138 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %139 = torch.aten.view %137, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %140 = torch.aten.add.Tensor %139, %58, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %141 = torch.aten.sum.dim_IntList %140, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %142 = torch.aten.div.Scalar %141, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %143 = torch.aten.sub.Tensor %140, %142, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %144 = torch.aten.pow.Tensor_Scalar %143, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %145 = torch.aten.sum.dim_IntList %144, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %146 = torch.aten.div.Scalar %145, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %147 = torch.aten.sub.Tensor %140, %142, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %148 = torch.aten.add.Scalar %146, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.sqrt %148 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.div.Tensor %147, %149 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.mul.Tensor %29, %150 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.add.Tensor %151, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %153 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %154 = torch.aten.view %152, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %155 = torch.aten.mm %154, %153 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %156 = torch.aten.mul.Scalar %23, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %157 = torch.aten.add.Tensor %156, %155, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %158 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %159 = torch.aten.view %157, %158 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %160 = torch.aten.gelu %159, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %161 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %162 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %163 = torch.aten.view %160, %162 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %164 = torch.aten.mm %163, %161 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %165 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %166 = torch.aten.add.Tensor %165, %164, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %167 = torch.aten.view %166, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %168 = torch.aten.add.Tensor %167, %152, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %169 = torch.aten.sum.dim_IntList %168, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %170 = torch.aten.div.Scalar %169, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %171 = torch.aten.sub.Tensor %168, %170, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %172 = torch.aten.pow.Tensor_Scalar %171, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %173 = torch.aten.sum.dim_IntList %172, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %174 = torch.aten.div.Scalar %173, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %175 = torch.aten.sub.Tensor %168, %170, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %176 = torch.aten.add.Scalar %174, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.sqrt %176 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.div.Tensor %175, %177 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.mul.Tensor %29, %178 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.add.Tensor %179, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %181 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %182 = torch.aten.view %180, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %183 = torch.aten.mm %182, %181 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %184 = torch.aten.view %183, %73 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %185 = torch.aten.view %184, %75 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %186 = torch.aten.permute %185, %77 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %187 = torch.aten.slice.Tensor %186, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %188 = torch.aten.slice.Tensor %186, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %189 = torch.aten.slice.Tensor %186, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %190 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %191 = torch.aten.unsqueeze %190, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %192 = torch.aten.slice.Tensor %191, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %193 = torch.aten.view %192, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %194 = torch.aten.permute %193, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %195 = torch.aten.add.Tensor %187, %194, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %196 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %197 = torch.aten.unsqueeze %196, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %198 = torch.aten.slice.Tensor %197, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %199 = torch.aten.view %198, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %200 = torch.aten.permute %199, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %201 = torch.aten.add.Tensor %189, %200, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %202 = torch.aten.div.Scalar %195, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %203 = torch.aten.transpose.int %188, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %204 = torch.aten.broadcast_to %202, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %205 = torch.aten.view %204, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %206 = torch.aten.broadcast_to %203, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %207 = torch.aten.view %206, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %208 = torch.aten.bmm %205, %207 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %209 = torch.aten.view %208, %106 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %210 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %211 = torch.aten.to.dtype %210, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %212 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %213 = torch.aten.broadcast_to %211, %212 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %214 = torch.aten.copy %213, %68, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %215 = torch.aten.bitwise_not %214 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %216 = torch.aten.clone %26, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %217 = torch.aten.masked_fill.Tensor %209, %215, %216 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %217, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %218 = torch.aten.sub.Tensor %217, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %219 = torch.aten.exp %218 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %220 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %221 = torch.aten.sum.dim_IntList %219, %220, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %222 = torch.aten.div.Tensor %219, %221 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %223 = torch.aten.masked_fill.Scalar %222, %215, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %224 = torch.aten.broadcast_to %223, %106 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %225 = torch.aten.view %224, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %226 = torch.aten.broadcast_to %201, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %227 = torch.aten.view %226, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %228 = torch.aten.bmm %225, %227 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %229 = torch.aten.view %228, %97 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %230 = torch.aten.permute %229, %77 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %231 = torch.aten.clone %230, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %232 = torch.aten.view %231, %131 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %233 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %234 = torch.aten.view %232, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %235 = torch.aten.mm %234, %233 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %236 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %237 = torch.aten.add.Tensor %236, %235, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %238 = torch.aten.view %237, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %239 = torch.aten.add.Tensor %238, %180, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %240 = torch.aten.sum.dim_IntList %239, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %241 = torch.aten.div.Scalar %240, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %242 = torch.aten.sub.Tensor %239, %241, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %243 = torch.aten.pow.Tensor_Scalar %242, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %244 = torch.aten.sum.dim_IntList %243, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %245 = torch.aten.div.Scalar %244, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %246 = torch.aten.sub.Tensor %239, %241, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %247 = torch.aten.add.Scalar %245, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.sqrt %247 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.div.Tensor %246, %248 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.mul.Tensor %29, %249 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.add.Tensor %250, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %252 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %253 = torch.aten.view %251, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %254 = torch.aten.mm %253, %252 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %255 = torch.aten.mul.Scalar %23, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %256 = torch.aten.add.Tensor %255, %254, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %257 = torch.aten.view %256, %158 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %258 = torch.aten.gelu %257, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %259 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %260 = torch.aten.view %258, %162 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %261 = torch.aten.mm %260, %259 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %262 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %263 = torch.aten.add.Tensor %262, %261, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %264 = torch.aten.view %263, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %265 = torch.aten.add.Tensor %264, %251, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %266 = torch.aten.sum.dim_IntList %265, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %267 = torch.aten.div.Scalar %266, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %268 = torch.aten.sub.Tensor %265, %267, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %269 = torch.aten.pow.Tensor_Scalar %268, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %270 = torch.aten.sum.dim_IntList %269, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %271 = torch.aten.div.Scalar %270, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %272 = torch.aten.sub.Tensor %265, %267, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %273 = torch.aten.add.Scalar %271, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.sqrt %273 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.div.Tensor %272, %274 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.mul.Tensor %29, %275 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.add.Tensor %276, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %278 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %279 = torch.aten.view %277, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %280 = torch.aten.mm %279, %278 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %281 = torch.aten.view %280, %73 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %282 = torch.aten.view %281, %75 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %283 = torch.aten.permute %282, %77 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %284 = torch.aten.slice.Tensor %283, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %285 = torch.aten.slice.Tensor %283, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %286 = torch.aten.slice.Tensor %283, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %287 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %288 = torch.aten.unsqueeze %287, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %289 = torch.aten.slice.Tensor %288, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %290 = torch.aten.view %289, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %291 = torch.aten.permute %290, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %292 = torch.aten.add.Tensor %284, %291, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %293 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %294 = torch.aten.unsqueeze %293, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %295 = torch.aten.slice.Tensor %294, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %296 = torch.aten.view %295, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %297 = torch.aten.permute %296, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %298 = torch.aten.add.Tensor %286, %297, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %299 = torch.aten.div.Scalar %292, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %300 = torch.aten.transpose.int %285, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %301 = torch.aten.broadcast_to %299, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %302 = torch.aten.view %301, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %303 = torch.aten.broadcast_to %300, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %304 = torch.aten.view %303, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %305 = torch.aten.bmm %302, %304 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %306 = torch.aten.view %305, %106 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %307 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %308 = torch.aten.to.dtype %307, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %309 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %310 = torch.aten.broadcast_to %308, %309 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %311 = torch.aten.copy %310, %68, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %312 = torch.aten.bitwise_not %311 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %313 = torch.aten.clone %26, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %314 = torch.aten.masked_fill.Tensor %306, %312, %313 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %314, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %315 = torch.aten.sub.Tensor %314, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %316 = torch.aten.exp %315 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %317 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %318 = torch.aten.sum.dim_IntList %316, %317, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %319 = torch.aten.div.Tensor %316, %318 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %320 = torch.aten.masked_fill.Scalar %319, %312, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %321 = torch.aten.broadcast_to %320, %106 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %322 = torch.aten.view %321, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %323 = torch.aten.broadcast_to %298, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %324 = torch.aten.view %323, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %325 = torch.aten.bmm %322, %324 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %326 = torch.aten.view %325, %97 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %327 = torch.aten.permute %326, %77 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %328 = torch.aten.clone %327, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %329 = torch.aten.view %328, %131 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %330 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %331 = torch.aten.view %329, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %332 = torch.aten.mm %331, %330 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %333 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %334 = torch.aten.add.Tensor %333, %332, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %335 = torch.aten.view %334, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %336 = torch.aten.add.Tensor %335, %277, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %337 = torch.aten.sum.dim_IntList %336, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %338 = torch.aten.div.Scalar %337, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %339 = torch.aten.sub.Tensor %336, %338, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %340 = torch.aten.pow.Tensor_Scalar %339, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %341 = torch.aten.sum.dim_IntList %340, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %342 = torch.aten.div.Scalar %341, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %343 = torch.aten.sub.Tensor %336, %338, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %344 = torch.aten.add.Scalar %342, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.sqrt %344 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.div.Tensor %343, %345 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.mul.Tensor %29, %346 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.add.Tensor %347, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %349 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %350 = torch.aten.view %348, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %351 = torch.aten.mm %350, %349 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %352 = torch.aten.mul.Scalar %23, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %353 = torch.aten.add.Tensor %352, %351, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %354 = torch.aten.view %353, %158 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %355 = torch.aten.gelu %354, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %356 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %357 = torch.aten.view %355, %162 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %358 = torch.aten.mm %357, %356 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %359 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %360 = torch.aten.add.Tensor %359, %358, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %361 = torch.aten.view %360, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %362 = torch.aten.add.Tensor %361, %348, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %363 = torch.aten.sum.dim_IntList %362, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %364 = torch.aten.div.Scalar %363, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %365 = torch.aten.sub.Tensor %362, %364, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %366 = torch.aten.pow.Tensor_Scalar %365, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %367 = torch.aten.sum.dim_IntList %366, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %368 = torch.aten.div.Scalar %367, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %369 = torch.aten.sub.Tensor %362, %364, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %370 = torch.aten.add.Scalar %368, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.sqrt %370 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.div.Tensor %369, %371 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.mul.Tensor %29, %372 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.add.Tensor %373, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %375 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %376 = torch.aten.view %374, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %377 = torch.aten.mm %376, %375 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %378 = torch.aten.view %377, %73 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %379 = torch.aten.view %378, %75 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %380 = torch.aten.permute %379, %77 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %381 = torch.aten.slice.Tensor %380, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %382 = torch.aten.slice.Tensor %380, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %383 = torch.aten.slice.Tensor %380, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %384 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %385 = torch.aten.unsqueeze %384, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %386 = torch.aten.slice.Tensor %385, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %387 = torch.aten.view %386, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %388 = torch.aten.permute %387, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %389 = torch.aten.add.Tensor %381, %388, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %390 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %391 = torch.aten.unsqueeze %390, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %392 = torch.aten.slice.Tensor %391, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %393 = torch.aten.view %392, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %394 = torch.aten.permute %393, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %395 = torch.aten.add.Tensor %383, %394, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %396 = torch.aten.div.Scalar %389, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %397 = torch.aten.transpose.int %382, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %398 = torch.aten.broadcast_to %396, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %399 = torch.aten.view %398, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %400 = torch.aten.broadcast_to %397, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %401 = torch.aten.view %400, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %402 = torch.aten.bmm %399, %401 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %403 = torch.aten.view %402, %106 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %404 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %405 = torch.aten.to.dtype %404, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %406 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %407 = torch.aten.broadcast_to %405, %406 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %408 = torch.aten.copy %407, %68, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %409 = torch.aten.bitwise_not %408 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %410 = torch.aten.clone %26, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %411 = torch.aten.masked_fill.Tensor %403, %409, %410 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %411, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %412 = torch.aten.sub.Tensor %411, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %413 = torch.aten.exp %412 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %414 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %415 = torch.aten.sum.dim_IntList %413, %414, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %416 = torch.aten.div.Tensor %413, %415 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %417 = torch.aten.masked_fill.Scalar %416, %409, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %418 = torch.aten.broadcast_to %417, %106 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %419 = torch.aten.view %418, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %420 = torch.aten.broadcast_to %395, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %421 = torch.aten.view %420, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %422 = torch.aten.bmm %419, %421 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %423 = torch.aten.view %422, %97 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %424 = torch.aten.permute %423, %77 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %425 = torch.aten.clone %424, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %426 = torch.aten.view %425, %131 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %427 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %428 = torch.aten.view %426, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %429 = torch.aten.mm %428, %427 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %430 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %431 = torch.aten.add.Tensor %430, %429, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %432 = torch.aten.view %431, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %433 = torch.aten.add.Tensor %432, %374, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %434 = torch.aten.sum.dim_IntList %433, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %435 = torch.aten.div.Scalar %434, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %436 = torch.aten.sub.Tensor %433, %435, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %437 = torch.aten.pow.Tensor_Scalar %436, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %438 = torch.aten.sum.dim_IntList %437, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %439 = torch.aten.div.Scalar %438, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %440 = torch.aten.sub.Tensor %433, %435, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %441 = torch.aten.add.Scalar %439, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.sqrt %441 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.div.Tensor %440, %442 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.mul.Tensor %29, %443 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.add.Tensor %444, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %446 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %447 = torch.aten.view %445, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %448 = torch.aten.mm %447, %446 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %449 = torch.aten.mul.Scalar %23, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %450 = torch.aten.add.Tensor %449, %448, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %451 = torch.aten.view %450, %158 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %452 = torch.aten.gelu %451, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %453 = torch.aten.transpose.int %10, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %454 = torch.aten.view %452, %162 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %455 = torch.aten.mm %454, %453 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %456 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %457 = torch.aten.add.Tensor %456, %455, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %458 = torch.aten.view %457, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %459 = torch.aten.add.Tensor %458, %445, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %460 = torch.aten.sum.dim_IntList %459, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %461 = torch.aten.div.Scalar %460, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %462 = torch.aten.sub.Tensor %459, %461, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %463 = torch.aten.pow.Tensor_Scalar %462, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %464 = torch.aten.sum.dim_IntList %463, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %465 = torch.aten.div.Scalar %464, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %466 = torch.aten.sub.Tensor %459, %461, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %467 = torch.aten.add.Scalar %465, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.sqrt %467 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.div.Tensor %466, %468 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.mul.Tensor %29, %469 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.add.Tensor %470, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %472 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %473 = torch.aten.view %471, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %474 = torch.aten.mm %473, %472 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %475 = torch.aten.view %474, %73 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %476 = torch.aten.view %475, %75 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %477 = torch.aten.permute %476, %77 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %478 = torch.aten.slice.Tensor %477, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %479 = torch.aten.slice.Tensor %477, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %480 = torch.aten.slice.Tensor %477, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %481 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %482 = torch.aten.unsqueeze %481, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %483 = torch.aten.slice.Tensor %482, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %484 = torch.aten.view %483, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %485 = torch.aten.permute %484, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %486 = torch.aten.add.Tensor %478, %485, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %487 = torch.aten.unsqueeze %28, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %488 = torch.aten.unsqueeze %487, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %489 = torch.aten.slice.Tensor %488, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %490 = torch.aten.view %489, %85 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %491 = torch.aten.permute %490, %77 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %492 = torch.aten.add.Tensor %480, %491, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %493 = torch.aten.div.Scalar %486, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %494 = torch.aten.transpose.int %479, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %495 = torch.aten.broadcast_to %493, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %496 = torch.aten.view %495, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %497 = torch.aten.broadcast_to %494, %101 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %498 = torch.aten.view %497, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %499 = torch.aten.bmm %496, %498 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %500 = torch.aten.view %499, %106 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %501 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %502 = torch.aten.to.dtype %501, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %503 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %504 = torch.aten.broadcast_to %502, %503 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %505 = torch.aten.copy %504, %68, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %506 = torch.aten.bitwise_not %505 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %507 = torch.aten.clone %26, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %508 = torch.aten.masked_fill.Tensor %500, %506, %507 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %508, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %509 = torch.aten.sub.Tensor %508, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %510 = torch.aten.exp %509 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %511 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %512 = torch.aten.sum.dim_IntList %510, %511, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %513 = torch.aten.div.Tensor %510, %512 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %514 = torch.aten.masked_fill.Scalar %513, %506, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %515 = torch.aten.broadcast_to %514, %106 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %516 = torch.aten.view %515, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %517 = torch.aten.broadcast_to %492, %97 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %518 = torch.aten.view %517, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %519 = torch.aten.bmm %516, %518 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %520 = torch.aten.view %519, %97 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %521 = torch.aten.permute %520, %77 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %522 = torch.aten.clone %521, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %523 = torch.aten.view %522, %131 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %524 = torch.aten.transpose.int %8, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %525 = torch.aten.view %523, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %526 = torch.aten.mm %525, %524 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %527 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %528 = torch.aten.add.Tensor %527, %526, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %529 = torch.aten.view %528, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %530 = torch.aten.add.Tensor %529, %471, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %531 = torch.aten.sum.dim_IntList %530, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %532 = torch.aten.div.Scalar %531, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %533 = torch.aten.sub.Tensor %530, %532, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %534 = torch.aten.pow.Tensor_Scalar %533, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %535 = torch.aten.sum.dim_IntList %534, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %536 = torch.aten.div.Scalar %535, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %537 = torch.aten.sub.Tensor %530, %532, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %538 = torch.aten.add.Scalar %536, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.sqrt %538 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.div.Tensor %537, %539 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.mul.Tensor %29, %540 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.add.Tensor %541, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %543 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %544 = torch.aten.view %542, %70 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %545 = torch.aten.mm %544, %543 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %546 = torch.aten.mul.Scalar %23, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %547 = torch.aten.add.Tensor %546, %545, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %548 = torch.aten.view %547, %158 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %549 = torch.aten.gelu %548, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %550 = torch.aten.transpose.int %6, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %551 = torch.aten.view %549, %162 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %552 = torch.aten.mm %551, %550 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %553 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %554 = torch.aten.add.Tensor %553, %552, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %555 = torch.aten.view %554, %138 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %556 = torch.aten.add.Tensor %555, %542, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %557 = torch.aten.sum.dim_IntList %556, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %558 = torch.aten.div.Scalar %557, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %559 = torch.aten.sub.Tensor %556, %558, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %560 = torch.aten.pow.Tensor_Scalar %559, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %561 = torch.aten.sum.dim_IntList %560, %44, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %562 = torch.aten.div.Scalar %561, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %563 = torch.aten.sub.Tensor %556, %558, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %564 = torch.aten.add.Scalar %562, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.sqrt %564 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.div.Tensor %563, %565 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.mul.Tensor %29, %566 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.add.Tensor %567, %28, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.slice.Tensor %568, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %570 = torch.aten.slice.Tensor %569, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %571 = torch.aten.squeeze.dim %570, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %572 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %573 = torch.aten.mm %571, %572 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %574 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %575 = torch.aten.add.Tensor %574, %573, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %576 = torch.aten.gelu %575, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %577 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %578 = torch.aten.mm %576, %577 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %579 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %580 = torch.aten.add.Tensor %579, %578, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %580 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ef5a80) {
  %10 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> !torch.vtensor<[32,37],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8e9b190)
    ** Replace : 'torch.vtensor.literal'(0x8ef5a80)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8e9b190) {
      %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %10 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %24 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %27 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %29 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %30 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %35 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %36 = torch.aten.ones %35, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %37 = torch.aten.zeros %35, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %38 = torch.aten.slice.Tensor %34, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %39 = torch.aten.slice.Tensor %38, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %40 = torch.aten.embedding %33, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %41 = torch.aten.embedding %32, %39, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %42 = torch.aten.add.Tensor %40, %41, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %43 = torch.aten.embedding %31, %37, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %44 = torch.aten.add.Tensor %42, %43, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %45 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %46 = torch.aten.sum.dim_IntList %44, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %47 = torch.aten.div.Scalar %46, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %48 = torch.aten.sub.Tensor %44, %47, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.pow.Tensor_Scalar %48, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.sum.dim_IntList %49, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %51 = torch.aten.div.Scalar %50, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.sub.Tensor %44, %47, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %53 = torch.aten.add.Scalar %51, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.sqrt %53 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.div.Tensor %52, %54 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.mul.Tensor %30, %55 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.add.Tensor %56, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.unsqueeze %36, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %59 = torch.aten.mul.Tensor %57, %58 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %60 = torch.aten.unsqueeze %36, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %61 = torch.aten.unsqueeze %60, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %62 = torch.aten.squeeze.dim %61, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %63 = torch.aten.unsqueeze %62, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %64 = torch.aten.mul.Tensor %61, %63 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %65 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %66 = torch.aten.to.dtype %65, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %67 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %68 = torch.aten.broadcast_to %66, %67 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %69 = torch.aten.copy %68, %64, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %70 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %71 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %72 = torch.aten.view %59, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %73 = torch.aten.mm %72, %70 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %74 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %75 = torch.aten.view %73, %74 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %76 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %77 = torch.aten.view %75, %76 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %78 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %79 = torch.aten.permute %77, %78 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %80 = torch.aten.slice.Tensor %79, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %81 = torch.aten.slice.Tensor %79, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %82 = torch.aten.slice.Tensor %79, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %83 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %84 = torch.aten.unsqueeze %83, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %85 = torch.aten.slice.Tensor %84, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %86 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %87 = torch.aten.view %85, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %88 = torch.aten.permute %87, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %89 = torch.aten.add.Tensor %80, %88, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %90 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %91 = torch.aten.unsqueeze %90, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %92 = torch.aten.slice.Tensor %91, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %93 = torch.aten.view %92, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %94 = torch.aten.permute %93, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %95 = torch.aten.add.Tensor %82, %94, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %96 = torch.aten.div.Scalar %89, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %97 = torch.aten.transpose.int %81, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %98 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %99 = torch.aten.broadcast_to %96, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %100 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %101 = torch.aten.view %99, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %102 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %103 = torch.aten.broadcast_to %97, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %104 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %105 = torch.aten.view %103, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %106 = torch.aten.bmm %101, %105 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %107 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %108 = torch.aten.view %106, %107 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %109 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %110 = torch.aten.to.dtype %109, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %111 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %112 = torch.aten.broadcast_to %110, %111 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %113 = torch.aten.copy %112, %69, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %114 = torch.aten.bitwise_not %113 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %115 = torch.aten.clone %27, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %116 = torch.aten.masked_fill.Tensor %108, %114, %115 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %116, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %117 = torch.aten.sub.Tensor %116, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %118 = torch.aten.exp %117 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %119 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %120 = torch.aten.sum.dim_IntList %118, %119, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %121 = torch.aten.div.Tensor %118, %120 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %122 = torch.aten.masked_fill.Scalar %121, %114, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %123 = torch.aten.broadcast_to %122, %107 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %124 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %125 = torch.aten.view %123, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %126 = torch.aten.broadcast_to %95, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %127 = torch.aten.view %126, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %128 = torch.aten.bmm %125, %127 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %129 = torch.aten.view %128, %98 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %130 = torch.aten.permute %129, %78 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %131 = torch.aten.clone %130, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %132 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %133 = torch.aten.view %131, %132 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %134 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %135 = torch.aten.view %133, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %136 = torch.aten.mm %135, %134 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %137 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %138 = torch.aten.add.Tensor %137, %136, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %139 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %140 = torch.aten.view %138, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %141 = torch.aten.add.Tensor %140, %59, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %142 = torch.aten.sum.dim_IntList %141, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %143 = torch.aten.div.Scalar %142, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %144 = torch.aten.sub.Tensor %141, %143, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %145 = torch.aten.pow.Tensor_Scalar %144, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %146 = torch.aten.sum.dim_IntList %145, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %147 = torch.aten.div.Scalar %146, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.sub.Tensor %141, %143, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %149 = torch.aten.add.Scalar %147, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.sqrt %149 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %151 = torch.aten.div.Tensor %148, %150 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.mul.Tensor %30, %151 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %153 = torch.aten.add.Tensor %152, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %154 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %155 = torch.aten.view %153, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %156 = torch.aten.mm %155, %154 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %157 = torch.aten.mul.Scalar %24, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %158 = torch.aten.add.Tensor %157, %156, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %159 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %160 = torch.aten.view %158, %159 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %161 = torch.aten.gelu %160, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %162 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %163 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %164 = torch.aten.view %161, %163 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %165 = torch.aten.mm %164, %162 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %166 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %167 = torch.aten.add.Tensor %166, %165, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %168 = torch.aten.view %167, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %169 = torch.aten.add.Tensor %168, %153, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %170 = torch.aten.sum.dim_IntList %169, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %171 = torch.aten.div.Scalar %170, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %172 = torch.aten.sub.Tensor %169, %171, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %173 = torch.aten.pow.Tensor_Scalar %172, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %174 = torch.aten.sum.dim_IntList %173, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %175 = torch.aten.div.Scalar %174, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.sub.Tensor %169, %171, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %177 = torch.aten.add.Scalar %175, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.sqrt %177 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %179 = torch.aten.div.Tensor %176, %178 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.mul.Tensor %30, %179 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %181 = torch.aten.add.Tensor %180, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %182 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %183 = torch.aten.view %181, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %184 = torch.aten.mm %183, %182 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %185 = torch.aten.view %184, %74 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %186 = torch.aten.view %185, %76 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %187 = torch.aten.permute %186, %78 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %188 = torch.aten.slice.Tensor %187, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %189 = torch.aten.slice.Tensor %187, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %190 = torch.aten.slice.Tensor %187, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %191 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %192 = torch.aten.unsqueeze %191, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %193 = torch.aten.slice.Tensor %192, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %194 = torch.aten.view %193, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %195 = torch.aten.permute %194, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %196 = torch.aten.add.Tensor %188, %195, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %197 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %198 = torch.aten.unsqueeze %197, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %199 = torch.aten.slice.Tensor %198, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %200 = torch.aten.view %199, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %201 = torch.aten.permute %200, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %202 = torch.aten.add.Tensor %190, %201, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %203 = torch.aten.div.Scalar %196, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %204 = torch.aten.transpose.int %189, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %205 = torch.aten.broadcast_to %203, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %206 = torch.aten.view %205, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %207 = torch.aten.broadcast_to %204, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %208 = torch.aten.view %207, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %209 = torch.aten.bmm %206, %208 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %210 = torch.aten.view %209, %107 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %211 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %212 = torch.aten.to.dtype %211, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %213 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %214 = torch.aten.broadcast_to %212, %213 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %215 = torch.aten.copy %214, %69, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %216 = torch.aten.bitwise_not %215 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %217 = torch.aten.clone %27, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %218 = torch.aten.masked_fill.Tensor %210, %216, %217 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %218, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %219 = torch.aten.sub.Tensor %218, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %220 = torch.aten.exp %219 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %221 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %222 = torch.aten.sum.dim_IntList %220, %221, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %223 = torch.aten.div.Tensor %220, %222 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %224 = torch.aten.masked_fill.Scalar %223, %216, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %225 = torch.aten.broadcast_to %224, %107 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %226 = torch.aten.view %225, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %227 = torch.aten.broadcast_to %202, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %228 = torch.aten.view %227, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %229 = torch.aten.bmm %226, %228 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %230 = torch.aten.view %229, %98 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %231 = torch.aten.permute %230, %78 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %232 = torch.aten.clone %231, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %233 = torch.aten.view %232, %132 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %234 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %235 = torch.aten.view %233, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %236 = torch.aten.mm %235, %234 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %237 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %238 = torch.aten.add.Tensor %237, %236, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %239 = torch.aten.view %238, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %240 = torch.aten.add.Tensor %239, %181, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %241 = torch.aten.sum.dim_IntList %240, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %242 = torch.aten.div.Scalar %241, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %243 = torch.aten.sub.Tensor %240, %242, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %244 = torch.aten.pow.Tensor_Scalar %243, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %245 = torch.aten.sum.dim_IntList %244, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %246 = torch.aten.div.Scalar %245, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.sub.Tensor %240, %242, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %248 = torch.aten.add.Scalar %246, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.sqrt %248 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %250 = torch.aten.div.Tensor %247, %249 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.mul.Tensor %30, %250 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %252 = torch.aten.add.Tensor %251, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %253 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %254 = torch.aten.view %252, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %255 = torch.aten.mm %254, %253 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %256 = torch.aten.mul.Scalar %24, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %257 = torch.aten.add.Tensor %256, %255, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %258 = torch.aten.view %257, %159 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %259 = torch.aten.gelu %258, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %260 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %261 = torch.aten.view %259, %163 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %262 = torch.aten.mm %261, %260 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %263 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %264 = torch.aten.add.Tensor %263, %262, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %265 = torch.aten.view %264, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %266 = torch.aten.add.Tensor %265, %252, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %267 = torch.aten.sum.dim_IntList %266, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %268 = torch.aten.div.Scalar %267, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %269 = torch.aten.sub.Tensor %266, %268, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %270 = torch.aten.pow.Tensor_Scalar %269, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %271 = torch.aten.sum.dim_IntList %270, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %272 = torch.aten.div.Scalar %271, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.sub.Tensor %266, %268, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %274 = torch.aten.add.Scalar %272, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.sqrt %274 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %276 = torch.aten.div.Tensor %273, %275 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.mul.Tensor %30, %276 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %278 = torch.aten.add.Tensor %277, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %279 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %280 = torch.aten.view %278, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %281 = torch.aten.mm %280, %279 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %282 = torch.aten.view %281, %74 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %283 = torch.aten.view %282, %76 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %284 = torch.aten.permute %283, %78 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %285 = torch.aten.slice.Tensor %284, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %286 = torch.aten.slice.Tensor %284, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %287 = torch.aten.slice.Tensor %284, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %288 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %289 = torch.aten.unsqueeze %288, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %290 = torch.aten.slice.Tensor %289, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %291 = torch.aten.view %290, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %292 = torch.aten.permute %291, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %293 = torch.aten.add.Tensor %285, %292, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %294 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %295 = torch.aten.unsqueeze %294, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %296 = torch.aten.slice.Tensor %295, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %297 = torch.aten.view %296, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %298 = torch.aten.permute %297, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %299 = torch.aten.add.Tensor %287, %298, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %300 = torch.aten.div.Scalar %293, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %301 = torch.aten.transpose.int %286, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %302 = torch.aten.broadcast_to %300, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %303 = torch.aten.view %302, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %304 = torch.aten.broadcast_to %301, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %305 = torch.aten.view %304, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %306 = torch.aten.bmm %303, %305 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %307 = torch.aten.view %306, %107 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %308 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %309 = torch.aten.to.dtype %308, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %310 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %311 = torch.aten.broadcast_to %309, %310 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %312 = torch.aten.copy %311, %69, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %313 = torch.aten.bitwise_not %312 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %314 = torch.aten.clone %27, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %315 = torch.aten.masked_fill.Tensor %307, %313, %314 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %315, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %316 = torch.aten.sub.Tensor %315, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %317 = torch.aten.exp %316 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %318 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %319 = torch.aten.sum.dim_IntList %317, %318, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %320 = torch.aten.div.Tensor %317, %319 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %321 = torch.aten.masked_fill.Scalar %320, %313, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %322 = torch.aten.broadcast_to %321, %107 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %323 = torch.aten.view %322, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %324 = torch.aten.broadcast_to %299, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %325 = torch.aten.view %324, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %326 = torch.aten.bmm %323, %325 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %327 = torch.aten.view %326, %98 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %328 = torch.aten.permute %327, %78 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %329 = torch.aten.clone %328, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %330 = torch.aten.view %329, %132 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %331 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %332 = torch.aten.view %330, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %333 = torch.aten.mm %332, %331 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %334 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %335 = torch.aten.add.Tensor %334, %333, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %336 = torch.aten.view %335, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %337 = torch.aten.add.Tensor %336, %278, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %338 = torch.aten.sum.dim_IntList %337, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %339 = torch.aten.div.Scalar %338, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %340 = torch.aten.sub.Tensor %337, %339, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %341 = torch.aten.pow.Tensor_Scalar %340, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %342 = torch.aten.sum.dim_IntList %341, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %343 = torch.aten.div.Scalar %342, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.sub.Tensor %337, %339, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %345 = torch.aten.add.Scalar %343, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.sqrt %345 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %347 = torch.aten.div.Tensor %344, %346 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.mul.Tensor %30, %347 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %349 = torch.aten.add.Tensor %348, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %350 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %351 = torch.aten.view %349, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %352 = torch.aten.mm %351, %350 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %353 = torch.aten.mul.Scalar %24, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %354 = torch.aten.add.Tensor %353, %352, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %355 = torch.aten.view %354, %159 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %356 = torch.aten.gelu %355, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %357 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %358 = torch.aten.view %356, %163 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %359 = torch.aten.mm %358, %357 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %360 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %361 = torch.aten.add.Tensor %360, %359, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %362 = torch.aten.view %361, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %363 = torch.aten.add.Tensor %362, %349, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %364 = torch.aten.sum.dim_IntList %363, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %365 = torch.aten.div.Scalar %364, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %366 = torch.aten.sub.Tensor %363, %365, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %367 = torch.aten.pow.Tensor_Scalar %366, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %368 = torch.aten.sum.dim_IntList %367, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %369 = torch.aten.div.Scalar %368, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.sub.Tensor %363, %365, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %371 = torch.aten.add.Scalar %369, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.sqrt %371 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %373 = torch.aten.div.Tensor %370, %372 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.mul.Tensor %30, %373 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %375 = torch.aten.add.Tensor %374, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %376 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %377 = torch.aten.view %375, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %378 = torch.aten.mm %377, %376 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %379 = torch.aten.view %378, %74 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %380 = torch.aten.view %379, %76 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %381 = torch.aten.permute %380, %78 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %382 = torch.aten.slice.Tensor %381, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %383 = torch.aten.slice.Tensor %381, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %384 = torch.aten.slice.Tensor %381, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %385 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %386 = torch.aten.unsqueeze %385, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %387 = torch.aten.slice.Tensor %386, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %388 = torch.aten.view %387, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %389 = torch.aten.permute %388, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %390 = torch.aten.add.Tensor %382, %389, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %391 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %392 = torch.aten.unsqueeze %391, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %393 = torch.aten.slice.Tensor %392, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %394 = torch.aten.view %393, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %395 = torch.aten.permute %394, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %396 = torch.aten.add.Tensor %384, %395, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %397 = torch.aten.div.Scalar %390, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %398 = torch.aten.transpose.int %383, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %399 = torch.aten.broadcast_to %397, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %400 = torch.aten.view %399, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %401 = torch.aten.broadcast_to %398, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %402 = torch.aten.view %401, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %403 = torch.aten.bmm %400, %402 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %404 = torch.aten.view %403, %107 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %405 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %406 = torch.aten.to.dtype %405, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %407 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %408 = torch.aten.broadcast_to %406, %407 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %409 = torch.aten.copy %408, %69, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %410 = torch.aten.bitwise_not %409 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %411 = torch.aten.clone %27, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %412 = torch.aten.masked_fill.Tensor %404, %410, %411 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %412, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %413 = torch.aten.sub.Tensor %412, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %414 = torch.aten.exp %413 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %415 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %416 = torch.aten.sum.dim_IntList %414, %415, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %417 = torch.aten.div.Tensor %414, %416 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %418 = torch.aten.masked_fill.Scalar %417, %410, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %419 = torch.aten.broadcast_to %418, %107 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %420 = torch.aten.view %419, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %421 = torch.aten.broadcast_to %396, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %422 = torch.aten.view %421, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %423 = torch.aten.bmm %420, %422 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %424 = torch.aten.view %423, %98 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %425 = torch.aten.permute %424, %78 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %426 = torch.aten.clone %425, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %427 = torch.aten.view %426, %132 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %428 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %429 = torch.aten.view %427, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %430 = torch.aten.mm %429, %428 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %431 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %432 = torch.aten.add.Tensor %431, %430, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %433 = torch.aten.view %432, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %434 = torch.aten.add.Tensor %433, %375, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %435 = torch.aten.sum.dim_IntList %434, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %436 = torch.aten.div.Scalar %435, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %437 = torch.aten.sub.Tensor %434, %436, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %438 = torch.aten.pow.Tensor_Scalar %437, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %439 = torch.aten.sum.dim_IntList %438, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %440 = torch.aten.div.Scalar %439, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.sub.Tensor %434, %436, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %442 = torch.aten.add.Scalar %440, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.sqrt %442 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %444 = torch.aten.div.Tensor %441, %443 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.mul.Tensor %30, %444 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %446 = torch.aten.add.Tensor %445, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %447 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %448 = torch.aten.view %446, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %449 = torch.aten.mm %448, %447 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %450 = torch.aten.mul.Scalar %24, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %451 = torch.aten.add.Tensor %450, %449, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %452 = torch.aten.view %451, %159 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %453 = torch.aten.gelu %452, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %454 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %455 = torch.aten.view %453, %163 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %456 = torch.aten.mm %455, %454 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %457 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %458 = torch.aten.add.Tensor %457, %456, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %459 = torch.aten.view %458, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %460 = torch.aten.add.Tensor %459, %446, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %461 = torch.aten.sum.dim_IntList %460, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %462 = torch.aten.div.Scalar %461, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %463 = torch.aten.sub.Tensor %460, %462, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %464 = torch.aten.pow.Tensor_Scalar %463, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %465 = torch.aten.sum.dim_IntList %464, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %466 = torch.aten.div.Scalar %465, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.sub.Tensor %460, %462, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %468 = torch.aten.add.Scalar %466, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.sqrt %468 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %470 = torch.aten.div.Tensor %467, %469 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.mul.Tensor %30, %470 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %472 = torch.aten.add.Tensor %471, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %473 = torch.aten.transpose.int %10, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %474 = torch.aten.view %472, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %475 = torch.aten.mm %474, %473 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %476 = torch.aten.view %475, %74 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %477 = torch.aten.view %476, %76 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %478 = torch.aten.permute %477, %78 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %479 = torch.aten.slice.Tensor %478, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %480 = torch.aten.slice.Tensor %478, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %481 = torch.aten.slice.Tensor %478, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %482 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %483 = torch.aten.unsqueeze %482, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %484 = torch.aten.slice.Tensor %483, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %485 = torch.aten.view %484, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %486 = torch.aten.permute %485, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %487 = torch.aten.add.Tensor %479, %486, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %488 = torch.aten.unsqueeze %29, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %489 = torch.aten.unsqueeze %488, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %490 = torch.aten.slice.Tensor %489, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %491 = torch.aten.view %490, %86 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %492 = torch.aten.permute %491, %78 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %493 = torch.aten.add.Tensor %481, %492, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %494 = torch.aten.div.Scalar %487, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %495 = torch.aten.transpose.int %480, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %496 = torch.aten.broadcast_to %494, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %497 = torch.aten.view %496, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %498 = torch.aten.broadcast_to %495, %102 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %499 = torch.aten.view %498, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %500 = torch.aten.bmm %497, %499 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %501 = torch.aten.view %500, %107 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %502 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %503 = torch.aten.to.dtype %502, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %504 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %505 = torch.aten.broadcast_to %503, %504 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %506 = torch.aten.copy %505, %69, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %507 = torch.aten.bitwise_not %506 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %508 = torch.aten.clone %27, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %509 = torch.aten.masked_fill.Tensor %501, %507, %508 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %509, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %510 = torch.aten.sub.Tensor %509, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %511 = torch.aten.exp %510 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %512 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %513 = torch.aten.sum.dim_IntList %511, %512, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %514 = torch.aten.div.Tensor %511, %513 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %515 = torch.aten.masked_fill.Scalar %514, %507, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %516 = torch.aten.broadcast_to %515, %107 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %517 = torch.aten.view %516, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %518 = torch.aten.broadcast_to %493, %98 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %519 = torch.aten.view %518, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %520 = torch.aten.bmm %517, %519 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %521 = torch.aten.view %520, %98 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %522 = torch.aten.permute %521, %78 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %523 = torch.aten.clone %522, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %524 = torch.aten.view %523, %132 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %525 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %526 = torch.aten.view %524, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %527 = torch.aten.mm %526, %525 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %528 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %529 = torch.aten.add.Tensor %528, %527, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %530 = torch.aten.view %529, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %531 = torch.aten.add.Tensor %530, %472, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %532 = torch.aten.sum.dim_IntList %531, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %533 = torch.aten.div.Scalar %532, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %534 = torch.aten.sub.Tensor %531, %533, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %535 = torch.aten.pow.Tensor_Scalar %534, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %536 = torch.aten.sum.dim_IntList %535, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %537 = torch.aten.div.Scalar %536, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.sub.Tensor %531, %533, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %539 = torch.aten.add.Scalar %537, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.sqrt %539 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %541 = torch.aten.div.Tensor %538, %540 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.mul.Tensor %30, %541 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %543 = torch.aten.add.Tensor %542, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %544 = torch.aten.transpose.int %8, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %545 = torch.aten.view %543, %71 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %546 = torch.aten.mm %545, %544 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %547 = torch.aten.mul.Scalar %24, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %548 = torch.aten.add.Tensor %547, %546, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %549 = torch.aten.view %548, %159 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %550 = torch.aten.gelu %549, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %551 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %552 = torch.aten.view %550, %163 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %553 = torch.aten.mm %552, %551 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %554 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %555 = torch.aten.add.Tensor %554, %553, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %556 = torch.aten.view %555, %139 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %557 = torch.aten.add.Tensor %556, %543, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %558 = torch.aten.sum.dim_IntList %557, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %559 = torch.aten.div.Scalar %558, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %560 = torch.aten.sub.Tensor %557, %559, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %561 = torch.aten.pow.Tensor_Scalar %560, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %562 = torch.aten.sum.dim_IntList %561, %45, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %563 = torch.aten.div.Scalar %562, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.sub.Tensor %557, %559, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %565 = torch.aten.add.Scalar %563, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.sqrt %565 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %567 = torch.aten.div.Tensor %564, %566 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.mul.Tensor %30, %567 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.add.Tensor %568, %29, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %570 = torch.aten.slice.Tensor %569, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %571 = torch.aten.slice.Tensor %570, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %572 = torch.aten.squeeze.dim %571, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %573 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %574 = torch.aten.mm %572, %573 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %575 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %576 = torch.aten.add.Tensor %575, %574, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %577 = torch.aten.gelu %576, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %578 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %579 = torch.aten.mm %577, %578 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %580 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %581 = torch.aten.add.Tensor %580, %579, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %581 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ef9a10) {
  %12 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> !torch.vtensor<[37,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f2e840)
    ** Replace : 'torch.vtensor.literal'(0x8ef9a10)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f2e840) {
      %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %25 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %28 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %30 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %31 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %35 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %36 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %37 = torch.aten.ones %36, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %38 = torch.aten.zeros %36, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %39 = torch.aten.slice.Tensor %35, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %40 = torch.aten.slice.Tensor %39, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %41 = torch.aten.embedding %34, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %42 = torch.aten.embedding %33, %40, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %43 = torch.aten.add.Tensor %41, %42, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %44 = torch.aten.embedding %32, %38, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %45 = torch.aten.add.Tensor %43, %44, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %47 = torch.aten.sum.dim_IntList %45, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %48 = torch.aten.div.Scalar %47, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %49 = torch.aten.sub.Tensor %45, %48, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.pow.Tensor_Scalar %49, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.aten.sum.dim_IntList %50, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.div.Scalar %51, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.sub.Tensor %45, %48, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %54 = torch.aten.add.Scalar %52, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.sqrt %54 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.div.Tensor %53, %55 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.mul.Tensor %31, %56 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.add.Tensor %57, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %59 = torch.aten.unsqueeze %37, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %60 = torch.aten.mul.Tensor %58, %59 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %61 = torch.aten.unsqueeze %37, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %62 = torch.aten.unsqueeze %61, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %63 = torch.aten.squeeze.dim %62, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %64 = torch.aten.unsqueeze %63, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %65 = torch.aten.mul.Tensor %62, %64 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %66 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %67 = torch.aten.to.dtype %66, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %68 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %69 = torch.aten.broadcast_to %67, %68 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %70 = torch.aten.copy %69, %65, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %71 = torch.aten.transpose.int %29, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %72 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %73 = torch.aten.view %60, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %74 = torch.aten.mm %73, %71 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %75 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %76 = torch.aten.view %74, %75 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %77 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %78 = torch.aten.view %76, %77 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %79 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %80 = torch.aten.permute %78, %79 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %81 = torch.aten.slice.Tensor %80, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %82 = torch.aten.slice.Tensor %80, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %83 = torch.aten.slice.Tensor %80, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %84 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %85 = torch.aten.unsqueeze %84, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %86 = torch.aten.slice.Tensor %85, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %87 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %88 = torch.aten.view %86, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %89 = torch.aten.permute %88, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %90 = torch.aten.add.Tensor %81, %89, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %91 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %92 = torch.aten.unsqueeze %91, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %93 = torch.aten.slice.Tensor %92, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %94 = torch.aten.view %93, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %95 = torch.aten.permute %94, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %96 = torch.aten.add.Tensor %83, %95, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %97 = torch.aten.div.Scalar %90, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %98 = torch.aten.transpose.int %82, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %99 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %100 = torch.aten.broadcast_to %97, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %101 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %102 = torch.aten.view %100, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %103 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %104 = torch.aten.broadcast_to %98, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %105 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %106 = torch.aten.view %104, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %107 = torch.aten.bmm %102, %106 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %108 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %109 = torch.aten.view %107, %108 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %110 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %111 = torch.aten.to.dtype %110, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %112 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %113 = torch.aten.broadcast_to %111, %112 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %114 = torch.aten.copy %113, %70, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %115 = torch.aten.bitwise_not %114 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %116 = torch.aten.clone %28, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %117 = torch.aten.masked_fill.Tensor %109, %115, %116 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %117, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %118 = torch.aten.sub.Tensor %117, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %119 = torch.aten.exp %118 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %120 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %121 = torch.aten.sum.dim_IntList %119, %120, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %122 = torch.aten.div.Tensor %119, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %123 = torch.aten.masked_fill.Scalar %122, %115, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %124 = torch.aten.broadcast_to %123, %108 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %125 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %126 = torch.aten.view %124, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %127 = torch.aten.broadcast_to %96, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %128 = torch.aten.view %127, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %129 = torch.aten.bmm %126, %128 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %130 = torch.aten.view %129, %99 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %131 = torch.aten.permute %130, %79 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %132 = torch.aten.clone %131, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %133 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %134 = torch.aten.view %132, %133 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %135 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %136 = torch.aten.view %134, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %137 = torch.aten.mm %136, %135 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %138 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %139 = torch.aten.add.Tensor %138, %137, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %140 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %141 = torch.aten.view %139, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %142 = torch.aten.add.Tensor %141, %60, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %143 = torch.aten.sum.dim_IntList %142, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %144 = torch.aten.div.Scalar %143, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %145 = torch.aten.sub.Tensor %142, %144, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %146 = torch.aten.pow.Tensor_Scalar %145, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %147 = torch.aten.sum.dim_IntList %146, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.div.Scalar %147, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.sub.Tensor %142, %144, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %150 = torch.aten.add.Scalar %148, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %151 = torch.aten.sqrt %150 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %152 = torch.aten.div.Tensor %149, %151 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %153 = torch.aten.mul.Tensor %31, %152 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %154 = torch.aten.add.Tensor %153, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %155 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %156 = torch.aten.view %154, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %157 = torch.aten.mm %156, %155 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %158 = torch.aten.mul.Scalar %25, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %159 = torch.aten.add.Tensor %158, %157, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %160 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %161 = torch.aten.view %159, %160 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %162 = torch.aten.gelu %161, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %163 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %164 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %165 = torch.aten.view %162, %164 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %166 = torch.aten.mm %165, %163 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %167 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %168 = torch.aten.add.Tensor %167, %166, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %169 = torch.aten.view %168, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %170 = torch.aten.add.Tensor %169, %154, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %171 = torch.aten.sum.dim_IntList %170, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %172 = torch.aten.div.Scalar %171, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %173 = torch.aten.sub.Tensor %170, %172, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %174 = torch.aten.pow.Tensor_Scalar %173, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %175 = torch.aten.sum.dim_IntList %174, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.div.Scalar %175, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.sub.Tensor %170, %172, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %178 = torch.aten.add.Scalar %176, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %179 = torch.aten.sqrt %178 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %180 = torch.aten.div.Tensor %177, %179 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %181 = torch.aten.mul.Tensor %31, %180 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %182 = torch.aten.add.Tensor %181, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %183 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %184 = torch.aten.view %182, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %185 = torch.aten.mm %184, %183 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %186 = torch.aten.view %185, %75 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %187 = torch.aten.view %186, %77 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %188 = torch.aten.permute %187, %79 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %189 = torch.aten.slice.Tensor %188, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %190 = torch.aten.slice.Tensor %188, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %191 = torch.aten.slice.Tensor %188, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %192 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %193 = torch.aten.unsqueeze %192, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %194 = torch.aten.slice.Tensor %193, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %195 = torch.aten.view %194, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %196 = torch.aten.permute %195, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %197 = torch.aten.add.Tensor %189, %196, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %198 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %199 = torch.aten.unsqueeze %198, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %200 = torch.aten.slice.Tensor %199, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %201 = torch.aten.view %200, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %202 = torch.aten.permute %201, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %203 = torch.aten.add.Tensor %191, %202, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %204 = torch.aten.div.Scalar %197, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %205 = torch.aten.transpose.int %190, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %206 = torch.aten.broadcast_to %204, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %207 = torch.aten.view %206, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %208 = torch.aten.broadcast_to %205, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %209 = torch.aten.view %208, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %210 = torch.aten.bmm %207, %209 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %211 = torch.aten.view %210, %108 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %212 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %213 = torch.aten.to.dtype %212, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %214 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %215 = torch.aten.broadcast_to %213, %214 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %216 = torch.aten.copy %215, %70, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %217 = torch.aten.bitwise_not %216 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %218 = torch.aten.clone %28, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %219 = torch.aten.masked_fill.Tensor %211, %217, %218 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %219, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %220 = torch.aten.sub.Tensor %219, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %221 = torch.aten.exp %220 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %222 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %223 = torch.aten.sum.dim_IntList %221, %222, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %224 = torch.aten.div.Tensor %221, %223 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %225 = torch.aten.masked_fill.Scalar %224, %217, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %226 = torch.aten.broadcast_to %225, %108 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %227 = torch.aten.view %226, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %228 = torch.aten.broadcast_to %203, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %229 = torch.aten.view %228, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %230 = torch.aten.bmm %227, %229 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %231 = torch.aten.view %230, %99 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %232 = torch.aten.permute %231, %79 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %233 = torch.aten.clone %232, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %234 = torch.aten.view %233, %133 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %235 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %236 = torch.aten.view %234, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %237 = torch.aten.mm %236, %235 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %238 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %239 = torch.aten.add.Tensor %238, %237, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %240 = torch.aten.view %239, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %241 = torch.aten.add.Tensor %240, %182, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %242 = torch.aten.sum.dim_IntList %241, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %243 = torch.aten.div.Scalar %242, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %244 = torch.aten.sub.Tensor %241, %243, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %245 = torch.aten.pow.Tensor_Scalar %244, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %246 = torch.aten.sum.dim_IntList %245, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.div.Scalar %246, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.sub.Tensor %241, %243, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %249 = torch.aten.add.Scalar %247, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %250 = torch.aten.sqrt %249 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %251 = torch.aten.div.Tensor %248, %250 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %252 = torch.aten.mul.Tensor %31, %251 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %253 = torch.aten.add.Tensor %252, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %254 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %255 = torch.aten.view %253, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %256 = torch.aten.mm %255, %254 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %257 = torch.aten.mul.Scalar %25, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %258 = torch.aten.add.Tensor %257, %256, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %259 = torch.aten.view %258, %160 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %260 = torch.aten.gelu %259, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %261 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %262 = torch.aten.view %260, %164 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %263 = torch.aten.mm %262, %261 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %264 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %265 = torch.aten.add.Tensor %264, %263, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %266 = torch.aten.view %265, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %267 = torch.aten.add.Tensor %266, %253, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %268 = torch.aten.sum.dim_IntList %267, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %269 = torch.aten.div.Scalar %268, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %270 = torch.aten.sub.Tensor %267, %269, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %271 = torch.aten.pow.Tensor_Scalar %270, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %272 = torch.aten.sum.dim_IntList %271, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.div.Scalar %272, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.sub.Tensor %267, %269, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %275 = torch.aten.add.Scalar %273, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %276 = torch.aten.sqrt %275 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %277 = torch.aten.div.Tensor %274, %276 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %278 = torch.aten.mul.Tensor %31, %277 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %279 = torch.aten.add.Tensor %278, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %280 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %281 = torch.aten.view %279, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %282 = torch.aten.mm %281, %280 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %283 = torch.aten.view %282, %75 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %284 = torch.aten.view %283, %77 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %285 = torch.aten.permute %284, %79 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %286 = torch.aten.slice.Tensor %285, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %287 = torch.aten.slice.Tensor %285, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %288 = torch.aten.slice.Tensor %285, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %289 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %290 = torch.aten.unsqueeze %289, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %291 = torch.aten.slice.Tensor %290, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %292 = torch.aten.view %291, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %293 = torch.aten.permute %292, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %294 = torch.aten.add.Tensor %286, %293, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %295 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %296 = torch.aten.unsqueeze %295, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %297 = torch.aten.slice.Tensor %296, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %298 = torch.aten.view %297, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %299 = torch.aten.permute %298, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %300 = torch.aten.add.Tensor %288, %299, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %301 = torch.aten.div.Scalar %294, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %302 = torch.aten.transpose.int %287, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %303 = torch.aten.broadcast_to %301, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %304 = torch.aten.view %303, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %305 = torch.aten.broadcast_to %302, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %306 = torch.aten.view %305, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %307 = torch.aten.bmm %304, %306 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %308 = torch.aten.view %307, %108 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %309 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %310 = torch.aten.to.dtype %309, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %311 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %312 = torch.aten.broadcast_to %310, %311 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %313 = torch.aten.copy %312, %70, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %314 = torch.aten.bitwise_not %313 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %315 = torch.aten.clone %28, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %316 = torch.aten.masked_fill.Tensor %308, %314, %315 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %316, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %317 = torch.aten.sub.Tensor %316, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %318 = torch.aten.exp %317 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %319 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %320 = torch.aten.sum.dim_IntList %318, %319, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %321 = torch.aten.div.Tensor %318, %320 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %322 = torch.aten.masked_fill.Scalar %321, %314, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %323 = torch.aten.broadcast_to %322, %108 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %324 = torch.aten.view %323, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %325 = torch.aten.broadcast_to %300, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %326 = torch.aten.view %325, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %327 = torch.aten.bmm %324, %326 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %328 = torch.aten.view %327, %99 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %329 = torch.aten.permute %328, %79 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %330 = torch.aten.clone %329, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %331 = torch.aten.view %330, %133 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %332 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %333 = torch.aten.view %331, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %334 = torch.aten.mm %333, %332 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %335 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %336 = torch.aten.add.Tensor %335, %334, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %337 = torch.aten.view %336, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %338 = torch.aten.add.Tensor %337, %279, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %339 = torch.aten.sum.dim_IntList %338, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %340 = torch.aten.div.Scalar %339, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %341 = torch.aten.sub.Tensor %338, %340, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %342 = torch.aten.pow.Tensor_Scalar %341, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %343 = torch.aten.sum.dim_IntList %342, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.div.Scalar %343, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.sub.Tensor %338, %340, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %346 = torch.aten.add.Scalar %344, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %347 = torch.aten.sqrt %346 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %348 = torch.aten.div.Tensor %345, %347 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %349 = torch.aten.mul.Tensor %31, %348 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %350 = torch.aten.add.Tensor %349, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %351 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %352 = torch.aten.view %350, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %353 = torch.aten.mm %352, %351 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %354 = torch.aten.mul.Scalar %25, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %355 = torch.aten.add.Tensor %354, %353, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %356 = torch.aten.view %355, %160 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %357 = torch.aten.gelu %356, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %358 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %359 = torch.aten.view %357, %164 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %360 = torch.aten.mm %359, %358 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %361 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %362 = torch.aten.add.Tensor %361, %360, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %363 = torch.aten.view %362, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %364 = torch.aten.add.Tensor %363, %350, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %365 = torch.aten.sum.dim_IntList %364, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %366 = torch.aten.div.Scalar %365, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %367 = torch.aten.sub.Tensor %364, %366, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %368 = torch.aten.pow.Tensor_Scalar %367, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %369 = torch.aten.sum.dim_IntList %368, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.div.Scalar %369, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.sub.Tensor %364, %366, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %372 = torch.aten.add.Scalar %370, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %373 = torch.aten.sqrt %372 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %374 = torch.aten.div.Tensor %371, %373 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %375 = torch.aten.mul.Tensor %31, %374 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %376 = torch.aten.add.Tensor %375, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %377 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %378 = torch.aten.view %376, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %379 = torch.aten.mm %378, %377 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %380 = torch.aten.view %379, %75 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %381 = torch.aten.view %380, %77 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %382 = torch.aten.permute %381, %79 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %383 = torch.aten.slice.Tensor %382, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %384 = torch.aten.slice.Tensor %382, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %385 = torch.aten.slice.Tensor %382, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %386 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %387 = torch.aten.unsqueeze %386, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %388 = torch.aten.slice.Tensor %387, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %389 = torch.aten.view %388, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %390 = torch.aten.permute %389, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %391 = torch.aten.add.Tensor %383, %390, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %392 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %393 = torch.aten.unsqueeze %392, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %394 = torch.aten.slice.Tensor %393, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %395 = torch.aten.view %394, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %396 = torch.aten.permute %395, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %397 = torch.aten.add.Tensor %385, %396, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %398 = torch.aten.div.Scalar %391, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %399 = torch.aten.transpose.int %384, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %400 = torch.aten.broadcast_to %398, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %401 = torch.aten.view %400, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %402 = torch.aten.broadcast_to %399, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %403 = torch.aten.view %402, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %404 = torch.aten.bmm %401, %403 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %405 = torch.aten.view %404, %108 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %406 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %407 = torch.aten.to.dtype %406, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %408 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %409 = torch.aten.broadcast_to %407, %408 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %410 = torch.aten.copy %409, %70, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %411 = torch.aten.bitwise_not %410 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %412 = torch.aten.clone %28, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %413 = torch.aten.masked_fill.Tensor %405, %411, %412 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %413, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %414 = torch.aten.sub.Tensor %413, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %415 = torch.aten.exp %414 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %416 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %417 = torch.aten.sum.dim_IntList %415, %416, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %418 = torch.aten.div.Tensor %415, %417 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %419 = torch.aten.masked_fill.Scalar %418, %411, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %420 = torch.aten.broadcast_to %419, %108 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %421 = torch.aten.view %420, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %422 = torch.aten.broadcast_to %397, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %423 = torch.aten.view %422, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %424 = torch.aten.bmm %421, %423 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %425 = torch.aten.view %424, %99 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %426 = torch.aten.permute %425, %79 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %427 = torch.aten.clone %426, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %428 = torch.aten.view %427, %133 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %429 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %430 = torch.aten.view %428, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %431 = torch.aten.mm %430, %429 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %432 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %433 = torch.aten.add.Tensor %432, %431, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %434 = torch.aten.view %433, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %435 = torch.aten.add.Tensor %434, %376, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %436 = torch.aten.sum.dim_IntList %435, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %437 = torch.aten.div.Scalar %436, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %438 = torch.aten.sub.Tensor %435, %437, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %439 = torch.aten.pow.Tensor_Scalar %438, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %440 = torch.aten.sum.dim_IntList %439, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.div.Scalar %440, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.sub.Tensor %435, %437, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %443 = torch.aten.add.Scalar %441, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %444 = torch.aten.sqrt %443 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %445 = torch.aten.div.Tensor %442, %444 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %446 = torch.aten.mul.Tensor %31, %445 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %447 = torch.aten.add.Tensor %446, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %448 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %449 = torch.aten.view %447, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %450 = torch.aten.mm %449, %448 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %451 = torch.aten.mul.Scalar %25, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %452 = torch.aten.add.Tensor %451, %450, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %453 = torch.aten.view %452, %160 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %454 = torch.aten.gelu %453, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %455 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %456 = torch.aten.view %454, %164 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %457 = torch.aten.mm %456, %455 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %458 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %459 = torch.aten.add.Tensor %458, %457, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %460 = torch.aten.view %459, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %461 = torch.aten.add.Tensor %460, %447, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %462 = torch.aten.sum.dim_IntList %461, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %463 = torch.aten.div.Scalar %462, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %464 = torch.aten.sub.Tensor %461, %463, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %465 = torch.aten.pow.Tensor_Scalar %464, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %466 = torch.aten.sum.dim_IntList %465, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.div.Scalar %466, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.sub.Tensor %461, %463, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %469 = torch.aten.add.Scalar %467, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %470 = torch.aten.sqrt %469 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %471 = torch.aten.div.Tensor %468, %470 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %472 = torch.aten.mul.Tensor %31, %471 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %473 = torch.aten.add.Tensor %472, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %474 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %475 = torch.aten.view %473, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %476 = torch.aten.mm %475, %474 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %477 = torch.aten.view %476, %75 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %478 = torch.aten.view %477, %77 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %479 = torch.aten.permute %478, %79 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %480 = torch.aten.slice.Tensor %479, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %481 = torch.aten.slice.Tensor %479, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %482 = torch.aten.slice.Tensor %479, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %483 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %484 = torch.aten.unsqueeze %483, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %485 = torch.aten.slice.Tensor %484, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %486 = torch.aten.view %485, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %487 = torch.aten.permute %486, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %488 = torch.aten.add.Tensor %480, %487, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %489 = torch.aten.unsqueeze %30, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %490 = torch.aten.unsqueeze %489, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %491 = torch.aten.slice.Tensor %490, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %492 = torch.aten.view %491, %87 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %493 = torch.aten.permute %492, %79 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %494 = torch.aten.add.Tensor %482, %493, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %495 = torch.aten.div.Scalar %488, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %496 = torch.aten.transpose.int %481, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %497 = torch.aten.broadcast_to %495, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %498 = torch.aten.view %497, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %499 = torch.aten.broadcast_to %496, %103 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %500 = torch.aten.view %499, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %501 = torch.aten.bmm %498, %500 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %502 = torch.aten.view %501, %108 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %503 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %504 = torch.aten.to.dtype %503, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %505 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %506 = torch.aten.broadcast_to %504, %505 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %507 = torch.aten.copy %506, %70, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %508 = torch.aten.bitwise_not %507 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %509 = torch.aten.clone %28, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %510 = torch.aten.masked_fill.Tensor %502, %508, %509 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %510, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %511 = torch.aten.sub.Tensor %510, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %512 = torch.aten.exp %511 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %513 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %514 = torch.aten.sum.dim_IntList %512, %513, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %515 = torch.aten.div.Tensor %512, %514 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %516 = torch.aten.masked_fill.Scalar %515, %508, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %517 = torch.aten.broadcast_to %516, %108 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %518 = torch.aten.view %517, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %519 = torch.aten.broadcast_to %494, %99 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %520 = torch.aten.view %519, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %521 = torch.aten.bmm %518, %520 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %522 = torch.aten.view %521, %99 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %523 = torch.aten.permute %522, %79 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %524 = torch.aten.clone %523, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %525 = torch.aten.view %524, %133 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %526 = torch.aten.transpose.int %10, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %527 = torch.aten.view %525, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %528 = torch.aten.mm %527, %526 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %529 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %530 = torch.aten.add.Tensor %529, %528, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %531 = torch.aten.view %530, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %532 = torch.aten.add.Tensor %531, %473, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %533 = torch.aten.sum.dim_IntList %532, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %534 = torch.aten.div.Scalar %533, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %535 = torch.aten.sub.Tensor %532, %534, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %536 = torch.aten.pow.Tensor_Scalar %535, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %537 = torch.aten.sum.dim_IntList %536, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.div.Scalar %537, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.sub.Tensor %532, %534, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %540 = torch.aten.add.Scalar %538, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %541 = torch.aten.sqrt %540 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %542 = torch.aten.div.Tensor %539, %541 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %543 = torch.aten.mul.Tensor %31, %542 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %544 = torch.aten.add.Tensor %543, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %545 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %546 = torch.aten.view %544, %72 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %547 = torch.aten.mm %546, %545 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %548 = torch.aten.mul.Scalar %25, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %549 = torch.aten.add.Tensor %548, %547, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %550 = torch.aten.view %549, %160 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %551 = torch.aten.gelu %550, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %552 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %553 = torch.aten.view %551, %164 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %554 = torch.aten.mm %553, %552 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %555 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %556 = torch.aten.add.Tensor %555, %554, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %557 = torch.aten.view %556, %140 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %558 = torch.aten.add.Tensor %557, %544, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %559 = torch.aten.sum.dim_IntList %558, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %560 = torch.aten.div.Scalar %559, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %561 = torch.aten.sub.Tensor %558, %560, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %562 = torch.aten.pow.Tensor_Scalar %561, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %563 = torch.aten.sum.dim_IntList %562, %46, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.div.Scalar %563, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.sub.Tensor %558, %560, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %566 = torch.aten.add.Scalar %564, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %567 = torch.aten.sqrt %566 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %568 = torch.aten.div.Tensor %565, %567 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.mul.Tensor %31, %568 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %570 = torch.aten.add.Tensor %569, %30, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %571 = torch.aten.slice.Tensor %570, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %572 = torch.aten.slice.Tensor %571, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %573 = torch.aten.squeeze.dim %572, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %574 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %575 = torch.aten.mm %573, %574 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %576 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %577 = torch.aten.add.Tensor %576, %575, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %578 = torch.aten.gelu %577, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %579 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %580 = torch.aten.mm %578, %579 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %581 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %582 = torch.aten.add.Tensor %581, %580, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %582 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ef9ad0) {
  %14 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> !torch.vtensor<[32,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8ee0290)
    ** Replace : 'torch.vtensor.literal'(0x8ef9ad0)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8ee0290) {
      %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %26 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %29 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %31 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %32 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %35 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %36 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %37 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %38 = torch.aten.ones %37, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %39 = torch.aten.zeros %37, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %40 = torch.aten.slice.Tensor %36, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %41 = torch.aten.slice.Tensor %40, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %42 = torch.aten.embedding %35, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %43 = torch.aten.embedding %34, %41, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %44 = torch.aten.add.Tensor %42, %43, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %45 = torch.aten.embedding %33, %39, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.aten.add.Tensor %44, %45, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %48 = torch.aten.sum.dim_IntList %46, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %49 = torch.aten.div.Scalar %48, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %50 = torch.aten.sub.Tensor %46, %49, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.aten.pow.Tensor_Scalar %50, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %52 = torch.aten.sum.dim_IntList %51, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.div.Scalar %52, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.sub.Tensor %46, %49, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.add.Scalar %53, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.sqrt %55 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %57 = torch.aten.div.Tensor %54, %56 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.mul.Tensor %32, %57 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %59 = torch.aten.add.Tensor %58, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %60 = torch.aten.unsqueeze %38, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %61 = torch.aten.mul.Tensor %59, %60 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %62 = torch.aten.unsqueeze %38, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %63 = torch.aten.unsqueeze %62, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %64 = torch.aten.squeeze.dim %63, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %65 = torch.aten.unsqueeze %64, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %66 = torch.aten.mul.Tensor %63, %65 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %67 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %68 = torch.aten.to.dtype %67, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %69 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %70 = torch.aten.broadcast_to %68, %69 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %71 = torch.aten.copy %70, %66, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %72 = torch.aten.transpose.int %30, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %73 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %74 = torch.aten.view %61, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %75 = torch.aten.mm %74, %72 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %76 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %77 = torch.aten.view %75, %76 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %78 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %79 = torch.aten.view %77, %78 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %80 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %81 = torch.aten.permute %79, %80 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %82 = torch.aten.slice.Tensor %81, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %83 = torch.aten.slice.Tensor %81, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %84 = torch.aten.slice.Tensor %81, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %85 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %86 = torch.aten.unsqueeze %85, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %87 = torch.aten.slice.Tensor %86, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %88 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %89 = torch.aten.view %87, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %90 = torch.aten.permute %89, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %91 = torch.aten.add.Tensor %82, %90, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %92 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %93 = torch.aten.unsqueeze %92, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %94 = torch.aten.slice.Tensor %93, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %95 = torch.aten.view %94, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %96 = torch.aten.permute %95, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %97 = torch.aten.add.Tensor %84, %96, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %98 = torch.aten.div.Scalar %91, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %99 = torch.aten.transpose.int %83, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %100 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %101 = torch.aten.broadcast_to %98, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %102 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %103 = torch.aten.view %101, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %104 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %105 = torch.aten.broadcast_to %99, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %106 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %107 = torch.aten.view %105, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %108 = torch.aten.bmm %103, %107 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %109 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %110 = torch.aten.view %108, %109 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %111 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %112 = torch.aten.to.dtype %111, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %113 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %114 = torch.aten.broadcast_to %112, %113 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %115 = torch.aten.copy %114, %71, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %116 = torch.aten.bitwise_not %115 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %117 = torch.aten.clone %29, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %118 = torch.aten.masked_fill.Tensor %110, %116, %117 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %118, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %119 = torch.aten.sub.Tensor %118, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %120 = torch.aten.exp %119 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %121 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %122 = torch.aten.sum.dim_IntList %120, %121, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %123 = torch.aten.div.Tensor %120, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %124 = torch.aten.masked_fill.Scalar %123, %116, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %125 = torch.aten.broadcast_to %124, %109 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %126 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %127 = torch.aten.view %125, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %128 = torch.aten.broadcast_to %97, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %129 = torch.aten.view %128, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %130 = torch.aten.bmm %127, %129 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %131 = torch.aten.view %130, %100 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %132 = torch.aten.permute %131, %80 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %133 = torch.aten.clone %132, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %134 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %135 = torch.aten.view %133, %134 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %136 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %137 = torch.aten.view %135, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %138 = torch.aten.mm %137, %136 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %139 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %140 = torch.aten.add.Tensor %139, %138, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %141 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %142 = torch.aten.view %140, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %143 = torch.aten.add.Tensor %142, %61, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %144 = torch.aten.sum.dim_IntList %143, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %145 = torch.aten.div.Scalar %144, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %146 = torch.aten.sub.Tensor %143, %145, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %147 = torch.aten.pow.Tensor_Scalar %146, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %148 = torch.aten.sum.dim_IntList %147, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.div.Scalar %148, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.sub.Tensor %143, %145, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.add.Scalar %149, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %152 = torch.aten.sqrt %151 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %153 = torch.aten.div.Tensor %150, %152 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %154 = torch.aten.mul.Tensor %32, %153 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %155 = torch.aten.add.Tensor %154, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %156 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %157 = torch.aten.view %155, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %158 = torch.aten.mm %157, %156 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %159 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %160 = torch.aten.add.Tensor %159, %158, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %161 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %162 = torch.aten.view %160, %161 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %163 = torch.aten.gelu %162, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %164 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %165 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %166 = torch.aten.view %163, %165 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %167 = torch.aten.mm %166, %164 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %168 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %169 = torch.aten.add.Tensor %168, %167, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %170 = torch.aten.view %169, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %171 = torch.aten.add.Tensor %170, %155, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %172 = torch.aten.sum.dim_IntList %171, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %173 = torch.aten.div.Scalar %172, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %174 = torch.aten.sub.Tensor %171, %173, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %175 = torch.aten.pow.Tensor_Scalar %174, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %176 = torch.aten.sum.dim_IntList %175, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.div.Scalar %176, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.sub.Tensor %171, %173, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.add.Scalar %177, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %180 = torch.aten.sqrt %179 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %181 = torch.aten.div.Tensor %178, %180 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %182 = torch.aten.mul.Tensor %32, %181 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %183 = torch.aten.add.Tensor %182, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %184 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %185 = torch.aten.view %183, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %186 = torch.aten.mm %185, %184 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %187 = torch.aten.view %186, %76 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %188 = torch.aten.view %187, %78 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %189 = torch.aten.permute %188, %80 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %190 = torch.aten.slice.Tensor %189, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %191 = torch.aten.slice.Tensor %189, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %192 = torch.aten.slice.Tensor %189, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %193 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %194 = torch.aten.unsqueeze %193, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %195 = torch.aten.slice.Tensor %194, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %196 = torch.aten.view %195, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %197 = torch.aten.permute %196, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %198 = torch.aten.add.Tensor %190, %197, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %199 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %200 = torch.aten.unsqueeze %199, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %201 = torch.aten.slice.Tensor %200, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %202 = torch.aten.view %201, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %203 = torch.aten.permute %202, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %204 = torch.aten.add.Tensor %192, %203, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %205 = torch.aten.div.Scalar %198, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %206 = torch.aten.transpose.int %191, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %207 = torch.aten.broadcast_to %205, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %208 = torch.aten.view %207, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %209 = torch.aten.broadcast_to %206, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %210 = torch.aten.view %209, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %211 = torch.aten.bmm %208, %210 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %212 = torch.aten.view %211, %109 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %213 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %214 = torch.aten.to.dtype %213, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %215 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %216 = torch.aten.broadcast_to %214, %215 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %217 = torch.aten.copy %216, %71, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %218 = torch.aten.bitwise_not %217 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %219 = torch.aten.clone %29, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %220 = torch.aten.masked_fill.Tensor %212, %218, %219 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %220, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %221 = torch.aten.sub.Tensor %220, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %222 = torch.aten.exp %221 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %223 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %224 = torch.aten.sum.dim_IntList %222, %223, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %225 = torch.aten.div.Tensor %222, %224 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %226 = torch.aten.masked_fill.Scalar %225, %218, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %227 = torch.aten.broadcast_to %226, %109 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %228 = torch.aten.view %227, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %229 = torch.aten.broadcast_to %204, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %230 = torch.aten.view %229, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %231 = torch.aten.bmm %228, %230 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %232 = torch.aten.view %231, %100 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %233 = torch.aten.permute %232, %80 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %234 = torch.aten.clone %233, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %235 = torch.aten.view %234, %134 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %236 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %237 = torch.aten.view %235, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %238 = torch.aten.mm %237, %236 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %239 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %240 = torch.aten.add.Tensor %239, %238, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %241 = torch.aten.view %240, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %242 = torch.aten.add.Tensor %241, %183, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %243 = torch.aten.sum.dim_IntList %242, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %244 = torch.aten.div.Scalar %243, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %245 = torch.aten.sub.Tensor %242, %244, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %246 = torch.aten.pow.Tensor_Scalar %245, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %247 = torch.aten.sum.dim_IntList %246, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.div.Scalar %247, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.sub.Tensor %242, %244, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.add.Scalar %248, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %251 = torch.aten.sqrt %250 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %252 = torch.aten.div.Tensor %249, %251 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %253 = torch.aten.mul.Tensor %32, %252 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %254 = torch.aten.add.Tensor %253, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %255 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %256 = torch.aten.view %254, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %257 = torch.aten.mm %256, %255 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %258 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %259 = torch.aten.add.Tensor %258, %257, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %260 = torch.aten.view %259, %161 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %261 = torch.aten.gelu %260, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %262 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %263 = torch.aten.view %261, %165 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %264 = torch.aten.mm %263, %262 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %265 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %266 = torch.aten.add.Tensor %265, %264, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %267 = torch.aten.view %266, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %268 = torch.aten.add.Tensor %267, %254, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %269 = torch.aten.sum.dim_IntList %268, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %270 = torch.aten.div.Scalar %269, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %271 = torch.aten.sub.Tensor %268, %270, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %272 = torch.aten.pow.Tensor_Scalar %271, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %273 = torch.aten.sum.dim_IntList %272, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.div.Scalar %273, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.sub.Tensor %268, %270, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.add.Scalar %274, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %277 = torch.aten.sqrt %276 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %278 = torch.aten.div.Tensor %275, %277 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %279 = torch.aten.mul.Tensor %32, %278 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %280 = torch.aten.add.Tensor %279, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %281 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %282 = torch.aten.view %280, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %283 = torch.aten.mm %282, %281 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %284 = torch.aten.view %283, %76 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %285 = torch.aten.view %284, %78 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %286 = torch.aten.permute %285, %80 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %287 = torch.aten.slice.Tensor %286, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %288 = torch.aten.slice.Tensor %286, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %289 = torch.aten.slice.Tensor %286, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %290 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %291 = torch.aten.unsqueeze %290, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %292 = torch.aten.slice.Tensor %291, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %293 = torch.aten.view %292, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %294 = torch.aten.permute %293, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %295 = torch.aten.add.Tensor %287, %294, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %296 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %297 = torch.aten.unsqueeze %296, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %298 = torch.aten.slice.Tensor %297, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %299 = torch.aten.view %298, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %300 = torch.aten.permute %299, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %301 = torch.aten.add.Tensor %289, %300, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %302 = torch.aten.div.Scalar %295, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %303 = torch.aten.transpose.int %288, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %304 = torch.aten.broadcast_to %302, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %305 = torch.aten.view %304, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %306 = torch.aten.broadcast_to %303, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %307 = torch.aten.view %306, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %308 = torch.aten.bmm %305, %307 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %309 = torch.aten.view %308, %109 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %310 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %311 = torch.aten.to.dtype %310, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %312 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %313 = torch.aten.broadcast_to %311, %312 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %314 = torch.aten.copy %313, %71, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %315 = torch.aten.bitwise_not %314 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %316 = torch.aten.clone %29, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %317 = torch.aten.masked_fill.Tensor %309, %315, %316 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %317, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %318 = torch.aten.sub.Tensor %317, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %319 = torch.aten.exp %318 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %320 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %321 = torch.aten.sum.dim_IntList %319, %320, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %322 = torch.aten.div.Tensor %319, %321 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %323 = torch.aten.masked_fill.Scalar %322, %315, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %324 = torch.aten.broadcast_to %323, %109 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %325 = torch.aten.view %324, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %326 = torch.aten.broadcast_to %301, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %327 = torch.aten.view %326, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %328 = torch.aten.bmm %325, %327 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %329 = torch.aten.view %328, %100 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %330 = torch.aten.permute %329, %80 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %331 = torch.aten.clone %330, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %332 = torch.aten.view %331, %134 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %333 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %334 = torch.aten.view %332, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %335 = torch.aten.mm %334, %333 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %336 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %337 = torch.aten.add.Tensor %336, %335, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %338 = torch.aten.view %337, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %339 = torch.aten.add.Tensor %338, %280, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %340 = torch.aten.sum.dim_IntList %339, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %341 = torch.aten.div.Scalar %340, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %342 = torch.aten.sub.Tensor %339, %341, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %343 = torch.aten.pow.Tensor_Scalar %342, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %344 = torch.aten.sum.dim_IntList %343, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.div.Scalar %344, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.sub.Tensor %339, %341, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.add.Scalar %345, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %348 = torch.aten.sqrt %347 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %349 = torch.aten.div.Tensor %346, %348 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %350 = torch.aten.mul.Tensor %32, %349 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %351 = torch.aten.add.Tensor %350, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %352 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %353 = torch.aten.view %351, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %354 = torch.aten.mm %353, %352 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %355 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %356 = torch.aten.add.Tensor %355, %354, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %357 = torch.aten.view %356, %161 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %358 = torch.aten.gelu %357, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %359 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %360 = torch.aten.view %358, %165 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %361 = torch.aten.mm %360, %359 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %362 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %363 = torch.aten.add.Tensor %362, %361, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %364 = torch.aten.view %363, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %365 = torch.aten.add.Tensor %364, %351, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %366 = torch.aten.sum.dim_IntList %365, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %367 = torch.aten.div.Scalar %366, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %368 = torch.aten.sub.Tensor %365, %367, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %369 = torch.aten.pow.Tensor_Scalar %368, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %370 = torch.aten.sum.dim_IntList %369, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.div.Scalar %370, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.sub.Tensor %365, %367, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.add.Scalar %371, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %374 = torch.aten.sqrt %373 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %375 = torch.aten.div.Tensor %372, %374 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %376 = torch.aten.mul.Tensor %32, %375 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %377 = torch.aten.add.Tensor %376, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %378 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %379 = torch.aten.view %377, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %380 = torch.aten.mm %379, %378 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %381 = torch.aten.view %380, %76 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %382 = torch.aten.view %381, %78 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %383 = torch.aten.permute %382, %80 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %384 = torch.aten.slice.Tensor %383, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %385 = torch.aten.slice.Tensor %383, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %386 = torch.aten.slice.Tensor %383, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %387 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %388 = torch.aten.unsqueeze %387, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %389 = torch.aten.slice.Tensor %388, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %390 = torch.aten.view %389, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %391 = torch.aten.permute %390, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %392 = torch.aten.add.Tensor %384, %391, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %393 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %394 = torch.aten.unsqueeze %393, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %395 = torch.aten.slice.Tensor %394, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %396 = torch.aten.view %395, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %397 = torch.aten.permute %396, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %398 = torch.aten.add.Tensor %386, %397, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %399 = torch.aten.div.Scalar %392, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %400 = torch.aten.transpose.int %385, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %401 = torch.aten.broadcast_to %399, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %402 = torch.aten.view %401, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %403 = torch.aten.broadcast_to %400, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %404 = torch.aten.view %403, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %405 = torch.aten.bmm %402, %404 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %406 = torch.aten.view %405, %109 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %407 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %408 = torch.aten.to.dtype %407, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %409 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %410 = torch.aten.broadcast_to %408, %409 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %411 = torch.aten.copy %410, %71, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %412 = torch.aten.bitwise_not %411 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %413 = torch.aten.clone %29, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %414 = torch.aten.masked_fill.Tensor %406, %412, %413 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %414, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %415 = torch.aten.sub.Tensor %414, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %416 = torch.aten.exp %415 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %417 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %418 = torch.aten.sum.dim_IntList %416, %417, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %419 = torch.aten.div.Tensor %416, %418 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %420 = torch.aten.masked_fill.Scalar %419, %412, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %421 = torch.aten.broadcast_to %420, %109 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %422 = torch.aten.view %421, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %423 = torch.aten.broadcast_to %398, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %424 = torch.aten.view %423, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %425 = torch.aten.bmm %422, %424 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %426 = torch.aten.view %425, %100 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %427 = torch.aten.permute %426, %80 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %428 = torch.aten.clone %427, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %429 = torch.aten.view %428, %134 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %430 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %431 = torch.aten.view %429, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %432 = torch.aten.mm %431, %430 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %433 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %434 = torch.aten.add.Tensor %433, %432, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %435 = torch.aten.view %434, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %436 = torch.aten.add.Tensor %435, %377, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %437 = torch.aten.sum.dim_IntList %436, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %438 = torch.aten.div.Scalar %437, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %439 = torch.aten.sub.Tensor %436, %438, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %440 = torch.aten.pow.Tensor_Scalar %439, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %441 = torch.aten.sum.dim_IntList %440, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.div.Scalar %441, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.sub.Tensor %436, %438, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.add.Scalar %442, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %445 = torch.aten.sqrt %444 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %446 = torch.aten.div.Tensor %443, %445 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %447 = torch.aten.mul.Tensor %32, %446 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %448 = torch.aten.add.Tensor %447, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %449 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %450 = torch.aten.view %448, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %451 = torch.aten.mm %450, %449 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %452 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %453 = torch.aten.add.Tensor %452, %451, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %454 = torch.aten.view %453, %161 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %455 = torch.aten.gelu %454, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %456 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %457 = torch.aten.view %455, %165 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %458 = torch.aten.mm %457, %456 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %459 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %460 = torch.aten.add.Tensor %459, %458, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %461 = torch.aten.view %460, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %462 = torch.aten.add.Tensor %461, %448, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %463 = torch.aten.sum.dim_IntList %462, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %464 = torch.aten.div.Scalar %463, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %465 = torch.aten.sub.Tensor %462, %464, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %466 = torch.aten.pow.Tensor_Scalar %465, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %467 = torch.aten.sum.dim_IntList %466, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.div.Scalar %467, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.sub.Tensor %462, %464, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.add.Scalar %468, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %471 = torch.aten.sqrt %470 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %472 = torch.aten.div.Tensor %469, %471 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %473 = torch.aten.mul.Tensor %32, %472 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %474 = torch.aten.add.Tensor %473, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %475 = torch.aten.transpose.int %12, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %476 = torch.aten.view %474, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %477 = torch.aten.mm %476, %475 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %478 = torch.aten.view %477, %76 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %479 = torch.aten.view %478, %78 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %480 = torch.aten.permute %479, %80 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %481 = torch.aten.slice.Tensor %480, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %482 = torch.aten.slice.Tensor %480, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %483 = torch.aten.slice.Tensor %480, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %484 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %485 = torch.aten.unsqueeze %484, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %486 = torch.aten.slice.Tensor %485, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %487 = torch.aten.view %486, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %488 = torch.aten.permute %487, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %489 = torch.aten.add.Tensor %481, %488, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %490 = torch.aten.unsqueeze %31, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %491 = torch.aten.unsqueeze %490, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %492 = torch.aten.slice.Tensor %491, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %493 = torch.aten.view %492, %88 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %494 = torch.aten.permute %493, %80 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %495 = torch.aten.add.Tensor %483, %494, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %496 = torch.aten.div.Scalar %489, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %497 = torch.aten.transpose.int %482, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %498 = torch.aten.broadcast_to %496, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %499 = torch.aten.view %498, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %500 = torch.aten.broadcast_to %497, %104 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %501 = torch.aten.view %500, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %502 = torch.aten.bmm %499, %501 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %503 = torch.aten.view %502, %109 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %504 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %505 = torch.aten.to.dtype %504, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %506 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %507 = torch.aten.broadcast_to %505, %506 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %508 = torch.aten.copy %507, %71, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %509 = torch.aten.bitwise_not %508 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %510 = torch.aten.clone %29, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %511 = torch.aten.masked_fill.Tensor %503, %509, %510 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %511, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %512 = torch.aten.sub.Tensor %511, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %513 = torch.aten.exp %512 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %514 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %515 = torch.aten.sum.dim_IntList %513, %514, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %516 = torch.aten.div.Tensor %513, %515 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %517 = torch.aten.masked_fill.Scalar %516, %509, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %518 = torch.aten.broadcast_to %517, %109 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %519 = torch.aten.view %518, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %520 = torch.aten.broadcast_to %495, %100 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %521 = torch.aten.view %520, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %522 = torch.aten.bmm %519, %521 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %523 = torch.aten.view %522, %100 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %524 = torch.aten.permute %523, %80 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %525 = torch.aten.clone %524, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %526 = torch.aten.view %525, %134 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %527 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %528 = torch.aten.view %526, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %529 = torch.aten.mm %528, %527 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %530 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %531 = torch.aten.add.Tensor %530, %529, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %532 = torch.aten.view %531, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %533 = torch.aten.add.Tensor %532, %474, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %534 = torch.aten.sum.dim_IntList %533, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %535 = torch.aten.div.Scalar %534, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %536 = torch.aten.sub.Tensor %533, %535, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %537 = torch.aten.pow.Tensor_Scalar %536, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %538 = torch.aten.sum.dim_IntList %537, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.div.Scalar %538, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.sub.Tensor %533, %535, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.add.Scalar %539, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %542 = torch.aten.sqrt %541 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %543 = torch.aten.div.Tensor %540, %542 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %544 = torch.aten.mul.Tensor %32, %543 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %545 = torch.aten.add.Tensor %544, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %546 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %547 = torch.aten.view %545, %73 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %548 = torch.aten.mm %547, %546 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %549 = torch.aten.mul.Scalar %26, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %550 = torch.aten.add.Tensor %549, %548, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %551 = torch.aten.view %550, %161 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %552 = torch.aten.gelu %551, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %553 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %554 = torch.aten.view %552, %165 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %555 = torch.aten.mm %554, %553 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %556 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %557 = torch.aten.add.Tensor %556, %555, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %558 = torch.aten.view %557, %141 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %559 = torch.aten.add.Tensor %558, %545, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %560 = torch.aten.sum.dim_IntList %559, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %561 = torch.aten.div.Scalar %560, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %562 = torch.aten.sub.Tensor %559, %561, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %563 = torch.aten.pow.Tensor_Scalar %562, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %564 = torch.aten.sum.dim_IntList %563, %47, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.div.Scalar %564, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.sub.Tensor %559, %561, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.add.Scalar %565, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %568 = torch.aten.sqrt %567 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %569 = torch.aten.div.Tensor %566, %568 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %570 = torch.aten.mul.Tensor %32, %569 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %571 = torch.aten.add.Tensor %570, %31, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %572 = torch.aten.slice.Tensor %571, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %573 = torch.aten.slice.Tensor %572, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %574 = torch.aten.squeeze.dim %573, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %575 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %576 = torch.aten.mm %574, %575 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %577 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %578 = torch.aten.add.Tensor %577, %576, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %579 = torch.aten.gelu %578, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %580 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %581 = torch.aten.mm %579, %580 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %582 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %583 = torch.aten.add.Tensor %582, %581, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %583 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8efda60) {
  %16 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> !torch.vtensor<[96,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f53050)
    ** Replace : 'torch.vtensor.literal'(0x8efda60)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f53050) {
      %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %27 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %30 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %32 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %33 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %35 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %36 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %37 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %38 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %39 = torch.aten.ones %38, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %40 = torch.aten.zeros %38, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %41 = torch.aten.slice.Tensor %37, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %42 = torch.aten.slice.Tensor %41, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %43 = torch.aten.embedding %36, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %44 = torch.aten.embedding %35, %42, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %45 = torch.aten.add.Tensor %43, %44, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.aten.embedding %34, %40, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.add.Tensor %45, %46, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %49 = torch.aten.sum.dim_IntList %47, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %50 = torch.aten.div.Scalar %49, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %51 = torch.aten.sub.Tensor %47, %50, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %52 = torch.aten.pow.Tensor_Scalar %51, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %53 = torch.aten.sum.dim_IntList %52, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.div.Scalar %53, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.sub.Tensor %47, %50, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.add.Scalar %54, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %57 = torch.aten.sqrt %56 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %58 = torch.aten.div.Tensor %55, %57 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %59 = torch.aten.mul.Tensor %33, %58 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %60 = torch.aten.add.Tensor %59, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %61 = torch.aten.unsqueeze %39, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %62 = torch.aten.mul.Tensor %60, %61 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %63 = torch.aten.unsqueeze %39, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %64 = torch.aten.unsqueeze %63, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %65 = torch.aten.squeeze.dim %64, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %66 = torch.aten.unsqueeze %65, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %67 = torch.aten.mul.Tensor %64, %66 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %68 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %69 = torch.aten.to.dtype %68, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %70 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %71 = torch.aten.broadcast_to %69, %70 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %72 = torch.aten.copy %71, %67, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %73 = torch.aten.transpose.int %31, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %74 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %75 = torch.aten.view %62, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %76 = torch.aten.mm %75, %73 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %77 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %78 = torch.aten.view %76, %77 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %79 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %80 = torch.aten.view %78, %79 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %81 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %82 = torch.aten.permute %80, %81 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %83 = torch.aten.slice.Tensor %82, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %84 = torch.aten.slice.Tensor %82, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %85 = torch.aten.slice.Tensor %82, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %86 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %87 = torch.aten.unsqueeze %86, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %88 = torch.aten.slice.Tensor %87, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %89 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %90 = torch.aten.view %88, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %91 = torch.aten.permute %90, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %92 = torch.aten.add.Tensor %83, %91, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %93 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %94 = torch.aten.unsqueeze %93, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %95 = torch.aten.slice.Tensor %94, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %96 = torch.aten.view %95, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %97 = torch.aten.permute %96, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %98 = torch.aten.add.Tensor %85, %97, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %99 = torch.aten.div.Scalar %92, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %100 = torch.aten.transpose.int %84, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %101 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %102 = torch.aten.broadcast_to %99, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %103 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %104 = torch.aten.view %102, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %105 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %106 = torch.aten.broadcast_to %100, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %107 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %108 = torch.aten.view %106, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %109 = torch.aten.bmm %104, %108 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %110 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %111 = torch.aten.view %109, %110 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %112 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %113 = torch.aten.to.dtype %112, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %114 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %115 = torch.aten.broadcast_to %113, %114 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %116 = torch.aten.copy %115, %72, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %117 = torch.aten.bitwise_not %116 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %118 = torch.aten.clone %30, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %119 = torch.aten.masked_fill.Tensor %111, %117, %118 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %119, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %120 = torch.aten.sub.Tensor %119, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %121 = torch.aten.exp %120 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %122 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %123 = torch.aten.sum.dim_IntList %121, %122, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %124 = torch.aten.div.Tensor %121, %123 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %125 = torch.aten.masked_fill.Scalar %124, %117, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %126 = torch.aten.broadcast_to %125, %110 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %127 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %128 = torch.aten.view %126, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %129 = torch.aten.broadcast_to %98, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %130 = torch.aten.view %129, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %131 = torch.aten.bmm %128, %130 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %132 = torch.aten.view %131, %101 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %133 = torch.aten.permute %132, %81 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %134 = torch.aten.clone %133, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %135 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %136 = torch.aten.view %134, %135 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %137 = torch.aten.transpose.int %29, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %138 = torch.aten.view %136, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %139 = torch.aten.mm %138, %137 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %140 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %141 = torch.aten.add.Tensor %140, %139, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %142 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %143 = torch.aten.view %141, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %144 = torch.aten.add.Tensor %143, %62, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %145 = torch.aten.sum.dim_IntList %144, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %146 = torch.aten.div.Scalar %145, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %147 = torch.aten.sub.Tensor %144, %146, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %148 = torch.aten.pow.Tensor_Scalar %147, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %149 = torch.aten.sum.dim_IntList %148, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.div.Scalar %149, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %151 = torch.aten.sub.Tensor %144, %146, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.add.Scalar %150, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %153 = torch.aten.sqrt %152 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %154 = torch.aten.div.Tensor %151, %153 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %155 = torch.aten.mul.Tensor %33, %154 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %156 = torch.aten.add.Tensor %155, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %157 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %158 = torch.aten.view %156, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %159 = torch.aten.mm %158, %157 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %160 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %161 = torch.aten.add.Tensor %160, %159, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %162 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %163 = torch.aten.view %161, %162 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %164 = torch.aten.gelu %163, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %165 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %166 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %167 = torch.aten.view %164, %166 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %168 = torch.aten.mm %167, %165 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %169 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %170 = torch.aten.add.Tensor %169, %168, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %171 = torch.aten.view %170, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %172 = torch.aten.add.Tensor %171, %156, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %173 = torch.aten.sum.dim_IntList %172, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %174 = torch.aten.div.Scalar %173, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %175 = torch.aten.sub.Tensor %172, %174, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %176 = torch.aten.pow.Tensor_Scalar %175, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %177 = torch.aten.sum.dim_IntList %176, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.div.Scalar %177, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %179 = torch.aten.sub.Tensor %172, %174, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.add.Scalar %178, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %181 = torch.aten.sqrt %180 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %182 = torch.aten.div.Tensor %179, %181 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %183 = torch.aten.mul.Tensor %33, %182 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %184 = torch.aten.add.Tensor %183, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %185 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %186 = torch.aten.view %184, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %187 = torch.aten.mm %186, %185 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %188 = torch.aten.view %187, %77 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %189 = torch.aten.view %188, %79 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %190 = torch.aten.permute %189, %81 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %191 = torch.aten.slice.Tensor %190, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %192 = torch.aten.slice.Tensor %190, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %193 = torch.aten.slice.Tensor %190, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %194 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %195 = torch.aten.unsqueeze %194, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %196 = torch.aten.slice.Tensor %195, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %197 = torch.aten.view %196, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %198 = torch.aten.permute %197, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %199 = torch.aten.add.Tensor %191, %198, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %200 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %201 = torch.aten.unsqueeze %200, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %202 = torch.aten.slice.Tensor %201, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %203 = torch.aten.view %202, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %204 = torch.aten.permute %203, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %205 = torch.aten.add.Tensor %193, %204, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %206 = torch.aten.div.Scalar %199, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %207 = torch.aten.transpose.int %192, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %208 = torch.aten.broadcast_to %206, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %209 = torch.aten.view %208, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %210 = torch.aten.broadcast_to %207, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %211 = torch.aten.view %210, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %212 = torch.aten.bmm %209, %211 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %213 = torch.aten.view %212, %110 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %214 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %215 = torch.aten.to.dtype %214, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %216 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %217 = torch.aten.broadcast_to %215, %216 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %218 = torch.aten.copy %217, %72, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %219 = torch.aten.bitwise_not %218 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %220 = torch.aten.clone %30, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %221 = torch.aten.masked_fill.Tensor %213, %219, %220 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %221, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %222 = torch.aten.sub.Tensor %221, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %223 = torch.aten.exp %222 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %224 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %225 = torch.aten.sum.dim_IntList %223, %224, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %226 = torch.aten.div.Tensor %223, %225 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %227 = torch.aten.masked_fill.Scalar %226, %219, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %228 = torch.aten.broadcast_to %227, %110 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %229 = torch.aten.view %228, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %230 = torch.aten.broadcast_to %205, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %231 = torch.aten.view %230, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %232 = torch.aten.bmm %229, %231 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %233 = torch.aten.view %232, %101 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %234 = torch.aten.permute %233, %81 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %235 = torch.aten.clone %234, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %236 = torch.aten.view %235, %135 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %237 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %238 = torch.aten.view %236, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %239 = torch.aten.mm %238, %237 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %240 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %241 = torch.aten.add.Tensor %240, %239, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %242 = torch.aten.view %241, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %243 = torch.aten.add.Tensor %242, %184, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %244 = torch.aten.sum.dim_IntList %243, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %245 = torch.aten.div.Scalar %244, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %246 = torch.aten.sub.Tensor %243, %245, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %247 = torch.aten.pow.Tensor_Scalar %246, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %248 = torch.aten.sum.dim_IntList %247, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.div.Scalar %248, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %250 = torch.aten.sub.Tensor %243, %245, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.add.Scalar %249, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %252 = torch.aten.sqrt %251 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %253 = torch.aten.div.Tensor %250, %252 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %254 = torch.aten.mul.Tensor %33, %253 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %255 = torch.aten.add.Tensor %254, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %256 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %257 = torch.aten.view %255, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %258 = torch.aten.mm %257, %256 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %259 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %260 = torch.aten.add.Tensor %259, %258, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %261 = torch.aten.view %260, %162 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %262 = torch.aten.gelu %261, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %263 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %264 = torch.aten.view %262, %166 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %265 = torch.aten.mm %264, %263 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %266 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %267 = torch.aten.add.Tensor %266, %265, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %268 = torch.aten.view %267, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %269 = torch.aten.add.Tensor %268, %255, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %270 = torch.aten.sum.dim_IntList %269, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %271 = torch.aten.div.Scalar %270, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %272 = torch.aten.sub.Tensor %269, %271, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %273 = torch.aten.pow.Tensor_Scalar %272, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %274 = torch.aten.sum.dim_IntList %273, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.div.Scalar %274, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %276 = torch.aten.sub.Tensor %269, %271, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.add.Scalar %275, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %278 = torch.aten.sqrt %277 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %279 = torch.aten.div.Tensor %276, %278 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %280 = torch.aten.mul.Tensor %33, %279 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %281 = torch.aten.add.Tensor %280, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %282 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %283 = torch.aten.view %281, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %284 = torch.aten.mm %283, %282 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %285 = torch.aten.view %284, %77 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %286 = torch.aten.view %285, %79 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %287 = torch.aten.permute %286, %81 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %288 = torch.aten.slice.Tensor %287, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %289 = torch.aten.slice.Tensor %287, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %290 = torch.aten.slice.Tensor %287, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %291 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %292 = torch.aten.unsqueeze %291, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %293 = torch.aten.slice.Tensor %292, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %294 = torch.aten.view %293, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %295 = torch.aten.permute %294, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %296 = torch.aten.add.Tensor %288, %295, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %297 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %298 = torch.aten.unsqueeze %297, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %299 = torch.aten.slice.Tensor %298, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %300 = torch.aten.view %299, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %301 = torch.aten.permute %300, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %302 = torch.aten.add.Tensor %290, %301, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %303 = torch.aten.div.Scalar %296, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %304 = torch.aten.transpose.int %289, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %305 = torch.aten.broadcast_to %303, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %306 = torch.aten.view %305, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %307 = torch.aten.broadcast_to %304, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %308 = torch.aten.view %307, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %309 = torch.aten.bmm %306, %308 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %310 = torch.aten.view %309, %110 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %311 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %312 = torch.aten.to.dtype %311, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %313 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %314 = torch.aten.broadcast_to %312, %313 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %315 = torch.aten.copy %314, %72, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %316 = torch.aten.bitwise_not %315 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %317 = torch.aten.clone %30, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %318 = torch.aten.masked_fill.Tensor %310, %316, %317 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %318, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %319 = torch.aten.sub.Tensor %318, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %320 = torch.aten.exp %319 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %321 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %322 = torch.aten.sum.dim_IntList %320, %321, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %323 = torch.aten.div.Tensor %320, %322 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %324 = torch.aten.masked_fill.Scalar %323, %316, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %325 = torch.aten.broadcast_to %324, %110 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %326 = torch.aten.view %325, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %327 = torch.aten.broadcast_to %302, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %328 = torch.aten.view %327, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %329 = torch.aten.bmm %326, %328 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %330 = torch.aten.view %329, %101 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %331 = torch.aten.permute %330, %81 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %332 = torch.aten.clone %331, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %333 = torch.aten.view %332, %135 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %334 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %335 = torch.aten.view %333, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %336 = torch.aten.mm %335, %334 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %337 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %338 = torch.aten.add.Tensor %337, %336, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %339 = torch.aten.view %338, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %340 = torch.aten.add.Tensor %339, %281, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %341 = torch.aten.sum.dim_IntList %340, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %342 = torch.aten.div.Scalar %341, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %343 = torch.aten.sub.Tensor %340, %342, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %344 = torch.aten.pow.Tensor_Scalar %343, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %345 = torch.aten.sum.dim_IntList %344, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.div.Scalar %345, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %347 = torch.aten.sub.Tensor %340, %342, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.add.Scalar %346, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %349 = torch.aten.sqrt %348 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %350 = torch.aten.div.Tensor %347, %349 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %351 = torch.aten.mul.Tensor %33, %350 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %352 = torch.aten.add.Tensor %351, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %353 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %354 = torch.aten.view %352, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %355 = torch.aten.mm %354, %353 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %356 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %357 = torch.aten.add.Tensor %356, %355, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %358 = torch.aten.view %357, %162 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %359 = torch.aten.gelu %358, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %360 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %361 = torch.aten.view %359, %166 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %362 = torch.aten.mm %361, %360 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %363 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %364 = torch.aten.add.Tensor %363, %362, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %365 = torch.aten.view %364, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %366 = torch.aten.add.Tensor %365, %352, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %367 = torch.aten.sum.dim_IntList %366, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %368 = torch.aten.div.Scalar %367, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %369 = torch.aten.sub.Tensor %366, %368, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %370 = torch.aten.pow.Tensor_Scalar %369, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %371 = torch.aten.sum.dim_IntList %370, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.div.Scalar %371, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %373 = torch.aten.sub.Tensor %366, %368, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.add.Scalar %372, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %375 = torch.aten.sqrt %374 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %376 = torch.aten.div.Tensor %373, %375 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %377 = torch.aten.mul.Tensor %33, %376 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %378 = torch.aten.add.Tensor %377, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %379 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %380 = torch.aten.view %378, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %381 = torch.aten.mm %380, %379 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %382 = torch.aten.view %381, %77 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %383 = torch.aten.view %382, %79 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %384 = torch.aten.permute %383, %81 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %385 = torch.aten.slice.Tensor %384, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %386 = torch.aten.slice.Tensor %384, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %387 = torch.aten.slice.Tensor %384, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %388 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %389 = torch.aten.unsqueeze %388, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %390 = torch.aten.slice.Tensor %389, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %391 = torch.aten.view %390, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %392 = torch.aten.permute %391, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %393 = torch.aten.add.Tensor %385, %392, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %394 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %395 = torch.aten.unsqueeze %394, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %396 = torch.aten.slice.Tensor %395, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %397 = torch.aten.view %396, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %398 = torch.aten.permute %397, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %399 = torch.aten.add.Tensor %387, %398, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %400 = torch.aten.div.Scalar %393, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %401 = torch.aten.transpose.int %386, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %402 = torch.aten.broadcast_to %400, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %403 = torch.aten.view %402, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %404 = torch.aten.broadcast_to %401, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %405 = torch.aten.view %404, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %406 = torch.aten.bmm %403, %405 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %407 = torch.aten.view %406, %110 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %408 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %409 = torch.aten.to.dtype %408, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %410 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %411 = torch.aten.broadcast_to %409, %410 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %412 = torch.aten.copy %411, %72, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %413 = torch.aten.bitwise_not %412 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %414 = torch.aten.clone %30, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %415 = torch.aten.masked_fill.Tensor %407, %413, %414 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %415, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %416 = torch.aten.sub.Tensor %415, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %417 = torch.aten.exp %416 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %418 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %419 = torch.aten.sum.dim_IntList %417, %418, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %420 = torch.aten.div.Tensor %417, %419 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %421 = torch.aten.masked_fill.Scalar %420, %413, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %422 = torch.aten.broadcast_to %421, %110 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %423 = torch.aten.view %422, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %424 = torch.aten.broadcast_to %399, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %425 = torch.aten.view %424, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %426 = torch.aten.bmm %423, %425 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %427 = torch.aten.view %426, %101 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %428 = torch.aten.permute %427, %81 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %429 = torch.aten.clone %428, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %430 = torch.aten.view %429, %135 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %431 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %432 = torch.aten.view %430, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %433 = torch.aten.mm %432, %431 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %434 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %435 = torch.aten.add.Tensor %434, %433, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %436 = torch.aten.view %435, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %437 = torch.aten.add.Tensor %436, %378, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %438 = torch.aten.sum.dim_IntList %437, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %439 = torch.aten.div.Scalar %438, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %440 = torch.aten.sub.Tensor %437, %439, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %441 = torch.aten.pow.Tensor_Scalar %440, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %442 = torch.aten.sum.dim_IntList %441, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.div.Scalar %442, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %444 = torch.aten.sub.Tensor %437, %439, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.add.Scalar %443, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %446 = torch.aten.sqrt %445 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %447 = torch.aten.div.Tensor %444, %446 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %448 = torch.aten.mul.Tensor %33, %447 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %449 = torch.aten.add.Tensor %448, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %450 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %451 = torch.aten.view %449, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %452 = torch.aten.mm %451, %450 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %453 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %454 = torch.aten.add.Tensor %453, %452, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %455 = torch.aten.view %454, %162 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %456 = torch.aten.gelu %455, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %457 = torch.aten.transpose.int %14, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %458 = torch.aten.view %456, %166 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %459 = torch.aten.mm %458, %457 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %460 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %461 = torch.aten.add.Tensor %460, %459, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %462 = torch.aten.view %461, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %463 = torch.aten.add.Tensor %462, %449, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %464 = torch.aten.sum.dim_IntList %463, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %465 = torch.aten.div.Scalar %464, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %466 = torch.aten.sub.Tensor %463, %465, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %467 = torch.aten.pow.Tensor_Scalar %466, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %468 = torch.aten.sum.dim_IntList %467, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.div.Scalar %468, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %470 = torch.aten.sub.Tensor %463, %465, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.add.Scalar %469, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %472 = torch.aten.sqrt %471 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %473 = torch.aten.div.Tensor %470, %472 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %474 = torch.aten.mul.Tensor %33, %473 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %475 = torch.aten.add.Tensor %474, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %476 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %477 = torch.aten.view %475, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %478 = torch.aten.mm %477, %476 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %479 = torch.aten.view %478, %77 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %480 = torch.aten.view %479, %79 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %481 = torch.aten.permute %480, %81 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %482 = torch.aten.slice.Tensor %481, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %483 = torch.aten.slice.Tensor %481, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %484 = torch.aten.slice.Tensor %481, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %485 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %486 = torch.aten.unsqueeze %485, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %487 = torch.aten.slice.Tensor %486, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %488 = torch.aten.view %487, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %489 = torch.aten.permute %488, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %490 = torch.aten.add.Tensor %482, %489, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %491 = torch.aten.unsqueeze %32, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %492 = torch.aten.unsqueeze %491, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %493 = torch.aten.slice.Tensor %492, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %494 = torch.aten.view %493, %89 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %495 = torch.aten.permute %494, %81 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %496 = torch.aten.add.Tensor %484, %495, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %497 = torch.aten.div.Scalar %490, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %498 = torch.aten.transpose.int %483, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %499 = torch.aten.broadcast_to %497, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %500 = torch.aten.view %499, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %501 = torch.aten.broadcast_to %498, %105 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %502 = torch.aten.view %501, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %503 = torch.aten.bmm %500, %502 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %504 = torch.aten.view %503, %110 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %505 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %506 = torch.aten.to.dtype %505, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %507 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %508 = torch.aten.broadcast_to %506, %507 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %509 = torch.aten.copy %508, %72, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %510 = torch.aten.bitwise_not %509 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %511 = torch.aten.clone %30, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %512 = torch.aten.masked_fill.Tensor %504, %510, %511 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %512, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %513 = torch.aten.sub.Tensor %512, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %514 = torch.aten.exp %513 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %515 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %516 = torch.aten.sum.dim_IntList %514, %515, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %517 = torch.aten.div.Tensor %514, %516 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %518 = torch.aten.masked_fill.Scalar %517, %510, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %519 = torch.aten.broadcast_to %518, %110 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %520 = torch.aten.view %519, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %521 = torch.aten.broadcast_to %496, %101 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %522 = torch.aten.view %521, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %523 = torch.aten.bmm %520, %522 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %524 = torch.aten.view %523, %101 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %525 = torch.aten.permute %524, %81 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %526 = torch.aten.clone %525, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %527 = torch.aten.view %526, %135 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %528 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %529 = torch.aten.view %527, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %530 = torch.aten.mm %529, %528 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %531 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %532 = torch.aten.add.Tensor %531, %530, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %533 = torch.aten.view %532, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %534 = torch.aten.add.Tensor %533, %475, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %535 = torch.aten.sum.dim_IntList %534, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %536 = torch.aten.div.Scalar %535, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %537 = torch.aten.sub.Tensor %534, %536, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %538 = torch.aten.pow.Tensor_Scalar %537, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %539 = torch.aten.sum.dim_IntList %538, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.div.Scalar %539, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %541 = torch.aten.sub.Tensor %534, %536, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.add.Scalar %540, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %543 = torch.aten.sqrt %542 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %544 = torch.aten.div.Tensor %541, %543 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %545 = torch.aten.mul.Tensor %33, %544 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %546 = torch.aten.add.Tensor %545, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %547 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %548 = torch.aten.view %546, %74 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %549 = torch.aten.mm %548, %547 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %550 = torch.aten.mul.Scalar %27, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %551 = torch.aten.add.Tensor %550, %549, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %552 = torch.aten.view %551, %162 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %553 = torch.aten.gelu %552, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %554 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %555 = torch.aten.view %553, %166 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %556 = torch.aten.mm %555, %554 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %557 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %558 = torch.aten.add.Tensor %557, %556, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %559 = torch.aten.view %558, %142 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %560 = torch.aten.add.Tensor %559, %546, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %561 = torch.aten.sum.dim_IntList %560, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %562 = torch.aten.div.Scalar %561, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %563 = torch.aten.sub.Tensor %560, %562, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %564 = torch.aten.pow.Tensor_Scalar %563, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %565 = torch.aten.sum.dim_IntList %564, %48, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.div.Scalar %565, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %567 = torch.aten.sub.Tensor %560, %562, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.add.Scalar %566, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %569 = torch.aten.sqrt %568 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %570 = torch.aten.div.Tensor %567, %569 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %571 = torch.aten.mul.Tensor %33, %570 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %572 = torch.aten.add.Tensor %571, %32, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %573 = torch.aten.slice.Tensor %572, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %574 = torch.aten.slice.Tensor %573, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %575 = torch.aten.squeeze.dim %574, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %576 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %577 = torch.aten.mm %575, %576 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %578 = torch.aten.mul.Scalar %32, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %579 = torch.aten.add.Tensor %578, %577, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %580 = torch.aten.gelu %579, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %581 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %582 = torch.aten.mm %580, %581 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %583 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %584 = torch.aten.add.Tensor %583, %582, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %584 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8efdb20) {
  %18 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> !torch.vtensor<[32,37],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f0cdb0)
    ** Replace : 'torch.vtensor.literal'(0x8efdb20)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f0cdb0) {
      %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %28 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %31 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %33 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %34 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %35 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %36 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %37 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %38 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %39 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %40 = torch.aten.ones %39, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %41 = torch.aten.zeros %39, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %42 = torch.aten.slice.Tensor %38, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %43 = torch.aten.slice.Tensor %42, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %44 = torch.aten.embedding %37, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %45 = torch.aten.embedding %36, %43, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.aten.add.Tensor %44, %45, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.embedding %35, %41, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.add.Tensor %46, %47, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %50 = torch.aten.sum.dim_IntList %48, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %51 = torch.aten.div.Scalar %50, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.sub.Tensor %48, %51, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %53 = torch.aten.pow.Tensor_Scalar %52, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %54 = torch.aten.sum.dim_IntList %53, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.div.Scalar %54, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.sub.Tensor %48, %51, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.add.Scalar %55, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %58 = torch.aten.sqrt %57 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %59 = torch.aten.div.Tensor %56, %58 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %60 = torch.aten.mul.Tensor %34, %59 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %61 = torch.aten.add.Tensor %60, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %62 = torch.aten.unsqueeze %40, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %63 = torch.aten.mul.Tensor %61, %62 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %64 = torch.aten.unsqueeze %40, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %65 = torch.aten.unsqueeze %64, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %66 = torch.aten.squeeze.dim %65, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %67 = torch.aten.unsqueeze %66, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %68 = torch.aten.mul.Tensor %65, %67 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %69 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %70 = torch.aten.to.dtype %69, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %71 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %72 = torch.aten.broadcast_to %70, %71 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %73 = torch.aten.copy %72, %68, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %74 = torch.aten.transpose.int %32, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %75 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %76 = torch.aten.view %63, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %77 = torch.aten.mm %76, %74 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %78 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %79 = torch.aten.view %77, %78 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %80 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %81 = torch.aten.view %79, %80 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %82 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %83 = torch.aten.permute %81, %82 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %84 = torch.aten.slice.Tensor %83, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %85 = torch.aten.slice.Tensor %83, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %86 = torch.aten.slice.Tensor %83, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %87 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %88 = torch.aten.unsqueeze %87, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %89 = torch.aten.slice.Tensor %88, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %90 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %91 = torch.aten.view %89, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %92 = torch.aten.permute %91, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %93 = torch.aten.add.Tensor %84, %92, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %94 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %95 = torch.aten.unsqueeze %94, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %96 = torch.aten.slice.Tensor %95, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %97 = torch.aten.view %96, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %98 = torch.aten.permute %97, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %99 = torch.aten.add.Tensor %86, %98, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %100 = torch.aten.div.Scalar %93, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %101 = torch.aten.transpose.int %85, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %102 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %103 = torch.aten.broadcast_to %100, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %104 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %105 = torch.aten.view %103, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %106 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %107 = torch.aten.broadcast_to %101, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %108 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %109 = torch.aten.view %107, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %110 = torch.aten.bmm %105, %109 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %111 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %112 = torch.aten.view %110, %111 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %113 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %114 = torch.aten.to.dtype %113, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %115 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %116 = torch.aten.broadcast_to %114, %115 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %117 = torch.aten.copy %116, %73, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %118 = torch.aten.bitwise_not %117 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %119 = torch.aten.clone %31, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %120 = torch.aten.masked_fill.Tensor %112, %118, %119 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %120, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %121 = torch.aten.sub.Tensor %120, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %122 = torch.aten.exp %121 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %123 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %124 = torch.aten.sum.dim_IntList %122, %123, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %125 = torch.aten.div.Tensor %122, %124 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %126 = torch.aten.masked_fill.Scalar %125, %118, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %127 = torch.aten.broadcast_to %126, %111 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %128 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %129 = torch.aten.view %127, %128 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %130 = torch.aten.broadcast_to %99, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %131 = torch.aten.view %130, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %132 = torch.aten.bmm %129, %131 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %133 = torch.aten.view %132, %102 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %134 = torch.aten.permute %133, %82 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %135 = torch.aten.clone %134, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %136 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %137 = torch.aten.view %135, %136 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %138 = torch.aten.transpose.int %30, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %139 = torch.aten.view %137, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %140 = torch.aten.mm %139, %138 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %141 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %142 = torch.aten.add.Tensor %141, %140, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %143 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %144 = torch.aten.view %142, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %145 = torch.aten.add.Tensor %144, %63, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %146 = torch.aten.sum.dim_IntList %145, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %147 = torch.aten.div.Scalar %146, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.sub.Tensor %145, %147, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %149 = torch.aten.pow.Tensor_Scalar %148, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %150 = torch.aten.sum.dim_IntList %149, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %151 = torch.aten.div.Scalar %150, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %152 = torch.aten.sub.Tensor %145, %147, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %153 = torch.aten.add.Scalar %151, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %154 = torch.aten.sqrt %153 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %155 = torch.aten.div.Tensor %152, %154 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %156 = torch.aten.mul.Tensor %34, %155 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %157 = torch.aten.add.Tensor %156, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %158 = torch.aten.transpose.int %29, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %159 = torch.aten.view %157, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %160 = torch.aten.mm %159, %158 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %161 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %162 = torch.aten.add.Tensor %161, %160, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %163 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %164 = torch.aten.view %162, %163 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %165 = torch.aten.gelu %164, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %166 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %167 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %168 = torch.aten.view %165, %167 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %169 = torch.aten.mm %168, %166 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %170 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %171 = torch.aten.add.Tensor %170, %169, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %172 = torch.aten.view %171, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %173 = torch.aten.add.Tensor %172, %157, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %174 = torch.aten.sum.dim_IntList %173, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %175 = torch.aten.div.Scalar %174, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.sub.Tensor %173, %175, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %177 = torch.aten.pow.Tensor_Scalar %176, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %178 = torch.aten.sum.dim_IntList %177, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %179 = torch.aten.div.Scalar %178, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %180 = torch.aten.sub.Tensor %173, %175, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %181 = torch.aten.add.Scalar %179, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %182 = torch.aten.sqrt %181 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %183 = torch.aten.div.Tensor %180, %182 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %184 = torch.aten.mul.Tensor %34, %183 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %185 = torch.aten.add.Tensor %184, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %186 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %187 = torch.aten.view %185, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %188 = torch.aten.mm %187, %186 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %189 = torch.aten.view %188, %78 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %190 = torch.aten.view %189, %80 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %191 = torch.aten.permute %190, %82 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %192 = torch.aten.slice.Tensor %191, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %193 = torch.aten.slice.Tensor %191, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %194 = torch.aten.slice.Tensor %191, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %195 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %196 = torch.aten.unsqueeze %195, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %197 = torch.aten.slice.Tensor %196, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %198 = torch.aten.view %197, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %199 = torch.aten.permute %198, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %200 = torch.aten.add.Tensor %192, %199, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %201 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %202 = torch.aten.unsqueeze %201, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %203 = torch.aten.slice.Tensor %202, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %204 = torch.aten.view %203, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %205 = torch.aten.permute %204, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %206 = torch.aten.add.Tensor %194, %205, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %207 = torch.aten.div.Scalar %200, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %208 = torch.aten.transpose.int %193, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %209 = torch.aten.broadcast_to %207, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %210 = torch.aten.view %209, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %211 = torch.aten.broadcast_to %208, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %212 = torch.aten.view %211, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %213 = torch.aten.bmm %210, %212 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %214 = torch.aten.view %213, %111 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %215 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %216 = torch.aten.to.dtype %215, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %217 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %218 = torch.aten.broadcast_to %216, %217 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %219 = torch.aten.copy %218, %73, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %220 = torch.aten.bitwise_not %219 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %221 = torch.aten.clone %31, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %222 = torch.aten.masked_fill.Tensor %214, %220, %221 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %222, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %223 = torch.aten.sub.Tensor %222, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %224 = torch.aten.exp %223 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %225 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %226 = torch.aten.sum.dim_IntList %224, %225, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %227 = torch.aten.div.Tensor %224, %226 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %228 = torch.aten.masked_fill.Scalar %227, %220, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %229 = torch.aten.broadcast_to %228, %111 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %230 = torch.aten.view %229, %128 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %231 = torch.aten.broadcast_to %206, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %232 = torch.aten.view %231, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %233 = torch.aten.bmm %230, %232 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %234 = torch.aten.view %233, %102 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %235 = torch.aten.permute %234, %82 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %236 = torch.aten.clone %235, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %237 = torch.aten.view %236, %136 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %238 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %239 = torch.aten.view %237, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %240 = torch.aten.mm %239, %238 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %241 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %242 = torch.aten.add.Tensor %241, %240, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %243 = torch.aten.view %242, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %244 = torch.aten.add.Tensor %243, %185, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %245 = torch.aten.sum.dim_IntList %244, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %246 = torch.aten.div.Scalar %245, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.sub.Tensor %244, %246, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %248 = torch.aten.pow.Tensor_Scalar %247, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %249 = torch.aten.sum.dim_IntList %248, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %250 = torch.aten.div.Scalar %249, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %251 = torch.aten.sub.Tensor %244, %246, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %252 = torch.aten.add.Scalar %250, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %253 = torch.aten.sqrt %252 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %254 = torch.aten.div.Tensor %251, %253 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %255 = torch.aten.mul.Tensor %34, %254 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %256 = torch.aten.add.Tensor %255, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %257 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %258 = torch.aten.view %256, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %259 = torch.aten.mm %258, %257 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %260 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %261 = torch.aten.add.Tensor %260, %259, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %262 = torch.aten.view %261, %163 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %263 = torch.aten.gelu %262, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %264 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %265 = torch.aten.view %263, %167 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %266 = torch.aten.mm %265, %264 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %267 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %268 = torch.aten.add.Tensor %267, %266, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %269 = torch.aten.view %268, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %270 = torch.aten.add.Tensor %269, %256, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %271 = torch.aten.sum.dim_IntList %270, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %272 = torch.aten.div.Scalar %271, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.sub.Tensor %270, %272, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %274 = torch.aten.pow.Tensor_Scalar %273, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %275 = torch.aten.sum.dim_IntList %274, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %276 = torch.aten.div.Scalar %275, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %277 = torch.aten.sub.Tensor %270, %272, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %278 = torch.aten.add.Scalar %276, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %279 = torch.aten.sqrt %278 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %280 = torch.aten.div.Tensor %277, %279 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %281 = torch.aten.mul.Tensor %34, %280 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %282 = torch.aten.add.Tensor %281, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %283 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %284 = torch.aten.view %282, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %285 = torch.aten.mm %284, %283 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %286 = torch.aten.view %285, %78 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %287 = torch.aten.view %286, %80 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %288 = torch.aten.permute %287, %82 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %289 = torch.aten.slice.Tensor %288, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %290 = torch.aten.slice.Tensor %288, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %291 = torch.aten.slice.Tensor %288, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %292 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %293 = torch.aten.unsqueeze %292, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %294 = torch.aten.slice.Tensor %293, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %295 = torch.aten.view %294, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %296 = torch.aten.permute %295, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %297 = torch.aten.add.Tensor %289, %296, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %298 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %299 = torch.aten.unsqueeze %298, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %300 = torch.aten.slice.Tensor %299, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %301 = torch.aten.view %300, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %302 = torch.aten.permute %301, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %303 = torch.aten.add.Tensor %291, %302, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %304 = torch.aten.div.Scalar %297, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %305 = torch.aten.transpose.int %290, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %306 = torch.aten.broadcast_to %304, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %307 = torch.aten.view %306, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %308 = torch.aten.broadcast_to %305, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %309 = torch.aten.view %308, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %310 = torch.aten.bmm %307, %309 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %311 = torch.aten.view %310, %111 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %312 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %313 = torch.aten.to.dtype %312, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %314 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %315 = torch.aten.broadcast_to %313, %314 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %316 = torch.aten.copy %315, %73, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %317 = torch.aten.bitwise_not %316 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %318 = torch.aten.clone %31, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %319 = torch.aten.masked_fill.Tensor %311, %317, %318 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %319, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %320 = torch.aten.sub.Tensor %319, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %321 = torch.aten.exp %320 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %322 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %323 = torch.aten.sum.dim_IntList %321, %322, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %324 = torch.aten.div.Tensor %321, %323 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %325 = torch.aten.masked_fill.Scalar %324, %317, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %326 = torch.aten.broadcast_to %325, %111 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %327 = torch.aten.view %326, %128 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %328 = torch.aten.broadcast_to %303, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %329 = torch.aten.view %328, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %330 = torch.aten.bmm %327, %329 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %331 = torch.aten.view %330, %102 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %332 = torch.aten.permute %331, %82 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %333 = torch.aten.clone %332, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %334 = torch.aten.view %333, %136 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %335 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %336 = torch.aten.view %334, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %337 = torch.aten.mm %336, %335 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %338 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %339 = torch.aten.add.Tensor %338, %337, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %340 = torch.aten.view %339, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %341 = torch.aten.add.Tensor %340, %282, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %342 = torch.aten.sum.dim_IntList %341, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %343 = torch.aten.div.Scalar %342, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.sub.Tensor %341, %343, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %345 = torch.aten.pow.Tensor_Scalar %344, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %346 = torch.aten.sum.dim_IntList %345, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %347 = torch.aten.div.Scalar %346, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %348 = torch.aten.sub.Tensor %341, %343, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %349 = torch.aten.add.Scalar %347, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %350 = torch.aten.sqrt %349 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %351 = torch.aten.div.Tensor %348, %350 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %352 = torch.aten.mul.Tensor %34, %351 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %353 = torch.aten.add.Tensor %352, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %354 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %355 = torch.aten.view %353, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %356 = torch.aten.mm %355, %354 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %357 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %358 = torch.aten.add.Tensor %357, %356, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %359 = torch.aten.view %358, %163 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %360 = torch.aten.gelu %359, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %361 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %362 = torch.aten.view %360, %167 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %363 = torch.aten.mm %362, %361 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %364 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %365 = torch.aten.add.Tensor %364, %363, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %366 = torch.aten.view %365, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %367 = torch.aten.add.Tensor %366, %353, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %368 = torch.aten.sum.dim_IntList %367, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %369 = torch.aten.div.Scalar %368, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.sub.Tensor %367, %369, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %371 = torch.aten.pow.Tensor_Scalar %370, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %372 = torch.aten.sum.dim_IntList %371, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %373 = torch.aten.div.Scalar %372, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %374 = torch.aten.sub.Tensor %367, %369, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %375 = torch.aten.add.Scalar %373, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %376 = torch.aten.sqrt %375 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %377 = torch.aten.div.Tensor %374, %376 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %378 = torch.aten.mul.Tensor %34, %377 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %379 = torch.aten.add.Tensor %378, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %380 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %381 = torch.aten.view %379, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %382 = torch.aten.mm %381, %380 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %383 = torch.aten.view %382, %78 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %384 = torch.aten.view %383, %80 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %385 = torch.aten.permute %384, %82 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %386 = torch.aten.slice.Tensor %385, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %387 = torch.aten.slice.Tensor %385, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %388 = torch.aten.slice.Tensor %385, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %389 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %390 = torch.aten.unsqueeze %389, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %391 = torch.aten.slice.Tensor %390, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %392 = torch.aten.view %391, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %393 = torch.aten.permute %392, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %394 = torch.aten.add.Tensor %386, %393, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %395 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %396 = torch.aten.unsqueeze %395, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %397 = torch.aten.slice.Tensor %396, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %398 = torch.aten.view %397, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %399 = torch.aten.permute %398, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %400 = torch.aten.add.Tensor %388, %399, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %401 = torch.aten.div.Scalar %394, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %402 = torch.aten.transpose.int %387, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %403 = torch.aten.broadcast_to %401, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %404 = torch.aten.view %403, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %405 = torch.aten.broadcast_to %402, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %406 = torch.aten.view %405, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %407 = torch.aten.bmm %404, %406 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %408 = torch.aten.view %407, %111 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %409 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %410 = torch.aten.to.dtype %409, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %411 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %412 = torch.aten.broadcast_to %410, %411 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %413 = torch.aten.copy %412, %73, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %414 = torch.aten.bitwise_not %413 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %415 = torch.aten.clone %31, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %416 = torch.aten.masked_fill.Tensor %408, %414, %415 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %416, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %417 = torch.aten.sub.Tensor %416, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %418 = torch.aten.exp %417 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %419 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %420 = torch.aten.sum.dim_IntList %418, %419, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %421 = torch.aten.div.Tensor %418, %420 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %422 = torch.aten.masked_fill.Scalar %421, %414, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %423 = torch.aten.broadcast_to %422, %111 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %424 = torch.aten.view %423, %128 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %425 = torch.aten.broadcast_to %400, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %426 = torch.aten.view %425, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %427 = torch.aten.bmm %424, %426 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %428 = torch.aten.view %427, %102 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %429 = torch.aten.permute %428, %82 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %430 = torch.aten.clone %429, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %431 = torch.aten.view %430, %136 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %432 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %433 = torch.aten.view %431, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %434 = torch.aten.mm %433, %432 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %435 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %436 = torch.aten.add.Tensor %435, %434, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %437 = torch.aten.view %436, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %438 = torch.aten.add.Tensor %437, %379, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %439 = torch.aten.sum.dim_IntList %438, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %440 = torch.aten.div.Scalar %439, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.sub.Tensor %438, %440, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %442 = torch.aten.pow.Tensor_Scalar %441, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %443 = torch.aten.sum.dim_IntList %442, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %444 = torch.aten.div.Scalar %443, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %445 = torch.aten.sub.Tensor %438, %440, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %446 = torch.aten.add.Scalar %444, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %447 = torch.aten.sqrt %446 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %448 = torch.aten.div.Tensor %445, %447 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %449 = torch.aten.mul.Tensor %34, %448 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %450 = torch.aten.add.Tensor %449, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %451 = torch.aten.transpose.int %16, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %452 = torch.aten.view %450, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %453 = torch.aten.mm %452, %451 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %454 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %455 = torch.aten.add.Tensor %454, %453, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %456 = torch.aten.view %455, %163 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %457 = torch.aten.gelu %456, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %458 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %459 = torch.aten.view %457, %167 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %460 = torch.aten.mm %459, %458 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %461 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %462 = torch.aten.add.Tensor %461, %460, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %463 = torch.aten.view %462, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %464 = torch.aten.add.Tensor %463, %450, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %465 = torch.aten.sum.dim_IntList %464, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %466 = torch.aten.div.Scalar %465, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.sub.Tensor %464, %466, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %468 = torch.aten.pow.Tensor_Scalar %467, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %469 = torch.aten.sum.dim_IntList %468, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %470 = torch.aten.div.Scalar %469, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %471 = torch.aten.sub.Tensor %464, %466, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %472 = torch.aten.add.Scalar %470, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %473 = torch.aten.sqrt %472 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %474 = torch.aten.div.Tensor %471, %473 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %475 = torch.aten.mul.Tensor %34, %474 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %476 = torch.aten.add.Tensor %475, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %477 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %478 = torch.aten.view %476, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %479 = torch.aten.mm %478, %477 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %480 = torch.aten.view %479, %78 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %481 = torch.aten.view %480, %80 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %482 = torch.aten.permute %481, %82 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %483 = torch.aten.slice.Tensor %482, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %484 = torch.aten.slice.Tensor %482, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %485 = torch.aten.slice.Tensor %482, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %486 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %487 = torch.aten.unsqueeze %486, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %488 = torch.aten.slice.Tensor %487, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %489 = torch.aten.view %488, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %490 = torch.aten.permute %489, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %491 = torch.aten.add.Tensor %483, %490, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %492 = torch.aten.unsqueeze %33, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %493 = torch.aten.unsqueeze %492, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %494 = torch.aten.slice.Tensor %493, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %495 = torch.aten.view %494, %90 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %496 = torch.aten.permute %495, %82 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %497 = torch.aten.add.Tensor %485, %496, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %498 = torch.aten.div.Scalar %491, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %499 = torch.aten.transpose.int %484, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %500 = torch.aten.broadcast_to %498, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %501 = torch.aten.view %500, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %502 = torch.aten.broadcast_to %499, %106 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %503 = torch.aten.view %502, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %504 = torch.aten.bmm %501, %503 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %505 = torch.aten.view %504, %111 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %506 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %507 = torch.aten.to.dtype %506, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %508 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %509 = torch.aten.broadcast_to %507, %508 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %510 = torch.aten.copy %509, %73, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %511 = torch.aten.bitwise_not %510 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %512 = torch.aten.clone %31, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %513 = torch.aten.masked_fill.Tensor %505, %511, %512 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %513, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %514 = torch.aten.sub.Tensor %513, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %515 = torch.aten.exp %514 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %516 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %517 = torch.aten.sum.dim_IntList %515, %516, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %518 = torch.aten.div.Tensor %515, %517 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %519 = torch.aten.masked_fill.Scalar %518, %511, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %520 = torch.aten.broadcast_to %519, %111 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %521 = torch.aten.view %520, %128 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %522 = torch.aten.broadcast_to %497, %102 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %523 = torch.aten.view %522, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %524 = torch.aten.bmm %521, %523 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %525 = torch.aten.view %524, %102 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %526 = torch.aten.permute %525, %82 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %527 = torch.aten.clone %526, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %528 = torch.aten.view %527, %136 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %529 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %530 = torch.aten.view %528, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %531 = torch.aten.mm %530, %529 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %532 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %533 = torch.aten.add.Tensor %532, %531, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %534 = torch.aten.view %533, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %535 = torch.aten.add.Tensor %534, %476, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %536 = torch.aten.sum.dim_IntList %535, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %537 = torch.aten.div.Scalar %536, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.sub.Tensor %535, %537, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %539 = torch.aten.pow.Tensor_Scalar %538, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %540 = torch.aten.sum.dim_IntList %539, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %541 = torch.aten.div.Scalar %540, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %542 = torch.aten.sub.Tensor %535, %537, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %543 = torch.aten.add.Scalar %541, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %544 = torch.aten.sqrt %543 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %545 = torch.aten.div.Tensor %542, %544 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %546 = torch.aten.mul.Tensor %34, %545 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %547 = torch.aten.add.Tensor %546, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %548 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %549 = torch.aten.view %547, %75 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %550 = torch.aten.mm %549, %548 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %551 = torch.aten.mul.Scalar %28, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %552 = torch.aten.add.Tensor %551, %550, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %553 = torch.aten.view %552, %163 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %554 = torch.aten.gelu %553, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %555 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %556 = torch.aten.view %554, %167 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %557 = torch.aten.mm %556, %555 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %558 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %559 = torch.aten.add.Tensor %558, %557, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %560 = torch.aten.view %559, %143 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %561 = torch.aten.add.Tensor %560, %547, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %562 = torch.aten.sum.dim_IntList %561, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %563 = torch.aten.div.Scalar %562, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.sub.Tensor %561, %563, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %565 = torch.aten.pow.Tensor_Scalar %564, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %566 = torch.aten.sum.dim_IntList %565, %49, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %567 = torch.aten.div.Scalar %566, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %568 = torch.aten.sub.Tensor %561, %563, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.add.Scalar %567, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %570 = torch.aten.sqrt %569 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %571 = torch.aten.div.Tensor %568, %570 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %572 = torch.aten.mul.Tensor %34, %571 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %573 = torch.aten.add.Tensor %572, %33, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %574 = torch.aten.slice.Tensor %573, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %575 = torch.aten.slice.Tensor %574, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %576 = torch.aten.squeeze.dim %575, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %577 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %578 = torch.aten.mm %576, %577 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %579 = torch.aten.mul.Scalar %33, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %580 = torch.aten.add.Tensor %579, %578, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %581 = torch.aten.gelu %580, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %582 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %583 = torch.aten.mm %581, %582 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %584 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %585 = torch.aten.add.Tensor %584, %583, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %585 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ed8f50) {
  %20 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> !torch.vtensor<[37,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f3a660)
    ** Replace : 'torch.vtensor.literal'(0x8ed8f50)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f3a660) {
      %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %29 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %32 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %34 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %35 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %36 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %37 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %38 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %39 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %40 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %41 = torch.aten.ones %40, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %42 = torch.aten.zeros %40, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %43 = torch.aten.slice.Tensor %39, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %44 = torch.aten.slice.Tensor %43, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %45 = torch.aten.embedding %38, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %46 = torch.aten.embedding %37, %44, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.add.Tensor %45, %46, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.embedding %36, %42, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.add.Tensor %47, %48, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %51 = torch.aten.sum.dim_IntList %49, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %52 = torch.aten.div.Scalar %51, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.sub.Tensor %49, %52, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %54 = torch.aten.pow.Tensor_Scalar %53, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.sum.dim_IntList %54, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.div.Scalar %55, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %57 = torch.aten.sub.Tensor %49, %52, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.add.Scalar %56, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %59 = torch.aten.sqrt %58 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %60 = torch.aten.div.Tensor %57, %59 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %61 = torch.aten.mul.Tensor %35, %60 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %62 = torch.aten.add.Tensor %61, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %63 = torch.aten.unsqueeze %41, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %64 = torch.aten.mul.Tensor %62, %63 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %65 = torch.aten.unsqueeze %41, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %66 = torch.aten.unsqueeze %65, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %67 = torch.aten.squeeze.dim %66, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %68 = torch.aten.unsqueeze %67, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %69 = torch.aten.mul.Tensor %66, %68 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %70 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %71 = torch.aten.to.dtype %70, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %72 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %73 = torch.aten.broadcast_to %71, %72 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %74 = torch.aten.copy %73, %69, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %75 = torch.aten.transpose.int %33, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %76 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %77 = torch.aten.view %64, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %78 = torch.aten.mm %77, %75 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %79 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %80 = torch.aten.view %78, %79 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %81 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %82 = torch.aten.view %80, %81 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %83 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %84 = torch.aten.permute %82, %83 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %85 = torch.aten.slice.Tensor %84, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %86 = torch.aten.slice.Tensor %84, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %87 = torch.aten.slice.Tensor %84, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %88 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %89 = torch.aten.unsqueeze %88, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %90 = torch.aten.slice.Tensor %89, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %91 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %92 = torch.aten.view %90, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %93 = torch.aten.permute %92, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %94 = torch.aten.add.Tensor %85, %93, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %95 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %96 = torch.aten.unsqueeze %95, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %97 = torch.aten.slice.Tensor %96, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %98 = torch.aten.view %97, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %99 = torch.aten.permute %98, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %100 = torch.aten.add.Tensor %87, %99, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %101 = torch.aten.div.Scalar %94, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %102 = torch.aten.transpose.int %86, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %103 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %104 = torch.aten.broadcast_to %101, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %105 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %106 = torch.aten.view %104, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %107 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %108 = torch.aten.broadcast_to %102, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %109 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %110 = torch.aten.view %108, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %111 = torch.aten.bmm %106, %110 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %112 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %113 = torch.aten.view %111, %112 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %114 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %115 = torch.aten.to.dtype %114, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %116 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %117 = torch.aten.broadcast_to %115, %116 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %118 = torch.aten.copy %117, %74, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %119 = torch.aten.bitwise_not %118 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %120 = torch.aten.clone %32, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %121 = torch.aten.masked_fill.Tensor %113, %119, %120 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %121, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %122 = torch.aten.sub.Tensor %121, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %123 = torch.aten.exp %122 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %124 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %125 = torch.aten.sum.dim_IntList %123, %124, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %126 = torch.aten.div.Tensor %123, %125 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %127 = torch.aten.masked_fill.Scalar %126, %119, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %128 = torch.aten.broadcast_to %127, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %129 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %130 = torch.aten.view %128, %129 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %131 = torch.aten.broadcast_to %100, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %132 = torch.aten.view %131, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %133 = torch.aten.bmm %130, %132 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %134 = torch.aten.view %133, %103 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %135 = torch.aten.permute %134, %83 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %136 = torch.aten.clone %135, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %137 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %138 = torch.aten.view %136, %137 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %139 = torch.aten.transpose.int %31, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %140 = torch.aten.view %138, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %141 = torch.aten.mm %140, %139 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %142 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %143 = torch.aten.add.Tensor %142, %141, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %144 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %145 = torch.aten.view %143, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %146 = torch.aten.add.Tensor %145, %64, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %147 = torch.aten.sum.dim_IntList %146, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %148 = torch.aten.div.Scalar %147, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.sub.Tensor %146, %148, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %150 = torch.aten.pow.Tensor_Scalar %149, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.sum.dim_IntList %150, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %152 = torch.aten.div.Scalar %151, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %153 = torch.aten.sub.Tensor %146, %148, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %154 = torch.aten.add.Scalar %152, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %155 = torch.aten.sqrt %154 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %156 = torch.aten.div.Tensor %153, %155 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %157 = torch.aten.mul.Tensor %35, %156 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %158 = torch.aten.add.Tensor %157, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %159 = torch.aten.transpose.int %30, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %160 = torch.aten.view %158, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %161 = torch.aten.mm %160, %159 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %162 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %163 = torch.aten.add.Tensor %162, %161, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %164 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %165 = torch.aten.view %163, %164 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %166 = torch.aten.gelu %165, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %167 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %168 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %169 = torch.aten.view %166, %168 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %170 = torch.aten.mm %169, %167 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %171 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %172 = torch.aten.add.Tensor %171, %170, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %173 = torch.aten.view %172, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %174 = torch.aten.add.Tensor %173, %158, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %175 = torch.aten.sum.dim_IntList %174, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %176 = torch.aten.div.Scalar %175, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.sub.Tensor %174, %176, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %178 = torch.aten.pow.Tensor_Scalar %177, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.sum.dim_IntList %178, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %180 = torch.aten.div.Scalar %179, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %181 = torch.aten.sub.Tensor %174, %176, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %182 = torch.aten.add.Scalar %180, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %183 = torch.aten.sqrt %182 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %184 = torch.aten.div.Tensor %181, %183 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %185 = torch.aten.mul.Tensor %35, %184 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %186 = torch.aten.add.Tensor %185, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %187 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %188 = torch.aten.view %186, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %189 = torch.aten.mm %188, %187 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %190 = torch.aten.view %189, %79 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %191 = torch.aten.view %190, %81 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %192 = torch.aten.permute %191, %83 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %193 = torch.aten.slice.Tensor %192, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %194 = torch.aten.slice.Tensor %192, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %195 = torch.aten.slice.Tensor %192, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %196 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %197 = torch.aten.unsqueeze %196, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %198 = torch.aten.slice.Tensor %197, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %199 = torch.aten.view %198, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %200 = torch.aten.permute %199, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %201 = torch.aten.add.Tensor %193, %200, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %202 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %203 = torch.aten.unsqueeze %202, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %204 = torch.aten.slice.Tensor %203, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %205 = torch.aten.view %204, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %206 = torch.aten.permute %205, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %207 = torch.aten.add.Tensor %195, %206, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %208 = torch.aten.div.Scalar %201, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %209 = torch.aten.transpose.int %194, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %210 = torch.aten.broadcast_to %208, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %211 = torch.aten.view %210, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %212 = torch.aten.broadcast_to %209, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %213 = torch.aten.view %212, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %214 = torch.aten.bmm %211, %213 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %215 = torch.aten.view %214, %112 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %216 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %217 = torch.aten.to.dtype %216, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %218 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %219 = torch.aten.broadcast_to %217, %218 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %220 = torch.aten.copy %219, %74, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %221 = torch.aten.bitwise_not %220 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %222 = torch.aten.clone %32, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %223 = torch.aten.masked_fill.Tensor %215, %221, %222 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %223, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %224 = torch.aten.sub.Tensor %223, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %225 = torch.aten.exp %224 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %226 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %227 = torch.aten.sum.dim_IntList %225, %226, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %228 = torch.aten.div.Tensor %225, %227 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %229 = torch.aten.masked_fill.Scalar %228, %221, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %230 = torch.aten.broadcast_to %229, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %231 = torch.aten.view %230, %129 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %232 = torch.aten.broadcast_to %207, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %233 = torch.aten.view %232, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %234 = torch.aten.bmm %231, %233 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %235 = torch.aten.view %234, %103 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %236 = torch.aten.permute %235, %83 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %237 = torch.aten.clone %236, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %238 = torch.aten.view %237, %137 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %239 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %240 = torch.aten.view %238, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %241 = torch.aten.mm %240, %239 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %242 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %243 = torch.aten.add.Tensor %242, %241, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %244 = torch.aten.view %243, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %245 = torch.aten.add.Tensor %244, %186, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %246 = torch.aten.sum.dim_IntList %245, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %247 = torch.aten.div.Scalar %246, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.sub.Tensor %245, %247, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %249 = torch.aten.pow.Tensor_Scalar %248, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.sum.dim_IntList %249, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %251 = torch.aten.div.Scalar %250, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %252 = torch.aten.sub.Tensor %245, %247, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %253 = torch.aten.add.Scalar %251, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %254 = torch.aten.sqrt %253 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %255 = torch.aten.div.Tensor %252, %254 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %256 = torch.aten.mul.Tensor %35, %255 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %257 = torch.aten.add.Tensor %256, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %258 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %259 = torch.aten.view %257, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %260 = torch.aten.mm %259, %258 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %261 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %262 = torch.aten.add.Tensor %261, %260, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %263 = torch.aten.view %262, %164 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %264 = torch.aten.gelu %263, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %265 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %266 = torch.aten.view %264, %168 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %267 = torch.aten.mm %266, %265 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %268 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %269 = torch.aten.add.Tensor %268, %267, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %270 = torch.aten.view %269, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %271 = torch.aten.add.Tensor %270, %257, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %272 = torch.aten.sum.dim_IntList %271, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %273 = torch.aten.div.Scalar %272, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.sub.Tensor %271, %273, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %275 = torch.aten.pow.Tensor_Scalar %274, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.sum.dim_IntList %275, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %277 = torch.aten.div.Scalar %276, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %278 = torch.aten.sub.Tensor %271, %273, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %279 = torch.aten.add.Scalar %277, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %280 = torch.aten.sqrt %279 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %281 = torch.aten.div.Tensor %278, %280 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %282 = torch.aten.mul.Tensor %35, %281 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %283 = torch.aten.add.Tensor %282, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %284 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %285 = torch.aten.view %283, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %286 = torch.aten.mm %285, %284 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %287 = torch.aten.view %286, %79 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %288 = torch.aten.view %287, %81 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %289 = torch.aten.permute %288, %83 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %290 = torch.aten.slice.Tensor %289, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %291 = torch.aten.slice.Tensor %289, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %292 = torch.aten.slice.Tensor %289, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %293 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %294 = torch.aten.unsqueeze %293, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %295 = torch.aten.slice.Tensor %294, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %296 = torch.aten.view %295, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %297 = torch.aten.permute %296, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %298 = torch.aten.add.Tensor %290, %297, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %299 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %300 = torch.aten.unsqueeze %299, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %301 = torch.aten.slice.Tensor %300, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %302 = torch.aten.view %301, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %303 = torch.aten.permute %302, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %304 = torch.aten.add.Tensor %292, %303, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %305 = torch.aten.div.Scalar %298, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %306 = torch.aten.transpose.int %291, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %307 = torch.aten.broadcast_to %305, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %308 = torch.aten.view %307, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %309 = torch.aten.broadcast_to %306, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %310 = torch.aten.view %309, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %311 = torch.aten.bmm %308, %310 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %312 = torch.aten.view %311, %112 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %313 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %314 = torch.aten.to.dtype %313, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %315 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %316 = torch.aten.broadcast_to %314, %315 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %317 = torch.aten.copy %316, %74, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %318 = torch.aten.bitwise_not %317 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %319 = torch.aten.clone %32, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %320 = torch.aten.masked_fill.Tensor %312, %318, %319 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %320, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %321 = torch.aten.sub.Tensor %320, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %322 = torch.aten.exp %321 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %323 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %324 = torch.aten.sum.dim_IntList %322, %323, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %325 = torch.aten.div.Tensor %322, %324 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %326 = torch.aten.masked_fill.Scalar %325, %318, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %327 = torch.aten.broadcast_to %326, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %328 = torch.aten.view %327, %129 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %329 = torch.aten.broadcast_to %304, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %330 = torch.aten.view %329, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %331 = torch.aten.bmm %328, %330 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %332 = torch.aten.view %331, %103 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %333 = torch.aten.permute %332, %83 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %334 = torch.aten.clone %333, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %335 = torch.aten.view %334, %137 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %336 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %337 = torch.aten.view %335, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %338 = torch.aten.mm %337, %336 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %339 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %340 = torch.aten.add.Tensor %339, %338, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %341 = torch.aten.view %340, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %342 = torch.aten.add.Tensor %341, %283, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %343 = torch.aten.sum.dim_IntList %342, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %344 = torch.aten.div.Scalar %343, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.sub.Tensor %342, %344, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %346 = torch.aten.pow.Tensor_Scalar %345, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.sum.dim_IntList %346, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %348 = torch.aten.div.Scalar %347, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %349 = torch.aten.sub.Tensor %342, %344, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %350 = torch.aten.add.Scalar %348, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %351 = torch.aten.sqrt %350 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %352 = torch.aten.div.Tensor %349, %351 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %353 = torch.aten.mul.Tensor %35, %352 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %354 = torch.aten.add.Tensor %353, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %355 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %356 = torch.aten.view %354, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %357 = torch.aten.mm %356, %355 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %358 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %359 = torch.aten.add.Tensor %358, %357, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %360 = torch.aten.view %359, %164 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %361 = torch.aten.gelu %360, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %362 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %363 = torch.aten.view %361, %168 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %364 = torch.aten.mm %363, %362 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %365 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %366 = torch.aten.add.Tensor %365, %364, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %367 = torch.aten.view %366, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %368 = torch.aten.add.Tensor %367, %354, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %369 = torch.aten.sum.dim_IntList %368, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %370 = torch.aten.div.Scalar %369, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.sub.Tensor %368, %370, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %372 = torch.aten.pow.Tensor_Scalar %371, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.sum.dim_IntList %372, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %374 = torch.aten.div.Scalar %373, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %375 = torch.aten.sub.Tensor %368, %370, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %376 = torch.aten.add.Scalar %374, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %377 = torch.aten.sqrt %376 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %378 = torch.aten.div.Tensor %375, %377 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %379 = torch.aten.mul.Tensor %35, %378 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %380 = torch.aten.add.Tensor %379, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %381 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %382 = torch.aten.view %380, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %383 = torch.aten.mm %382, %381 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %384 = torch.aten.view %383, %79 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %385 = torch.aten.view %384, %81 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %386 = torch.aten.permute %385, %83 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %387 = torch.aten.slice.Tensor %386, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %388 = torch.aten.slice.Tensor %386, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %389 = torch.aten.slice.Tensor %386, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %390 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %391 = torch.aten.unsqueeze %390, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %392 = torch.aten.slice.Tensor %391, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %393 = torch.aten.view %392, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %394 = torch.aten.permute %393, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %395 = torch.aten.add.Tensor %387, %394, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %396 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %397 = torch.aten.unsqueeze %396, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %398 = torch.aten.slice.Tensor %397, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %399 = torch.aten.view %398, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %400 = torch.aten.permute %399, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %401 = torch.aten.add.Tensor %389, %400, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %402 = torch.aten.div.Scalar %395, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %403 = torch.aten.transpose.int %388, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %404 = torch.aten.broadcast_to %402, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %405 = torch.aten.view %404, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %406 = torch.aten.broadcast_to %403, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %407 = torch.aten.view %406, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %408 = torch.aten.bmm %405, %407 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %409 = torch.aten.view %408, %112 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %410 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %411 = torch.aten.to.dtype %410, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %412 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %413 = torch.aten.broadcast_to %411, %412 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %414 = torch.aten.copy %413, %74, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %415 = torch.aten.bitwise_not %414 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %416 = torch.aten.clone %32, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %417 = torch.aten.masked_fill.Tensor %409, %415, %416 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %417, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %418 = torch.aten.sub.Tensor %417, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %419 = torch.aten.exp %418 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %420 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %421 = torch.aten.sum.dim_IntList %419, %420, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %422 = torch.aten.div.Tensor %419, %421 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %423 = torch.aten.masked_fill.Scalar %422, %415, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %424 = torch.aten.broadcast_to %423, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %425 = torch.aten.view %424, %129 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %426 = torch.aten.broadcast_to %401, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %427 = torch.aten.view %426, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %428 = torch.aten.bmm %425, %427 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %429 = torch.aten.view %428, %103 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %430 = torch.aten.permute %429, %83 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %431 = torch.aten.clone %430, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %432 = torch.aten.view %431, %137 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %433 = torch.aten.transpose.int %18, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %434 = torch.aten.view %432, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %435 = torch.aten.mm %434, %433 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %436 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %437 = torch.aten.add.Tensor %436, %435, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %438 = torch.aten.view %437, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %439 = torch.aten.add.Tensor %438, %380, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %440 = torch.aten.sum.dim_IntList %439, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %441 = torch.aten.div.Scalar %440, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.sub.Tensor %439, %441, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %443 = torch.aten.pow.Tensor_Scalar %442, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.sum.dim_IntList %443, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %445 = torch.aten.div.Scalar %444, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %446 = torch.aten.sub.Tensor %439, %441, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %447 = torch.aten.add.Scalar %445, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %448 = torch.aten.sqrt %447 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %449 = torch.aten.div.Tensor %446, %448 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %450 = torch.aten.mul.Tensor %35, %449 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %451 = torch.aten.add.Tensor %450, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %452 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %453 = torch.aten.view %451, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %454 = torch.aten.mm %453, %452 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %455 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %456 = torch.aten.add.Tensor %455, %454, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %457 = torch.aten.view %456, %164 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %458 = torch.aten.gelu %457, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %459 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %460 = torch.aten.view %458, %168 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %461 = torch.aten.mm %460, %459 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %462 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %463 = torch.aten.add.Tensor %462, %461, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %464 = torch.aten.view %463, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %465 = torch.aten.add.Tensor %464, %451, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %466 = torch.aten.sum.dim_IntList %465, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %467 = torch.aten.div.Scalar %466, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.sub.Tensor %465, %467, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %469 = torch.aten.pow.Tensor_Scalar %468, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.sum.dim_IntList %469, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %471 = torch.aten.div.Scalar %470, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %472 = torch.aten.sub.Tensor %465, %467, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %473 = torch.aten.add.Scalar %471, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %474 = torch.aten.sqrt %473 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %475 = torch.aten.div.Tensor %472, %474 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %476 = torch.aten.mul.Tensor %35, %475 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %477 = torch.aten.add.Tensor %476, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %478 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %479 = torch.aten.view %477, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %480 = torch.aten.mm %479, %478 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %481 = torch.aten.view %480, %79 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %482 = torch.aten.view %481, %81 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %483 = torch.aten.permute %482, %83 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %484 = torch.aten.slice.Tensor %483, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %485 = torch.aten.slice.Tensor %483, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %486 = torch.aten.slice.Tensor %483, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %487 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %488 = torch.aten.unsqueeze %487, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %489 = torch.aten.slice.Tensor %488, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %490 = torch.aten.view %489, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %491 = torch.aten.permute %490, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %492 = torch.aten.add.Tensor %484, %491, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %493 = torch.aten.unsqueeze %34, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %494 = torch.aten.unsqueeze %493, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %495 = torch.aten.slice.Tensor %494, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %496 = torch.aten.view %495, %91 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %497 = torch.aten.permute %496, %83 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %498 = torch.aten.add.Tensor %486, %497, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %499 = torch.aten.div.Scalar %492, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %500 = torch.aten.transpose.int %485, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %501 = torch.aten.broadcast_to %499, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %502 = torch.aten.view %501, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %503 = torch.aten.broadcast_to %500, %107 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %504 = torch.aten.view %503, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %505 = torch.aten.bmm %502, %504 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %506 = torch.aten.view %505, %112 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %507 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %508 = torch.aten.to.dtype %507, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %509 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %510 = torch.aten.broadcast_to %508, %509 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %511 = torch.aten.copy %510, %74, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %512 = torch.aten.bitwise_not %511 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %513 = torch.aten.clone %32, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %514 = torch.aten.masked_fill.Tensor %506, %512, %513 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %514, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %515 = torch.aten.sub.Tensor %514, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %516 = torch.aten.exp %515 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %517 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %518 = torch.aten.sum.dim_IntList %516, %517, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %519 = torch.aten.div.Tensor %516, %518 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %520 = torch.aten.masked_fill.Scalar %519, %512, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %521 = torch.aten.broadcast_to %520, %112 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %522 = torch.aten.view %521, %129 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %523 = torch.aten.broadcast_to %498, %103 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %524 = torch.aten.view %523, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %525 = torch.aten.bmm %522, %524 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %526 = torch.aten.view %525, %103 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %527 = torch.aten.permute %526, %83 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %528 = torch.aten.clone %527, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %529 = torch.aten.view %528, %137 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %530 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %531 = torch.aten.view %529, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %532 = torch.aten.mm %531, %530 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %533 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %534 = torch.aten.add.Tensor %533, %532, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %535 = torch.aten.view %534, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %536 = torch.aten.add.Tensor %535, %477, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %537 = torch.aten.sum.dim_IntList %536, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %538 = torch.aten.div.Scalar %537, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.sub.Tensor %536, %538, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %540 = torch.aten.pow.Tensor_Scalar %539, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.sum.dim_IntList %540, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %542 = torch.aten.div.Scalar %541, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %543 = torch.aten.sub.Tensor %536, %538, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %544 = torch.aten.add.Scalar %542, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %545 = torch.aten.sqrt %544 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %546 = torch.aten.div.Tensor %543, %545 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %547 = torch.aten.mul.Tensor %35, %546 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %548 = torch.aten.add.Tensor %547, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %549 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %550 = torch.aten.view %548, %76 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %551 = torch.aten.mm %550, %549 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %552 = torch.aten.mul.Scalar %29, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %553 = torch.aten.add.Tensor %552, %551, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %554 = torch.aten.view %553, %164 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %555 = torch.aten.gelu %554, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %556 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %557 = torch.aten.view %555, %168 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %558 = torch.aten.mm %557, %556 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %559 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %560 = torch.aten.add.Tensor %559, %558, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %561 = torch.aten.view %560, %144 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %562 = torch.aten.add.Tensor %561, %548, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %563 = torch.aten.sum.dim_IntList %562, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %564 = torch.aten.div.Scalar %563, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.sub.Tensor %562, %564, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %566 = torch.aten.pow.Tensor_Scalar %565, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.sum.dim_IntList %566, %50, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %568 = torch.aten.div.Scalar %567, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %569 = torch.aten.sub.Tensor %562, %564, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %570 = torch.aten.add.Scalar %568, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %571 = torch.aten.sqrt %570 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %572 = torch.aten.div.Tensor %569, %571 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %573 = torch.aten.mul.Tensor %35, %572 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %574 = torch.aten.add.Tensor %573, %34, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %575 = torch.aten.slice.Tensor %574, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %576 = torch.aten.slice.Tensor %575, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %577 = torch.aten.squeeze.dim %576, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %578 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %579 = torch.aten.mm %577, %578 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %580 = torch.aten.mul.Scalar %34, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %581 = torch.aten.add.Tensor %580, %579, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %582 = torch.aten.gelu %581, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %583 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %584 = torch.aten.mm %582, %583 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %585 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %586 = torch.aten.add.Tensor %585, %584, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %586 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8ed9050) {
  %22 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> !torch.vtensor<[32,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f574b0)
    ** Replace : 'torch.vtensor.literal'(0x8ed9050)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f574b0) {
      %22 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %20 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %30 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %33 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %35 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %36 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %37 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %38 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %39 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %40 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %41 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %42 = torch.aten.ones %41, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %43 = torch.aten.zeros %41, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %44 = torch.aten.slice.Tensor %40, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %45 = torch.aten.slice.Tensor %44, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %46 = torch.aten.embedding %39, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %47 = torch.aten.embedding %38, %45, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.add.Tensor %46, %47, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.embedding %37, %43, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.add.Tensor %48, %49, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %52 = torch.aten.sum.dim_IntList %50, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %53 = torch.aten.div.Scalar %52, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.sub.Tensor %50, %53, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %55 = torch.aten.pow.Tensor_Scalar %54, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.sum.dim_IntList %55, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %57 = torch.aten.div.Scalar %56, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %58 = torch.aten.sub.Tensor %50, %53, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %59 = torch.aten.add.Scalar %57, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %60 = torch.aten.sqrt %59 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %61 = torch.aten.div.Tensor %58, %60 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %62 = torch.aten.mul.Tensor %36, %61 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %63 = torch.aten.add.Tensor %62, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %64 = torch.aten.unsqueeze %42, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %65 = torch.aten.mul.Tensor %63, %64 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %66 = torch.aten.unsqueeze %42, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %67 = torch.aten.unsqueeze %66, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %68 = torch.aten.squeeze.dim %67, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %69 = torch.aten.unsqueeze %68, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %70 = torch.aten.mul.Tensor %67, %69 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %71 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %72 = torch.aten.to.dtype %71, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %73 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %74 = torch.aten.broadcast_to %72, %73 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %75 = torch.aten.copy %74, %70, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %76 = torch.aten.transpose.int %34, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %77 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %78 = torch.aten.view %65, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %79 = torch.aten.mm %78, %76 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %80 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %81 = torch.aten.view %79, %80 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %82 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %83 = torch.aten.view %81, %82 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %84 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %85 = torch.aten.permute %83, %84 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %86 = torch.aten.slice.Tensor %85, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %87 = torch.aten.slice.Tensor %85, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %88 = torch.aten.slice.Tensor %85, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %89 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %90 = torch.aten.unsqueeze %89, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %91 = torch.aten.slice.Tensor %90, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %92 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %93 = torch.aten.view %91, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %94 = torch.aten.permute %93, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %95 = torch.aten.add.Tensor %86, %94, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %96 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %97 = torch.aten.unsqueeze %96, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %98 = torch.aten.slice.Tensor %97, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %99 = torch.aten.view %98, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %100 = torch.aten.permute %99, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %101 = torch.aten.add.Tensor %88, %100, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %102 = torch.aten.div.Scalar %95, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %103 = torch.aten.transpose.int %87, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %104 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %105 = torch.aten.broadcast_to %102, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %106 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %107 = torch.aten.view %105, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %108 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %109 = torch.aten.broadcast_to %103, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %110 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %111 = torch.aten.view %109, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %112 = torch.aten.bmm %107, %111 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %113 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %114 = torch.aten.view %112, %113 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %115 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %116 = torch.aten.to.dtype %115, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %117 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %118 = torch.aten.broadcast_to %116, %117 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %119 = torch.aten.copy %118, %75, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %120 = torch.aten.bitwise_not %119 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %121 = torch.aten.clone %33, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %122 = torch.aten.masked_fill.Tensor %114, %120, %121 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %122, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %123 = torch.aten.sub.Tensor %122, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %124 = torch.aten.exp %123 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %125 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %126 = torch.aten.sum.dim_IntList %124, %125, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %127 = torch.aten.div.Tensor %124, %126 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %128 = torch.aten.masked_fill.Scalar %127, %120, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %129 = torch.aten.broadcast_to %128, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %130 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %131 = torch.aten.view %129, %130 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %132 = torch.aten.broadcast_to %101, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %133 = torch.aten.view %132, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %134 = torch.aten.bmm %131, %133 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %135 = torch.aten.view %134, %104 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %136 = torch.aten.permute %135, %84 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %137 = torch.aten.clone %136, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %138 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %139 = torch.aten.view %137, %138 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %140 = torch.aten.transpose.int %32, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %141 = torch.aten.view %139, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %142 = torch.aten.mm %141, %140 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %143 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %144 = torch.aten.add.Tensor %143, %142, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %145 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %146 = torch.aten.view %144, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %147 = torch.aten.add.Tensor %146, %65, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %148 = torch.aten.sum.dim_IntList %147, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %149 = torch.aten.div.Scalar %148, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.sub.Tensor %147, %149, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %151 = torch.aten.pow.Tensor_Scalar %150, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.sum.dim_IntList %151, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %153 = torch.aten.div.Scalar %152, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %154 = torch.aten.sub.Tensor %147, %149, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %155 = torch.aten.add.Scalar %153, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %156 = torch.aten.sqrt %155 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %157 = torch.aten.div.Tensor %154, %156 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %158 = torch.aten.mul.Tensor %36, %157 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %159 = torch.aten.add.Tensor %158, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %160 = torch.aten.transpose.int %31, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %161 = torch.aten.view %159, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %162 = torch.aten.mm %161, %160 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %163 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %164 = torch.aten.add.Tensor %163, %162, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %165 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %166 = torch.aten.view %164, %165 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %167 = torch.aten.gelu %166, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %168 = torch.aten.transpose.int %29, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %169 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %170 = torch.aten.view %167, %169 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %171 = torch.aten.mm %170, %168 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %172 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %173 = torch.aten.add.Tensor %172, %171, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %174 = torch.aten.view %173, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %175 = torch.aten.add.Tensor %174, %159, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %176 = torch.aten.sum.dim_IntList %175, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %177 = torch.aten.div.Scalar %176, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.sub.Tensor %175, %177, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %179 = torch.aten.pow.Tensor_Scalar %178, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.sum.dim_IntList %179, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %181 = torch.aten.div.Scalar %180, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %182 = torch.aten.sub.Tensor %175, %177, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %183 = torch.aten.add.Scalar %181, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %184 = torch.aten.sqrt %183 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %185 = torch.aten.div.Tensor %182, %184 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %186 = torch.aten.mul.Tensor %36, %185 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %187 = torch.aten.add.Tensor %186, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %188 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %189 = torch.aten.view %187, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %190 = torch.aten.mm %189, %188 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %191 = torch.aten.view %190, %80 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %192 = torch.aten.view %191, %82 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %193 = torch.aten.permute %192, %84 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %194 = torch.aten.slice.Tensor %193, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %195 = torch.aten.slice.Tensor %193, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %196 = torch.aten.slice.Tensor %193, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %197 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %198 = torch.aten.unsqueeze %197, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %199 = torch.aten.slice.Tensor %198, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %200 = torch.aten.view %199, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %201 = torch.aten.permute %200, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %202 = torch.aten.add.Tensor %194, %201, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %203 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %204 = torch.aten.unsqueeze %203, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %205 = torch.aten.slice.Tensor %204, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %206 = torch.aten.view %205, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %207 = torch.aten.permute %206, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %208 = torch.aten.add.Tensor %196, %207, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %209 = torch.aten.div.Scalar %202, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %210 = torch.aten.transpose.int %195, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %211 = torch.aten.broadcast_to %209, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %212 = torch.aten.view %211, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %213 = torch.aten.broadcast_to %210, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %214 = torch.aten.view %213, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %215 = torch.aten.bmm %212, %214 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %216 = torch.aten.view %215, %113 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %217 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %218 = torch.aten.to.dtype %217, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %219 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %220 = torch.aten.broadcast_to %218, %219 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %221 = torch.aten.copy %220, %75, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %222 = torch.aten.bitwise_not %221 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %223 = torch.aten.clone %33, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %224 = torch.aten.masked_fill.Tensor %216, %222, %223 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %224, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %225 = torch.aten.sub.Tensor %224, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %226 = torch.aten.exp %225 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %227 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %228 = torch.aten.sum.dim_IntList %226, %227, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %229 = torch.aten.div.Tensor %226, %228 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %230 = torch.aten.masked_fill.Scalar %229, %222, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %231 = torch.aten.broadcast_to %230, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %232 = torch.aten.view %231, %130 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %233 = torch.aten.broadcast_to %208, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %234 = torch.aten.view %233, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %235 = torch.aten.bmm %232, %234 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %236 = torch.aten.view %235, %104 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %237 = torch.aten.permute %236, %84 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %238 = torch.aten.clone %237, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %239 = torch.aten.view %238, %138 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %240 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %241 = torch.aten.view %239, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %242 = torch.aten.mm %241, %240 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %243 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %244 = torch.aten.add.Tensor %243, %242, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %245 = torch.aten.view %244, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %246 = torch.aten.add.Tensor %245, %187, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %247 = torch.aten.sum.dim_IntList %246, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %248 = torch.aten.div.Scalar %247, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.sub.Tensor %246, %248, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %250 = torch.aten.pow.Tensor_Scalar %249, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.sum.dim_IntList %250, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %252 = torch.aten.div.Scalar %251, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %253 = torch.aten.sub.Tensor %246, %248, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %254 = torch.aten.add.Scalar %252, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %255 = torch.aten.sqrt %254 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %256 = torch.aten.div.Tensor %253, %255 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %257 = torch.aten.mul.Tensor %36, %256 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %258 = torch.aten.add.Tensor %257, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %259 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %260 = torch.aten.view %258, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %261 = torch.aten.mm %260, %259 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %262 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %263 = torch.aten.add.Tensor %262, %261, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %264 = torch.aten.view %263, %165 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %265 = torch.aten.gelu %264, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %266 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %267 = torch.aten.view %265, %169 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %268 = torch.aten.mm %267, %266 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %269 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %270 = torch.aten.add.Tensor %269, %268, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %271 = torch.aten.view %270, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %272 = torch.aten.add.Tensor %271, %258, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %273 = torch.aten.sum.dim_IntList %272, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %274 = torch.aten.div.Scalar %273, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.sub.Tensor %272, %274, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %276 = torch.aten.pow.Tensor_Scalar %275, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.sum.dim_IntList %276, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %278 = torch.aten.div.Scalar %277, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %279 = torch.aten.sub.Tensor %272, %274, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %280 = torch.aten.add.Scalar %278, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %281 = torch.aten.sqrt %280 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %282 = torch.aten.div.Tensor %279, %281 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %283 = torch.aten.mul.Tensor %36, %282 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %284 = torch.aten.add.Tensor %283, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %285 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %286 = torch.aten.view %284, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %287 = torch.aten.mm %286, %285 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %288 = torch.aten.view %287, %80 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %289 = torch.aten.view %288, %82 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %290 = torch.aten.permute %289, %84 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %291 = torch.aten.slice.Tensor %290, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %292 = torch.aten.slice.Tensor %290, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %293 = torch.aten.slice.Tensor %290, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %294 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %295 = torch.aten.unsqueeze %294, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %296 = torch.aten.slice.Tensor %295, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %297 = torch.aten.view %296, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %298 = torch.aten.permute %297, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %299 = torch.aten.add.Tensor %291, %298, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %300 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %301 = torch.aten.unsqueeze %300, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %302 = torch.aten.slice.Tensor %301, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %303 = torch.aten.view %302, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %304 = torch.aten.permute %303, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %305 = torch.aten.add.Tensor %293, %304, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %306 = torch.aten.div.Scalar %299, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %307 = torch.aten.transpose.int %292, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %308 = torch.aten.broadcast_to %306, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %309 = torch.aten.view %308, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %310 = torch.aten.broadcast_to %307, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %311 = torch.aten.view %310, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %312 = torch.aten.bmm %309, %311 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %313 = torch.aten.view %312, %113 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %314 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %315 = torch.aten.to.dtype %314, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %316 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %317 = torch.aten.broadcast_to %315, %316 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %318 = torch.aten.copy %317, %75, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %319 = torch.aten.bitwise_not %318 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %320 = torch.aten.clone %33, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %321 = torch.aten.masked_fill.Tensor %313, %319, %320 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %321, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %322 = torch.aten.sub.Tensor %321, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %323 = torch.aten.exp %322 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %324 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %325 = torch.aten.sum.dim_IntList %323, %324, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %326 = torch.aten.div.Tensor %323, %325 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %327 = torch.aten.masked_fill.Scalar %326, %319, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %328 = torch.aten.broadcast_to %327, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %329 = torch.aten.view %328, %130 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %330 = torch.aten.broadcast_to %305, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %331 = torch.aten.view %330, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %332 = torch.aten.bmm %329, %331 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %333 = torch.aten.view %332, %104 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %334 = torch.aten.permute %333, %84 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %335 = torch.aten.clone %334, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %336 = torch.aten.view %335, %138 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %337 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %338 = torch.aten.view %336, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %339 = torch.aten.mm %338, %337 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %340 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %341 = torch.aten.add.Tensor %340, %339, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %342 = torch.aten.view %341, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %343 = torch.aten.add.Tensor %342, %284, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %344 = torch.aten.sum.dim_IntList %343, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %345 = torch.aten.div.Scalar %344, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.sub.Tensor %343, %345, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %347 = torch.aten.pow.Tensor_Scalar %346, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.sum.dim_IntList %347, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %349 = torch.aten.div.Scalar %348, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %350 = torch.aten.sub.Tensor %343, %345, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %351 = torch.aten.add.Scalar %349, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %352 = torch.aten.sqrt %351 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %353 = torch.aten.div.Tensor %350, %352 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %354 = torch.aten.mul.Tensor %36, %353 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %355 = torch.aten.add.Tensor %354, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %356 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %357 = torch.aten.view %355, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %358 = torch.aten.mm %357, %356 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %359 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %360 = torch.aten.add.Tensor %359, %358, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %361 = torch.aten.view %360, %165 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %362 = torch.aten.gelu %361, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %363 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %364 = torch.aten.view %362, %169 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %365 = torch.aten.mm %364, %363 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %366 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %367 = torch.aten.add.Tensor %366, %365, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %368 = torch.aten.view %367, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %369 = torch.aten.add.Tensor %368, %355, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %370 = torch.aten.sum.dim_IntList %369, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %371 = torch.aten.div.Scalar %370, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.sub.Tensor %369, %371, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %373 = torch.aten.pow.Tensor_Scalar %372, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.sum.dim_IntList %373, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %375 = torch.aten.div.Scalar %374, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %376 = torch.aten.sub.Tensor %369, %371, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %377 = torch.aten.add.Scalar %375, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %378 = torch.aten.sqrt %377 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %379 = torch.aten.div.Tensor %376, %378 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %380 = torch.aten.mul.Tensor %36, %379 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %381 = torch.aten.add.Tensor %380, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %382 = torch.aten.transpose.int %20, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %383 = torch.aten.view %381, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %384 = torch.aten.mm %383, %382 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %385 = torch.aten.view %384, %80 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %386 = torch.aten.view %385, %82 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %387 = torch.aten.permute %386, %84 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %388 = torch.aten.slice.Tensor %387, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %389 = torch.aten.slice.Tensor %387, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %390 = torch.aten.slice.Tensor %387, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %391 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %392 = torch.aten.unsqueeze %391, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %393 = torch.aten.slice.Tensor %392, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %394 = torch.aten.view %393, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %395 = torch.aten.permute %394, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %396 = torch.aten.add.Tensor %388, %395, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %397 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %398 = torch.aten.unsqueeze %397, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %399 = torch.aten.slice.Tensor %398, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %400 = torch.aten.view %399, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %401 = torch.aten.permute %400, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %402 = torch.aten.add.Tensor %390, %401, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %403 = torch.aten.div.Scalar %396, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %404 = torch.aten.transpose.int %389, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %405 = torch.aten.broadcast_to %403, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %406 = torch.aten.view %405, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %407 = torch.aten.broadcast_to %404, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %408 = torch.aten.view %407, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %409 = torch.aten.bmm %406, %408 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %410 = torch.aten.view %409, %113 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %411 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %412 = torch.aten.to.dtype %411, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %413 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %414 = torch.aten.broadcast_to %412, %413 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %415 = torch.aten.copy %414, %75, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %416 = torch.aten.bitwise_not %415 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %417 = torch.aten.clone %33, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %418 = torch.aten.masked_fill.Tensor %410, %416, %417 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %418, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %419 = torch.aten.sub.Tensor %418, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %420 = torch.aten.exp %419 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %421 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %422 = torch.aten.sum.dim_IntList %420, %421, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %423 = torch.aten.div.Tensor %420, %422 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %424 = torch.aten.masked_fill.Scalar %423, %416, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %425 = torch.aten.broadcast_to %424, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %426 = torch.aten.view %425, %130 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %427 = torch.aten.broadcast_to %402, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %428 = torch.aten.view %427, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %429 = torch.aten.bmm %426, %428 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %430 = torch.aten.view %429, %104 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %431 = torch.aten.permute %430, %84 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %432 = torch.aten.clone %431, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %433 = torch.aten.view %432, %138 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %434 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %435 = torch.aten.view %433, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %436 = torch.aten.mm %435, %434 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %437 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %438 = torch.aten.add.Tensor %437, %436, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %439 = torch.aten.view %438, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %440 = torch.aten.add.Tensor %439, %381, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %441 = torch.aten.sum.dim_IntList %440, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %442 = torch.aten.div.Scalar %441, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.sub.Tensor %440, %442, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %444 = torch.aten.pow.Tensor_Scalar %443, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.sum.dim_IntList %444, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %446 = torch.aten.div.Scalar %445, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %447 = torch.aten.sub.Tensor %440, %442, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %448 = torch.aten.add.Scalar %446, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %449 = torch.aten.sqrt %448 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %450 = torch.aten.div.Tensor %447, %449 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %451 = torch.aten.mul.Tensor %36, %450 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %452 = torch.aten.add.Tensor %451, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %453 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %454 = torch.aten.view %452, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %455 = torch.aten.mm %454, %453 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %456 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %457 = torch.aten.add.Tensor %456, %455, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %458 = torch.aten.view %457, %165 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %459 = torch.aten.gelu %458, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %460 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %461 = torch.aten.view %459, %169 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %462 = torch.aten.mm %461, %460 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %463 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %464 = torch.aten.add.Tensor %463, %462, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %465 = torch.aten.view %464, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %466 = torch.aten.add.Tensor %465, %452, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %467 = torch.aten.sum.dim_IntList %466, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %468 = torch.aten.div.Scalar %467, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.sub.Tensor %466, %468, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %470 = torch.aten.pow.Tensor_Scalar %469, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.sum.dim_IntList %470, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %472 = torch.aten.div.Scalar %471, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %473 = torch.aten.sub.Tensor %466, %468, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %474 = torch.aten.add.Scalar %472, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %475 = torch.aten.sqrt %474 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %476 = torch.aten.div.Tensor %473, %475 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %477 = torch.aten.mul.Tensor %36, %476 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %478 = torch.aten.add.Tensor %477, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %479 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %480 = torch.aten.view %478, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %481 = torch.aten.mm %480, %479 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %482 = torch.aten.view %481, %80 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %483 = torch.aten.view %482, %82 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %484 = torch.aten.permute %483, %84 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %485 = torch.aten.slice.Tensor %484, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %486 = torch.aten.slice.Tensor %484, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %487 = torch.aten.slice.Tensor %484, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %488 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %489 = torch.aten.unsqueeze %488, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %490 = torch.aten.slice.Tensor %489, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %491 = torch.aten.view %490, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %492 = torch.aten.permute %491, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %493 = torch.aten.add.Tensor %485, %492, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %494 = torch.aten.unsqueeze %35, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %495 = torch.aten.unsqueeze %494, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %496 = torch.aten.slice.Tensor %495, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %497 = torch.aten.view %496, %92 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %498 = torch.aten.permute %497, %84 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %499 = torch.aten.add.Tensor %487, %498, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %500 = torch.aten.div.Scalar %493, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %501 = torch.aten.transpose.int %486, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %502 = torch.aten.broadcast_to %500, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %503 = torch.aten.view %502, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %504 = torch.aten.broadcast_to %501, %108 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %505 = torch.aten.view %504, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %506 = torch.aten.bmm %503, %505 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %507 = torch.aten.view %506, %113 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %508 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %509 = torch.aten.to.dtype %508, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %510 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %511 = torch.aten.broadcast_to %509, %510 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %512 = torch.aten.copy %511, %75, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %513 = torch.aten.bitwise_not %512 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %514 = torch.aten.clone %33, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %515 = torch.aten.masked_fill.Tensor %507, %513, %514 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %515, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %516 = torch.aten.sub.Tensor %515, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %517 = torch.aten.exp %516 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %518 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %519 = torch.aten.sum.dim_IntList %517, %518, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %520 = torch.aten.div.Tensor %517, %519 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %521 = torch.aten.masked_fill.Scalar %520, %513, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %522 = torch.aten.broadcast_to %521, %113 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %523 = torch.aten.view %522, %130 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %524 = torch.aten.broadcast_to %499, %104 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %525 = torch.aten.view %524, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %526 = torch.aten.bmm %523, %525 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %527 = torch.aten.view %526, %104 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %528 = torch.aten.permute %527, %84 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %529 = torch.aten.clone %528, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %530 = torch.aten.view %529, %138 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %531 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %532 = torch.aten.view %530, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %533 = torch.aten.mm %532, %531 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %534 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %535 = torch.aten.add.Tensor %534, %533, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %536 = torch.aten.view %535, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %537 = torch.aten.add.Tensor %536, %478, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %538 = torch.aten.sum.dim_IntList %537, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %539 = torch.aten.div.Scalar %538, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.sub.Tensor %537, %539, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %541 = torch.aten.pow.Tensor_Scalar %540, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.sum.dim_IntList %541, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %543 = torch.aten.div.Scalar %542, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %544 = torch.aten.sub.Tensor %537, %539, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %545 = torch.aten.add.Scalar %543, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %546 = torch.aten.sqrt %545 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %547 = torch.aten.div.Tensor %544, %546 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %548 = torch.aten.mul.Tensor %36, %547 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %549 = torch.aten.add.Tensor %548, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %550 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %551 = torch.aten.view %549, %77 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %552 = torch.aten.mm %551, %550 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %553 = torch.aten.mul.Scalar %30, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %554 = torch.aten.add.Tensor %553, %552, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %555 = torch.aten.view %554, %165 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %556 = torch.aten.gelu %555, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %557 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %558 = torch.aten.view %556, %169 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %559 = torch.aten.mm %558, %557 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %560 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %561 = torch.aten.add.Tensor %560, %559, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %562 = torch.aten.view %561, %145 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %563 = torch.aten.add.Tensor %562, %549, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %564 = torch.aten.sum.dim_IntList %563, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %565 = torch.aten.div.Scalar %564, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.sub.Tensor %563, %565, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %567 = torch.aten.pow.Tensor_Scalar %566, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.sum.dim_IntList %567, %51, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %569 = torch.aten.div.Scalar %568, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %570 = torch.aten.sub.Tensor %563, %565, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %571 = torch.aten.add.Scalar %569, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %572 = torch.aten.sqrt %571 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %573 = torch.aten.div.Tensor %570, %572 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %574 = torch.aten.mul.Tensor %36, %573 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %575 = torch.aten.add.Tensor %574, %35, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %576 = torch.aten.slice.Tensor %575, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %577 = torch.aten.slice.Tensor %576, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %578 = torch.aten.squeeze.dim %577, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %579 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %580 = torch.aten.mm %578, %579 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %581 = torch.aten.mul.Scalar %35, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %582 = torch.aten.add.Tensor %581, %580, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %583 = torch.aten.gelu %582, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %584 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %585 = torch.aten.mm %583, %584 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %586 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %587 = torch.aten.add.Tensor %586, %585, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %587 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8efdf00) {
  %24 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> !torch.vtensor<[96,32],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f3c860)
    ** Replace : 'torch.vtensor.literal'(0x8efdf00)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f3c860) {
      %24 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %22 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %31 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %32 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %34 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %35 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %36 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %37 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %38 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %39 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %40 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %41 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %42 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %43 = torch.aten.ones %42, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %44 = torch.aten.zeros %42, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %45 = torch.aten.slice.Tensor %41, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %46 = torch.aten.slice.Tensor %45, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %47 = torch.aten.embedding %40, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %48 = torch.aten.embedding %39, %46, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.add.Tensor %47, %48, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.embedding %38, %44, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.aten.add.Tensor %49, %50, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %52 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %53 = torch.aten.sum.dim_IntList %51, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %54 = torch.aten.div.Scalar %53, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.sub.Tensor %51, %54, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %56 = torch.aten.pow.Tensor_Scalar %55, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.sum.dim_IntList %56, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %58 = torch.aten.div.Scalar %57, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %59 = torch.aten.sub.Tensor %51, %54, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %60 = torch.aten.add.Scalar %58, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %61 = torch.aten.sqrt %60 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %62 = torch.aten.div.Tensor %59, %61 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %63 = torch.aten.mul.Tensor %37, %62 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %64 = torch.aten.add.Tensor %63, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %65 = torch.aten.unsqueeze %43, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %66 = torch.aten.mul.Tensor %64, %65 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %67 = torch.aten.unsqueeze %43, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %68 = torch.aten.unsqueeze %67, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %69 = torch.aten.squeeze.dim %68, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %70 = torch.aten.unsqueeze %69, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %71 = torch.aten.mul.Tensor %68, %70 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %72 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %73 = torch.aten.to.dtype %72, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %74 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %75 = torch.aten.broadcast_to %73, %74 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %76 = torch.aten.copy %75, %71, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %77 = torch.aten.transpose.int %35, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %78 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %79 = torch.aten.view %66, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %80 = torch.aten.mm %79, %77 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %81 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %82 = torch.aten.view %80, %81 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %83 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %84 = torch.aten.view %82, %83 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %85 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %86 = torch.aten.permute %84, %85 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %87 = torch.aten.slice.Tensor %86, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %88 = torch.aten.slice.Tensor %86, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %89 = torch.aten.slice.Tensor %86, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %90 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %91 = torch.aten.unsqueeze %90, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %92 = torch.aten.slice.Tensor %91, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %93 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %94 = torch.aten.view %92, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %95 = torch.aten.permute %94, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %96 = torch.aten.add.Tensor %87, %95, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %97 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %98 = torch.aten.unsqueeze %97, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %99 = torch.aten.slice.Tensor %98, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %100 = torch.aten.view %99, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %101 = torch.aten.permute %100, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %102 = torch.aten.add.Tensor %89, %101, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %103 = torch.aten.div.Scalar %96, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %104 = torch.aten.transpose.int %88, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %105 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %106 = torch.aten.broadcast_to %103, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %107 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %108 = torch.aten.view %106, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %109 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %110 = torch.aten.broadcast_to %104, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %111 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %112 = torch.aten.view %110, %111 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %113 = torch.aten.bmm %108, %112 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %114 = torch.prim.ListConstruct %int1, %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %115 = torch.aten.view %113, %114 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %116 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %117 = torch.aten.to.dtype %116, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %118 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %119 = torch.aten.broadcast_to %117, %118 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %120 = torch.aten.copy %119, %76, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %121 = torch.aten.bitwise_not %120 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %122 = torch.aten.clone %34, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %123 = torch.aten.masked_fill.Tensor %115, %121, %122 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values, %indices = torch.aten.max.dim %123, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %124 = torch.aten.sub.Tensor %123, %values, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %125 = torch.aten.exp %124 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %126 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %127 = torch.aten.sum.dim_IntList %125, %126, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %128 = torch.aten.div.Tensor %125, %127 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %129 = torch.aten.masked_fill.Scalar %128, %121, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %130 = torch.aten.broadcast_to %129, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %131 = torch.prim.ListConstruct %int4, %int128, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %132 = torch.aten.view %130, %131 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %133 = torch.aten.broadcast_to %102, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %134 = torch.aten.view %133, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %135 = torch.aten.bmm %132, %134 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %136 = torch.aten.view %135, %105 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %137 = torch.aten.permute %136, %85 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %138 = torch.aten.clone %137, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %139 = torch.prim.ListConstruct %int1, %int128, %int-1 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %140 = torch.aten.view %138, %139 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %141 = torch.aten.transpose.int %33, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %142 = torch.aten.view %140, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %143 = torch.aten.mm %142, %141 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %144 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %145 = torch.aten.add.Tensor %144, %143, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %146 = torch.prim.ListConstruct %int1, %int128, %int32 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %147 = torch.aten.view %145, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %148 = torch.aten.add.Tensor %147, %66, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %149 = torch.aten.sum.dim_IntList %148, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %150 = torch.aten.div.Scalar %149, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %151 = torch.aten.sub.Tensor %148, %150, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %152 = torch.aten.pow.Tensor_Scalar %151, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %153 = torch.aten.sum.dim_IntList %152, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %154 = torch.aten.div.Scalar %153, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %155 = torch.aten.sub.Tensor %148, %150, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %156 = torch.aten.add.Scalar %154, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %157 = torch.aten.sqrt %156 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %158 = torch.aten.div.Tensor %155, %157 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %159 = torch.aten.mul.Tensor %37, %158 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %160 = torch.aten.add.Tensor %159, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %161 = torch.aten.transpose.int %32, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %162 = torch.aten.view %160, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %163 = torch.aten.mm %162, %161 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %164 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %165 = torch.aten.add.Tensor %164, %163, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %166 = torch.prim.ListConstruct %int1, %int128, %int37 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %167 = torch.aten.view %165, %166 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %168 = torch.aten.gelu %167, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %169 = torch.aten.transpose.int %30, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %170 = torch.prim.ListConstruct %int128, %int37 : (!torch.int, !torch.int) -> !torch.list<int>
  %171 = torch.aten.view %168, %170 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %172 = torch.aten.mm %171, %169 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %173 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %174 = torch.aten.add.Tensor %173, %172, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %175 = torch.aten.view %174, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %176 = torch.aten.add.Tensor %175, %160, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %177 = torch.aten.sum.dim_IntList %176, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %178 = torch.aten.div.Scalar %177, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %179 = torch.aten.sub.Tensor %176, %178, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %180 = torch.aten.pow.Tensor_Scalar %179, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %181 = torch.aten.sum.dim_IntList %180, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %182 = torch.aten.div.Scalar %181, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %183 = torch.aten.sub.Tensor %176, %178, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %184 = torch.aten.add.Scalar %182, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %185 = torch.aten.sqrt %184 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %186 = torch.aten.div.Tensor %183, %185 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %187 = torch.aten.mul.Tensor %37, %186 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %188 = torch.aten.add.Tensor %187, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %189 = torch.aten.transpose.int %29, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %190 = torch.aten.view %188, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %191 = torch.aten.mm %190, %189 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %192 = torch.aten.view %191, %81 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %193 = torch.aten.view %192, %83 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %194 = torch.aten.permute %193, %85 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %195 = torch.aten.slice.Tensor %194, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %196 = torch.aten.slice.Tensor %194, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %197 = torch.aten.slice.Tensor %194, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %198 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %199 = torch.aten.unsqueeze %198, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %200 = torch.aten.slice.Tensor %199, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %201 = torch.aten.view %200, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %202 = torch.aten.permute %201, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %203 = torch.aten.add.Tensor %195, %202, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %204 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %205 = torch.aten.unsqueeze %204, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %206 = torch.aten.slice.Tensor %205, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %207 = torch.aten.view %206, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %208 = torch.aten.permute %207, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %209 = torch.aten.add.Tensor %197, %208, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %210 = torch.aten.div.Scalar %203, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %211 = torch.aten.transpose.int %196, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %212 = torch.aten.broadcast_to %210, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %213 = torch.aten.view %212, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %214 = torch.aten.broadcast_to %211, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %215 = torch.aten.view %214, %111 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %216 = torch.aten.bmm %213, %215 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %217 = torch.aten.view %216, %114 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %218 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %219 = torch.aten.to.dtype %218, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %220 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %221 = torch.aten.broadcast_to %219, %220 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %222 = torch.aten.copy %221, %76, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %223 = torch.aten.bitwise_not %222 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %224 = torch.aten.clone %34, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %225 = torch.aten.masked_fill.Tensor %217, %223, %224 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_0, %indices_1 = torch.aten.max.dim %225, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %226 = torch.aten.sub.Tensor %225, %values_0, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %227 = torch.aten.exp %226 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %228 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %229 = torch.aten.sum.dim_IntList %227, %228, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %230 = torch.aten.div.Tensor %227, %229 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %231 = torch.aten.masked_fill.Scalar %230, %223, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %232 = torch.aten.broadcast_to %231, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %233 = torch.aten.view %232, %131 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %234 = torch.aten.broadcast_to %209, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %235 = torch.aten.view %234, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %236 = torch.aten.bmm %233, %235 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %237 = torch.aten.view %236, %105 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %238 = torch.aten.permute %237, %85 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %239 = torch.aten.clone %238, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %240 = torch.aten.view %239, %139 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %241 = torch.aten.transpose.int %28, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %242 = torch.aten.view %240, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %243 = torch.aten.mm %242, %241 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %244 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %245 = torch.aten.add.Tensor %244, %243, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %246 = torch.aten.view %245, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %247 = torch.aten.add.Tensor %246, %188, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %248 = torch.aten.sum.dim_IntList %247, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %249 = torch.aten.div.Scalar %248, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %250 = torch.aten.sub.Tensor %247, %249, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %251 = torch.aten.pow.Tensor_Scalar %250, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %252 = torch.aten.sum.dim_IntList %251, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %253 = torch.aten.div.Scalar %252, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %254 = torch.aten.sub.Tensor %247, %249, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %255 = torch.aten.add.Scalar %253, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %256 = torch.aten.sqrt %255 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %257 = torch.aten.div.Tensor %254, %256 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %258 = torch.aten.mul.Tensor %37, %257 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %259 = torch.aten.add.Tensor %258, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %260 = torch.aten.transpose.int %27, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %261 = torch.aten.view %259, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %262 = torch.aten.mm %261, %260 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %263 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %264 = torch.aten.add.Tensor %263, %262, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %265 = torch.aten.view %264, %166 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %266 = torch.aten.gelu %265, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %267 = torch.aten.transpose.int %26, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %268 = torch.aten.view %266, %170 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %269 = torch.aten.mm %268, %267 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %270 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %271 = torch.aten.add.Tensor %270, %269, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %272 = torch.aten.view %271, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %273 = torch.aten.add.Tensor %272, %259, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %274 = torch.aten.sum.dim_IntList %273, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %275 = torch.aten.div.Scalar %274, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %276 = torch.aten.sub.Tensor %273, %275, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %277 = torch.aten.pow.Tensor_Scalar %276, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %278 = torch.aten.sum.dim_IntList %277, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %279 = torch.aten.div.Scalar %278, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %280 = torch.aten.sub.Tensor %273, %275, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %281 = torch.aten.add.Scalar %279, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %282 = torch.aten.sqrt %281 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %283 = torch.aten.div.Tensor %280, %282 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %284 = torch.aten.mul.Tensor %37, %283 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %285 = torch.aten.add.Tensor %284, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %286 = torch.aten.transpose.int %25, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %287 = torch.aten.view %285, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %288 = torch.aten.mm %287, %286 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %289 = torch.aten.view %288, %81 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %290 = torch.aten.view %289, %83 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %291 = torch.aten.permute %290, %85 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %292 = torch.aten.slice.Tensor %291, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %293 = torch.aten.slice.Tensor %291, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %294 = torch.aten.slice.Tensor %291, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %295 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %296 = torch.aten.unsqueeze %295, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %297 = torch.aten.slice.Tensor %296, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %298 = torch.aten.view %297, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %299 = torch.aten.permute %298, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %300 = torch.aten.add.Tensor %292, %299, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %301 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %302 = torch.aten.unsqueeze %301, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %303 = torch.aten.slice.Tensor %302, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %304 = torch.aten.view %303, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %305 = torch.aten.permute %304, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %306 = torch.aten.add.Tensor %294, %305, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %307 = torch.aten.div.Scalar %300, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %308 = torch.aten.transpose.int %293, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %309 = torch.aten.broadcast_to %307, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %310 = torch.aten.view %309, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %311 = torch.aten.broadcast_to %308, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %312 = torch.aten.view %311, %111 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %313 = torch.aten.bmm %310, %312 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %314 = torch.aten.view %313, %114 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %315 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %316 = torch.aten.to.dtype %315, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %317 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %318 = torch.aten.broadcast_to %316, %317 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %319 = torch.aten.copy %318, %76, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %320 = torch.aten.bitwise_not %319 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %321 = torch.aten.clone %34, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %322 = torch.aten.masked_fill.Tensor %314, %320, %321 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_2, %indices_3 = torch.aten.max.dim %322, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %323 = torch.aten.sub.Tensor %322, %values_2, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %324 = torch.aten.exp %323 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %325 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %326 = torch.aten.sum.dim_IntList %324, %325, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %327 = torch.aten.div.Tensor %324, %326 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %328 = torch.aten.masked_fill.Scalar %327, %320, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %329 = torch.aten.broadcast_to %328, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %330 = torch.aten.view %329, %131 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %331 = torch.aten.broadcast_to %306, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %332 = torch.aten.view %331, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %333 = torch.aten.bmm %330, %332 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %334 = torch.aten.view %333, %105 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %335 = torch.aten.permute %334, %85 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %336 = torch.aten.clone %335, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %337 = torch.aten.view %336, %139 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %338 = torch.aten.transpose.int %24, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %339 = torch.aten.view %337, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %340 = torch.aten.mm %339, %338 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %341 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %342 = torch.aten.add.Tensor %341, %340, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %343 = torch.aten.view %342, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %344 = torch.aten.add.Tensor %343, %285, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %345 = torch.aten.sum.dim_IntList %344, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %346 = torch.aten.div.Scalar %345, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %347 = torch.aten.sub.Tensor %344, %346, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %348 = torch.aten.pow.Tensor_Scalar %347, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %349 = torch.aten.sum.dim_IntList %348, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %350 = torch.aten.div.Scalar %349, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %351 = torch.aten.sub.Tensor %344, %346, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %352 = torch.aten.add.Scalar %350, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %353 = torch.aten.sqrt %352 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %354 = torch.aten.div.Tensor %351, %353 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %355 = torch.aten.mul.Tensor %37, %354 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %356 = torch.aten.add.Tensor %355, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %357 = torch.aten.transpose.int %23, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %358 = torch.aten.view %356, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %359 = torch.aten.mm %358, %357 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %360 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %361 = torch.aten.add.Tensor %360, %359, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %362 = torch.aten.view %361, %166 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %363 = torch.aten.gelu %362, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %364 = torch.aten.transpose.int %22, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %365 = torch.aten.view %363, %170 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %366 = torch.aten.mm %365, %364 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %367 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %368 = torch.aten.add.Tensor %367, %366, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %369 = torch.aten.view %368, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %370 = torch.aten.add.Tensor %369, %356, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %371 = torch.aten.sum.dim_IntList %370, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %372 = torch.aten.div.Scalar %371, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %373 = torch.aten.sub.Tensor %370, %372, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %374 = torch.aten.pow.Tensor_Scalar %373, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %375 = torch.aten.sum.dim_IntList %374, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %376 = torch.aten.div.Scalar %375, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %377 = torch.aten.sub.Tensor %370, %372, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %378 = torch.aten.add.Scalar %376, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %379 = torch.aten.sqrt %378 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %380 = torch.aten.div.Tensor %377, %379 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %381 = torch.aten.mul.Tensor %37, %380 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %382 = torch.aten.add.Tensor %381, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %383 = torch.aten.transpose.int %21, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %384 = torch.aten.view %382, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %385 = torch.aten.mm %384, %383 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %386 = torch.aten.view %385, %81 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %387 = torch.aten.view %386, %83 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %388 = torch.aten.permute %387, %85 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %389 = torch.aten.slice.Tensor %388, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %390 = torch.aten.slice.Tensor %388, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %391 = torch.aten.slice.Tensor %388, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %392 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %393 = torch.aten.unsqueeze %392, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %394 = torch.aten.slice.Tensor %393, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %395 = torch.aten.view %394, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %396 = torch.aten.permute %395, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %397 = torch.aten.add.Tensor %389, %396, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %398 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %399 = torch.aten.unsqueeze %398, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %400 = torch.aten.slice.Tensor %399, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %401 = torch.aten.view %400, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %402 = torch.aten.permute %401, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %403 = torch.aten.add.Tensor %391, %402, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %404 = torch.aten.div.Scalar %397, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %405 = torch.aten.transpose.int %390, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %406 = torch.aten.broadcast_to %404, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %407 = torch.aten.view %406, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %408 = torch.aten.broadcast_to %405, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %409 = torch.aten.view %408, %111 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %410 = torch.aten.bmm %407, %409 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %411 = torch.aten.view %410, %114 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %412 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %413 = torch.aten.to.dtype %412, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %414 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %415 = torch.aten.broadcast_to %413, %414 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %416 = torch.aten.copy %415, %76, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %417 = torch.aten.bitwise_not %416 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %418 = torch.aten.clone %34, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %419 = torch.aten.masked_fill.Tensor %411, %417, %418 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_4, %indices_5 = torch.aten.max.dim %419, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %420 = torch.aten.sub.Tensor %419, %values_4, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %421 = torch.aten.exp %420 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %422 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %423 = torch.aten.sum.dim_IntList %421, %422, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %424 = torch.aten.div.Tensor %421, %423 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %425 = torch.aten.masked_fill.Scalar %424, %417, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %426 = torch.aten.broadcast_to %425, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %427 = torch.aten.view %426, %131 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %428 = torch.aten.broadcast_to %403, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %429 = torch.aten.view %428, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %430 = torch.aten.bmm %427, %429 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %431 = torch.aten.view %430, %105 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %432 = torch.aten.permute %431, %85 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %433 = torch.aten.clone %432, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %434 = torch.aten.view %433, %139 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %435 = torch.aten.transpose.int %19, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %436 = torch.aten.view %434, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %437 = torch.aten.mm %436, %435 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %438 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %439 = torch.aten.add.Tensor %438, %437, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %440 = torch.aten.view %439, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %441 = torch.aten.add.Tensor %440, %382, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %442 = torch.aten.sum.dim_IntList %441, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %443 = torch.aten.div.Scalar %442, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %444 = torch.aten.sub.Tensor %441, %443, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %445 = torch.aten.pow.Tensor_Scalar %444, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %446 = torch.aten.sum.dim_IntList %445, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %447 = torch.aten.div.Scalar %446, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %448 = torch.aten.sub.Tensor %441, %443, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %449 = torch.aten.add.Scalar %447, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %450 = torch.aten.sqrt %449 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %451 = torch.aten.div.Tensor %448, %450 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %452 = torch.aten.mul.Tensor %37, %451 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %453 = torch.aten.add.Tensor %452, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %454 = torch.aten.transpose.int %17, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %455 = torch.aten.view %453, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %456 = torch.aten.mm %455, %454 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %457 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %458 = torch.aten.add.Tensor %457, %456, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %459 = torch.aten.view %458, %166 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %460 = torch.aten.gelu %459, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %461 = torch.aten.transpose.int %15, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %462 = torch.aten.view %460, %170 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %463 = torch.aten.mm %462, %461 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %464 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %465 = torch.aten.add.Tensor %464, %463, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %466 = torch.aten.view %465, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %467 = torch.aten.add.Tensor %466, %453, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %468 = torch.aten.sum.dim_IntList %467, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %469 = torch.aten.div.Scalar %468, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %470 = torch.aten.sub.Tensor %467, %469, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %471 = torch.aten.pow.Tensor_Scalar %470, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %472 = torch.aten.sum.dim_IntList %471, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %473 = torch.aten.div.Scalar %472, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %474 = torch.aten.sub.Tensor %467, %469, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %475 = torch.aten.add.Scalar %473, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %476 = torch.aten.sqrt %475 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %477 = torch.aten.div.Tensor %474, %476 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %478 = torch.aten.mul.Tensor %37, %477 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %479 = torch.aten.add.Tensor %478, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %480 = torch.aten.transpose.int %13, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %481 = torch.aten.view %479, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %482 = torch.aten.mm %481, %480 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %483 = torch.aten.view %482, %81 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %484 = torch.aten.view %483, %83 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %485 = torch.aten.permute %484, %85 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %486 = torch.aten.slice.Tensor %485, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %487 = torch.aten.slice.Tensor %485, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %488 = torch.aten.slice.Tensor %485, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %489 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %490 = torch.aten.unsqueeze %489, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %491 = torch.aten.slice.Tensor %490, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %492 = torch.aten.view %491, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %493 = torch.aten.permute %492, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %494 = torch.aten.add.Tensor %486, %493, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %495 = torch.aten.unsqueeze %36, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %496 = torch.aten.unsqueeze %495, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %497 = torch.aten.slice.Tensor %496, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %498 = torch.aten.view %497, %93 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %499 = torch.aten.permute %498, %85 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %500 = torch.aten.add.Tensor %488, %499, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %501 = torch.aten.div.Scalar %494, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %502 = torch.aten.transpose.int %487, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %503 = torch.aten.broadcast_to %501, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %504 = torch.aten.view %503, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %505 = torch.aten.broadcast_to %502, %109 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %506 = torch.aten.view %505, %111 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %507 = torch.aten.bmm %504, %506 : !torch.vtensor<[4,128,8],f32>, !torch.vtensor<[4,8,128],f32> -> !torch.vtensor<[4,128,128],f32>
  %508 = torch.aten.view %507, %114 : !torch.vtensor<[4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %509 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %510 = torch.aten.to.dtype %509, %int11, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %511 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %512 = torch.aten.broadcast_to %510, %511 : !torch.vtensor<[],i1>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],i1>
  %513 = torch.aten.copy %512, %76, %false : !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[1,1,128,128],si8>, !torch.bool -> !torch.vtensor<[1,1,128,128],i1>
  %514 = torch.aten.bitwise_not %513 : !torch.vtensor<[1,1,128,128],i1> -> !torch.vtensor<[1,1,128,128],i1>
  %515 = torch.aten.clone %34, %none : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32>
  %516 = torch.aten.masked_fill.Tensor %508, %514, %515 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %values_6, %indices_7 = torch.aten.max.dim %516, %int-1, %true : !torch.vtensor<[1,4,128,128],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,4,128,1],f32>, !torch.vtensor<[1,4,128,1],si64>
  %517 = torch.aten.sub.Tensor %516, %values_6, %float1.000000e00 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32>, !torch.float -> !torch.vtensor<[1,4,128,128],f32>
  %518 = torch.aten.exp %517 : !torch.vtensor<[1,4,128,128],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %519 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %520 = torch.aten.sum.dim_IntList %518, %519, %true, %none : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,4,128,1],f32>
  %521 = torch.aten.div.Tensor %518, %520 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,4,128,1],f32> -> !torch.vtensor<[1,4,128,128],f32>
  %522 = torch.aten.masked_fill.Scalar %521, %514, %int0 : !torch.vtensor<[1,4,128,128],f32>, !torch.vtensor<[1,1,128,128],i1>, !torch.int -> !torch.vtensor<[1,4,128,128],f32>
  %523 = torch.aten.broadcast_to %522, %114 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,128],f32>
  %524 = torch.aten.view %523, %131 : !torch.vtensor<[1,4,128,128],f32>, !torch.list<int> -> !torch.vtensor<[4,128,128],f32>
  %525 = torch.aten.broadcast_to %500, %105 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %526 = torch.aten.view %525, %107 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %527 = torch.aten.bmm %524, %526 : !torch.vtensor<[4,128,128],f32>, !torch.vtensor<[4,128,8],f32> -> !torch.vtensor<[4,128,8],f32>
  %528 = torch.aten.view %527, %105 : !torch.vtensor<[4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %529 = torch.aten.permute %528, %85 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,8],f32>
  %530 = torch.aten.clone %529, %int0 : !torch.vtensor<[1,128,4,8],f32>, !torch.int -> !torch.vtensor<[1,128,4,8],f32>
  %531 = torch.aten.view %530, %139 : !torch.vtensor<[1,128,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %532 = torch.aten.transpose.int %11, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %533 = torch.aten.view %531, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %534 = torch.aten.mm %533, %532 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[128,32],f32>
  %535 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %536 = torch.aten.add.Tensor %535, %534, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %537 = torch.aten.view %536, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %538 = torch.aten.add.Tensor %537, %479, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %539 = torch.aten.sum.dim_IntList %538, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %540 = torch.aten.div.Scalar %539, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %541 = torch.aten.sub.Tensor %538, %540, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %542 = torch.aten.pow.Tensor_Scalar %541, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %543 = torch.aten.sum.dim_IntList %542, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %544 = torch.aten.div.Scalar %543, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %545 = torch.aten.sub.Tensor %538, %540, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %546 = torch.aten.add.Scalar %544, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %547 = torch.aten.sqrt %546 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %548 = torch.aten.div.Tensor %545, %547 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %549 = torch.aten.mul.Tensor %37, %548 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %550 = torch.aten.add.Tensor %549, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %551 = torch.aten.transpose.int %9, %int0, %int1 : !torch.vtensor<[37,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,37],f32>
  %552 = torch.aten.view %550, %78 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %553 = torch.aten.mm %552, %551 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,37],f32> -> !torch.vtensor<[128,37],f32>
  %554 = torch.aten.mul.Scalar %31, %int1 : !torch.vtensor<[37],f32>, !torch.int -> !torch.vtensor<[37],f32>
  %555 = torch.aten.add.Tensor %554, %553, %int1 : !torch.vtensor<[37],f32>, !torch.vtensor<[128,37],f32>, !torch.int -> !torch.vtensor<[128,37],f32>
  %556 = torch.aten.view %555, %166 : !torch.vtensor<[128,37],f32>, !torch.list<int> -> !torch.vtensor<[1,128,37],f32>
  %557 = torch.aten.gelu %556, %str : !torch.vtensor<[1,128,37],f32>, !torch.str -> !torch.vtensor<[1,128,37],f32>
  %558 = torch.aten.transpose.int %7, %int0, %int1 : !torch.vtensor<[32,37],f32>, !torch.int, !torch.int -> !torch.vtensor<[37,32],f32>
  %559 = torch.aten.view %557, %170 : !torch.vtensor<[1,128,37],f32>, !torch.list<int> -> !torch.vtensor<[128,37],f32>
  %560 = torch.aten.mm %559, %558 : !torch.vtensor<[128,37],f32>, !torch.vtensor<[37,32],f32> -> !torch.vtensor<[128,32],f32>
  %561 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %562 = torch.aten.add.Tensor %561, %560, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[128,32],f32>, !torch.int -> !torch.vtensor<[128,32],f32>
  %563 = torch.aten.view %562, %146 : !torch.vtensor<[128,32],f32>, !torch.list<int> -> !torch.vtensor<[1,128,32],f32>
  %564 = torch.aten.add.Tensor %563, %550, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %565 = torch.aten.sum.dim_IntList %564, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %566 = torch.aten.div.Scalar %565, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %567 = torch.aten.sub.Tensor %564, %566, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %568 = torch.aten.pow.Tensor_Scalar %567, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %569 = torch.aten.sum.dim_IntList %568, %52, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %570 = torch.aten.div.Scalar %569, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %571 = torch.aten.sub.Tensor %564, %566, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %572 = torch.aten.add.Scalar %570, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %573 = torch.aten.sqrt %572 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %574 = torch.aten.div.Tensor %571, %573 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %575 = torch.aten.mul.Tensor %37, %574 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %576 = torch.aten.add.Tensor %575, %36, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %577 = torch.aten.slice.Tensor %576, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %578 = torch.aten.slice.Tensor %577, %int1, %int0, %int1, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %579 = torch.aten.squeeze.dim %578, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %580 = torch.aten.transpose.int %5, %int0, %int1 : !torch.vtensor<[32,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,32],f32>
  %581 = torch.aten.mm %579, %580 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,32],f32> -> !torch.vtensor<[1,32],f32>
  %582 = torch.aten.mul.Scalar %36, %int1 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[32],f32>
  %583 = torch.aten.add.Tensor %582, %581, %int1 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %584 = torch.aten.gelu %583, %str : !torch.vtensor<[1,32],f32>, !torch.str -> !torch.vtensor<[1,32],f32>
  %585 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[2,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,2],f32>
  %586 = torch.aten.mm %584, %585 : !torch.vtensor<[1,32],f32>, !torch.vtensor<[32,2],f32> -> !torch.vtensor<[1,2],f32>
  %587 = torch.aten.mul.Scalar %1, %int1 : !torch.vtensor<[2],f32>, !torch.int -> !torch.vtensor<[2],f32>
  %588 = torch.aten.add.Tensor %587, %586, %int1 : !torch.vtensor<[2],f32>, !torch.vtensor<[1,2],f32>, !torch.int -> !torch.vtensor<[1,2],f32>
  return %588 : !torch.vtensor<[1,2],f32>
 }


 } -> SUCCESS
 //===-------------------------------------------===//

 //===-------------------------------------------===//
 Legalizing operation : 'torch.vtensor.literal'(0x8efdfa0) {
  %26 = "torch.vtensor.literal"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> !torch.vtensor<[32,37],f32>

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'torch.vtensor.literal -> ()' {
 Trying to match "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>"
    ** Insert  : 'tosa.const'(0x8f3aac0)
    ** Replace : 'torch.vtensor.literal'(0x8efdfa0)
 "(anonymous namespace)::ConvertAtenOp<mlir::torch::Torch::ValueTensorLiteralOp>" result 1

    //===-------------------------------------------===//
    Legalizing operation : 'tosa.const'(0x8f3aac0) {
      %26 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>

    } -> SUCCESS : operation marked legal by the target
    //===-------------------------------------------===//
  } -> SUCCESS : pattern applied successfully
 // *** IR Dump After Pattern Application ***
 mlir-asm-printer: Verifying operation: func.func
 func.func @forward(%arg0: !torch.vtensor<[1,128],si64>) -> !torch.vtensor<[1,2],f32> {
  %int1 = torch.constant.int 1
  %int32 = torch.constant.int 32
  %int128 = torch.constant.int 128
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
  %1 = torch.vtensor.literal(dense<0.000000e+00> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2x32xf32>} : () -> tensor<2x32xf32>
  %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32xf32>) : !torch.vtensor<[2,32],f32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %5 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %7 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %9 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %11 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %13 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %15 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<37x32xf32>} : () -> tensor<37x32xf32>
  %17 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x32xf32>} : () -> tensor<32x32xf32>
  %19 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<96x32xf32>} : () -> tensor<96x32xf32>
  %21 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %22 = "tosa.const"() {value = dense_resource<__elided__> : tensor<32x37xf32>} : () -> tensor<32x37xf32>
  %23 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %24 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %25 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %26 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %27 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %28 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %29 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %30 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %31 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x37xf32>) : !torch.vtensor<[32,37],f32>
  %32 = torch.vtensor.literal(dense<0.000000e+00> : tensor<37xf32>) : !torch.vtensor<[37],f32>
  %33 = torch.vtensor.literal(dense_resource<__elided__> : tensor<37x32xf32>) : !torch.vtensor<[37,32],f32>
  %34 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x32xf32>) : !torch.vtensor<[32,32],f32>
  %35 = torch.vtensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.vtensor<[],f32>
  %36 = torch.vtensor.literal(dense_resource<__elided__> : tensor<96x32xf32>) : !torch.vtensor<[96,32],f32>
  %37 = torch.vtensor.literal(dense<0.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %38 = torch.vtensor.literal(dense<1.000000e+00> : tensor<32xf32>) : !torch.vtensor<[32],f32>
  %39 = torch.vtensor.literal(dense_resource<__elided__> : tensor<16x32xf32>) : !torch.vtensor<[16,32],f32>
  %40 = torch.vtensor.literal(dense_resource<__elided__> : tensor<512x32xf32>) : !torch.vtensor<[512,32],f32>
  %41 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1000x32xf32>) : !torch.vtensor<[1000,32],f32>
  %42 = torch.vtensor.literal(dense_resource<__elided__> : tensor<1x512xsi64>) : !torch.vtensor<[1,512],si64>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int-1 = torch.constant.int -1
  %true = torch.constant.bool true
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %str = torch.constant.str "none"
  %int0 = torch.constant.int 0
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int2 = torch.constant.int 2
  %float9.999990e-08 = torch.constant.float 9.9999999999999995E-8
  %int96 = torch.constant.int 96
  %int4 = torch.constant.int 4
  %int3 = torch.constant.int 3
  %int8 = torch.constant.int 8
  %int16 = torch.constant.int 16
  %int24 = torch.constant.int 24
  %float4.000000e00 = torch.constant.float 4.000000e+00
  %int37 = torch.constant.int 37
  %cpu = torch.constant.device "cpu"
  %43 = torch.prim.ListConstruct %int1, %int128 : (!torch.int, !torch.int) -> !torch.list<int>
  %44 = torch.aten.ones %43, %none, %none, %cpu, %false : !torch.list<int>, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],f32>
  %45 = torch.aten.zeros %43, %int4, %none, %cpu, %false : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,128],si64>
  %46 = torch.aten.slice.Tensor %42, %int0, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512],si64>
  %47 = torch.aten.slice.Tensor %46, %int1, %int0, %int128, %int1 : !torch.vtensor<[1,512],si64>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,128],si64>
  %48 = torch.aten.embedding %41, %arg0, %int0, %false, %false : !torch.vtensor<[1000,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %49 = torch.aten.embedding %40, %47, %int-1, %false, %false : !torch.vtensor<[512,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %50 = torch.aten.add.Tensor %48, %49, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %51 = torch.aten.embedding %39, %45, %int-1, %false, %false : !torch.vtensor<[16,32],f32>, !torch.vtensor<[1,128],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,128,32],f32>
  %52 = torch.aten.add.Tensor %50, %51, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %53 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %54 = torch.aten.sum.dim_IntList %52, %53, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %55 = torch.aten.div.Scalar %54, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %56 = torch.aten.sub.Tensor %52, %55, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %57 = torch.aten.pow.Tensor_Scalar %56, %int2 : !torch.vtensor<[1,128,32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %58 = torch.aten.sum.dim_IntList %57, %53, %true, %none : !torch.vtensor<[1,128,32],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,128,1],f32>
  %59 = torch.aten.div.Scalar %58, %int32 : !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %60 = torch.aten.sub.Tensor %52, %55, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %61 = torch.aten.add.Scalar %59, %float9.999990e-08, %int1 : !torch.vtensor<[1,128,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %62 = torch.aten.sqrt %61 : !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,1],f32>
  %63 = torch.aten.div.Tensor %60, %62 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %64 = torch.aten.mul.Tensor %38, %63 : !torch.vtensor<[32],f32>, !torch.vtensor<[1,128,32],f32> -> !torch.vtensor<[1,128,32],f32>
  %65 = torch.aten.add.Tensor %64, %37, %int1 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,128,32],f32>
  %66 = torch.aten.unsqueeze %44, %int2 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128,1],f32>
  %67 = torch.aten.mul.Tensor %65, %66 : !torch.vtensor<[1,128,32],f32>, !torch.vtensor<[1,128,1],f32> -> !torch.vtensor<[1,128,32],f32>
  %68 = torch.aten.unsqueeze %44, %int1 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %69 = torch.aten.unsqueeze %68, %int2 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,1,128],f32>
  %70 = torch.aten.squeeze.dim %69, %int-2 : !torch.vtensor<[1,1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128],f32>
  %71 = torch.aten.unsqueeze %70, %int-1 : !torch.vtensor<[1,1,128],f32>, !torch.int -> !torch.vtensor<[1,1,128,1],f32>
  %72 = torch.aten.mul.Tensor %69, %71 : !torch.vtensor<[1,1,1,128],f32>, !torch.vtensor<[1,1,128,1],f32> -> !torch.vtensor<[1,1,128,128],f32>
  %73 = torch.prim.NumToTensor.Scalar %int0 : !torch.int -> !torch.vtensor<[],si64>
  %74 = torch.aten.to.dtype %73, %int1, %false, %false, %none : !torch.vtensor<[],si64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],si8>
  %75 = torch.prim.ListConstruct %int1, %int1, %int128, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %76 = torch.aten.broadcast_to %74, %75 : !torch.vtensor<[],si8>, !torch.list<int> -> !torch.vtensor<[1,1,128,128],si8>
  %77 = torch.aten.copy %76, %72, %false : !torch.vtensor<[1,1,128,128],si8>, !torch.vtensor<[1,1,128,128],f32>, !torch.bool -> !torch.vtensor<[1,1,128,128],si8>
  %78 = torch.aten.transpose.int %36, %int0, %int1 : !torch.vtensor<[96,32],f32>, !torch.int, !torch.int -> !torch.vtensor<[32,96],f32>
  %79 = torch.prim.ListConstruct %int128, %int32 : (!torch.int, !torch.int) -> !torch.list<int>
  %80 = torch.aten.view %67, %79 : !torch.vtensor<[1,128,32],f32>, !torch.list<int> -> !torch.vtensor<[128,32],f32>
  %81 = torch.aten.mm %80, %78 : !torch.vtensor<[128,32],f32>, !torch.vtensor<[32,96],f32> -> !torch.vtensor<[128,96],f32>
  %82 = torch.prim.ListConstruct %int1, %int128, %int96 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %83 = torch.aten.view %81, %82 : !torch.vtensor<[128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,96],f32>
  %84 = torch.prim.ListConstruct %int1, %int128, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %85 = torch.aten.view %83, %84 : !torch.vtensor<[1,128,96],f32>, !torch.list<int> -> !torch.vtensor<[1,128,4,24],f32>
  %86 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %87 = torch.aten.permute %85, %86 : !torch.vtensor<[1,128,4,24],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,24],f32>
  %88 = torch.aten.slice.Tensor %87, %int-1, %int0, %int8, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %89 = torch.aten.slice.Tensor %87, %int-1, %int8, %int16, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %90 = torch.aten.slice.Tensor %87, %int-1, %int16, %int24, %int1 : !torch.vtensor<[1,4,128,24],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %91 = torch.aten.unsqueeze %37, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %92 = torch.aten.unsqueeze %91, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %93 = torch.aten.slice.Tensor %92, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %94 = torch.prim.ListConstruct %int1, %int1, %int4, %int-1 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %95 = torch.aten.view %93, %94 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %96 = torch.aten.permute %95, %86 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %97 = torch.aten.add.Tensor %88, %96, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %98 = torch.aten.unsqueeze %37, %int0 : !torch.vtensor<[32],f32>, !torch.int -> !torch.vtensor<[1,32],f32>
  %99 = torch.aten.unsqueeze %98, %int1 : !torch.vtensor<[1,32],f32>, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %100 = torch.aten.slice.Tensor %99, %int2, %int0, %int9223372036854775807, %int1 : !torch.vtensor<[1,1,32],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,32],f32>
  %101 = torch.aten.view %100, %94 : !torch.vtensor<[1,1,32],f32>, !torch.list<int> -> !torch.vtensor<[1,1,4,8],f32>
  %102 = torch.aten.permute %101, %86 : !torch.vtensor<[1,1,4,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,1,8],f32>
  %103 = torch.aten.add.Tensor %90, %102, %int1 : !torch.vtensor<[1,4,128,8],f32>, !torch.vtensor<[1,4,1,8],f32>, !torch.int -> !torch.vtensor<[1,4,128,8],f32>
  %104 = torch.aten.div.Scalar %97, %float4.000000e00 : !torch.vtensor<[1,4,128,8],f32>, !torch.float -> !torch.vtensor<[1,4,128,8],f32>
  %105 = torch.aten.transpose.int %89, %int-1, %int-2 : !torch.vtensor<[1,4,128,8],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,4,8,128],f32>
  %106 = torch.prim.ListConstruct %int1, %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %107 = torch.aten.broadcast_to %104, %106 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[1,4,128,8],f32>
  %108 = torch.prim.ListConstruct %int4, %int128, %int8 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %109 = torch.aten.view %107, %108 : !torch.vtensor<[1,4,128,8],f32>, !torch.list<int> -> !torch.vtensor<[4,128,8],f32>
  %110 = torch.prim.ListConstruct %int1, %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %111 = torch.aten.broadcast_to %105, %110 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[1,4,8,128],f32>
  %112 = torch.prim.ListConstruct %int4, %int8, %int128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %113 = torch.aten.view %111, %112 : !torch.vtensor<[1,4,8,128],f32>, !torch.list<int> -> !torch.vtensor<[4,8,128],f32>
  %114 = torch.aten.bmm %109, %113 : !torch.vtensor<[4,128,8],f32>, !torch.vten