AmosLewis · September 29, 2022 00:31
diff --git a/gpttosa_debug.txt b/gpttosa_debug.txt
 ➜  SHARK git:(gpt) ✗ torch-mlir-opt -pass-pipeline='torch-backend-to-tosa-backend-pipeline' /tmp/_lambda.mlir -mlir-print-ir-after-all -mlir-pretty-debuginfo -mlir-disable-threading
 // -----// IR Dump After ConvertTorchToTosa (convert-torch-to-tosa) //----- //
 func.func @forward(%arg0: !torch.vtensor<[1,5],si64>) -> !torch.vtensor<[1,5,50257],f32> {
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %int5 = torch.constant.int 5
  %1 = torch_c.to_i64 %int5
  %int1 = torch.constant.int 1
  %2 = torch_c.to_i64 %int1
  %true = torch.constant.bool true
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %3 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %5 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %7 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %9 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %13 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %15 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %17 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %19 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %21 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %22 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %23 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %24 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %25 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %26 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %27 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %28 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %29 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %30 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %31 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %32 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %33 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %34 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %35 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %36 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %37 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %38 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %39 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %40 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %41 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %42 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %43 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %44 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %45 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %46 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %47 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %48 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %49 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %50 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %51 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %52 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %53 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %54 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %55 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %56 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %57 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %58 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %59 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %60 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %61 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %62 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %63 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %64 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %65 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %66 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %67 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %68 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %69 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %70 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %71 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %72 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %73 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %74 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %75 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %76 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %77 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %78 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %79 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %80 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %81 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %82 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %83 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %84 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %85 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %86 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %87 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %88 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %89 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %90 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %91 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %92 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %93 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %94 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %95 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %96 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %97 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %98 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %99 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %100 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %101 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %102 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %103 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %104 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %105 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %106 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %107 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %108 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %109 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %110 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %111 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %112 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %113 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %114 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %115 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %116 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %117 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %118 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %119 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %120 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %121 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %122 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %123 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %124 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %125 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %126 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %127 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %128 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %129 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %130 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %131 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %132 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %133 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %134 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %135 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %136 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %137 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %138 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %139 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %140 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %141 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %142 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %143 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %144 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %145 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %146 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %147 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %148 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %149 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %150 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %151 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %152 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %153 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %false = torch.constant.bool false
  %154 = torch_c.to_i1 %false
  %none = torch.constant.none
  %int-2 = torch.constant.int -2
  %int11 = torch.constant.int 11
  %int4 = torch.constant.int 4
  %int-1 = torch.constant.int -1
  %155 = torch_c.to_i64 %int-1
  %int0 = torch.constant.int 0
  %156 = torch_c.to_i64 %int0
  %int768 = torch.constant.int 768
  %157 = torch_c.to_i64 %int768
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %int2 = torch.constant.int 2
  %158 = torch_c.to_i64 %int2
  %int2304 = torch.constant.int 2304
  %159 = torch_c.to_i64 %int2304
  %int1536 = torch.constant.int 1536
  %int12 = torch.constant.int 12
  %int64 = torch.constant.int 64
  %int3 = torch.constant.int 3
  %160 = torch_c.to_i64 %int3
  %int9223372036854775807 = torch.constant.int 9223372036854775807
  %int3072 = torch.constant.int 3072
  %float5.000000e-01 = torch.constant.float 5.000000e-01
  %float3.000000e00 = torch.constant.float 3.000000e+00
  %float4.471500e-02 = torch.constant.float 4.471500e-02
  %float7.978850e-01 = torch.constant.float 0.79788456080286541
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int50257 = torch.constant.int 50257
  %cpu = torch.constant.device "cpu"
  %161 = torch.prim.ListConstruct %int-1, %int5 : (!torch.int, !torch.int) -> !torch.list<int>
  %162 = "tosa.reshape"(%0) {new_shape = [-1, 5]} : (tensor<1x5xi64>) -> tensor<1x5xi64>
  %163 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %164 = torch_c.to_builtin_tensor %163 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %165 = "tosa.reshape"(%164) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %166 = "tosa.reshape"(%165) {new_shape = [-1, 5]} : (tensor<1x5xi64>) -> tensor<1x5xi64>
  %167 = "tosa.reshape"(%153) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %168 = "tosa.reshape"(%162) {new_shape = [1, 5]} : (tensor<1x5xi64>) -> tensor<1x5xi64>
  %169 = "tosa.cast"(%168) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %170 = "tosa.gather"(%167, %169) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %171 = "tosa.reshape"(%170) {new_shape = [1, 5, 768]} : (tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %172 = "tosa.reshape"(%152) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %173 = "tosa.reshape"(%166) {new_shape = [1, 5]} : (tensor<1x5xi64>) -> tensor<1x5xi64>
  %174 = "tosa.cast"(%173) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %175 = "tosa.gather"(%172, %174) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %176 = "tosa.reshape"(%175) {new_shape = [1, 5, 768]} : (tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %177 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %178 = "tosa.mul"(%176, %177) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %179 = "tosa.add"(%171, %178) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %180 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
  %181 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %182 = "tosa.reciprocal"(%181) : (tensor<1xf32>) -> tensor<1xf32>
  %183 = "tosa.reduce_sum"(%179) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %184 = "tosa.reshape"(%183) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %185 = "tosa.mul"(%184, %182) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %186 = "tosa.sub"(%179, %185) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %187 = "tosa.mul"(%186, %186) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %188 = "tosa.reduce_sum"(%187) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %189 = "tosa.reshape"(%188) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %190 = "tosa.mul"(%189, %182) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %191 = "tosa.reshape"(%151) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %192 = "tosa.reshape"(%150) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %193 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %194 = "tosa.sub"(%179, %185) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %195 = "tosa.add"(%190, %193) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %196 = "tosa.rsqrt"(%195) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %197 = "tosa.mul"(%194, %196) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %198 = "tosa.mul"(%197, %191) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %199 = "tosa.add"(%198, %192) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %200 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
  %201 = "tosa.reshape"(%199) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %202 = "tosa.reshape"(%201) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %203 = "tosa.reshape"(%148) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %204 = "tosa.matmul"(%202, %203) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %205 = "tosa.reshape"(%204) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %206 = tensor.cast %205 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %207 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %208 = "tosa.mul"(%149, %207) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %209 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %210 = "tosa.mul"(%206, %209) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %211 = "tosa.add"(%208, %210) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %212 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %213 = "tosa.reshape"(%211) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %214 = "tosa.slice"(%213) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %215 = "tosa.slice"(%213) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %216 = "tosa.slice"(%213) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %217 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %218 = "tosa.reshape"(%214) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %219 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %220 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %221 = "tosa.transpose"(%218, %220) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %222 = "tosa.reshape"(%215) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %223 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %224 = "tosa.transpose"(%222, %223) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %225 = "tosa.reshape"(%216) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %226 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %227 = "tosa.transpose"(%225, %226) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %228 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %229 = "tosa.transpose"(%224, %228) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %230 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %231 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %232 = "tosa.reshape"(%221) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %233 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %234 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %235 = "tosa.reshape"(%229) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %236 = "tosa.matmul"(%232, %235) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %237 = tensor.cast %236 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %238 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %239 = "tosa.reshape"(%237) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %240 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %241 = "tosa.reciprocal"(%240) : (tensor<f32>) -> tensor<f32>
  %242 = "tosa.mul"(%239, %241) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %243 = torch_c.from_builtin_tensor %242 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %244 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %245 = "tosa.slice"(%244) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %246 = "tosa.slice"(%245) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %247 = "tosa.slice"(%246) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %248 = torch_c.from_builtin_tensor %247 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %249 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %250 = torch.aten.to.dtype %249, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %251 = torch_c.to_builtin_tensor %250 : !torch.vtensor<[],i1> -> tensor<i1>
  %252 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %253 = torch.valsem.aten.copy %250, %248, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %254 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %255 = torch_c.from_builtin_tensor %254 : tensor<f32> -> !torch.vtensor<[],f32>
  %256 = torch.aten.where.self %253, %243, %255 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %257 = torch_c.to_builtin_tensor %256 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %258 = "tosa.reduce_max"(%257) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %259 = "tosa.argmax"(%257) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %260 = "tosa.reshape"(%259) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %261 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %262 = "tosa.mul"(%258, %261) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %263 = "tosa.sub"(%257, %262) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %264 = "tosa.exp"(%263) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %265 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %266 = "tosa.reduce_sum"(%264) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %267 = "tosa.reciprocal"(%266) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %268 = "tosa.mul"(%264, %267) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %269 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %270 = "tosa.reshape"(%268) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %271 = "tosa.reshape"(%227) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %272 = "tosa.matmul"(%270, %271) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %273 = tensor.cast %272 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %274 = "tosa.reshape"(%273) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %275 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %276 = "tosa.transpose"(%274, %275) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %277 = "tosa.cast"(%276) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %278 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %279 = "tosa.reshape"(%277) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %280 = "tosa.reshape"(%279) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %281 = "tosa.reshape"(%280) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %282 = "tosa.reshape"(%143) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %283 = "tosa.matmul"(%281, %282) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %284 = "tosa.reshape"(%283) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %285 = tensor.cast %284 : tensor<5x768xf32> to tensor<5x768xf32>
  %286 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %287 = "tosa.mul"(%144, %286) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %288 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %289 = "tosa.mul"(%285, %288) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %290 = "tosa.add"(%287, %289) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %291 = "tosa.reshape"(%290) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %292 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %293 = "tosa.mul"(%179, %292) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %294 = "tosa.add"(%291, %293) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %295 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %296 = "tosa.reciprocal"(%295) : (tensor<1xf32>) -> tensor<1xf32>
  %297 = "tosa.reduce_sum"(%294) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %298 = "tosa.reshape"(%297) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %299 = "tosa.mul"(%298, %296) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %300 = "tosa.sub"(%294, %299) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %301 = "tosa.mul"(%300, %300) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %302 = "tosa.reduce_sum"(%301) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %303 = "tosa.reshape"(%302) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %304 = "tosa.mul"(%303, %296) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %305 = "tosa.reshape"(%142) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %306 = "tosa.reshape"(%141) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %307 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %308 = "tosa.sub"(%294, %299) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %309 = "tosa.add"(%304, %307) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %310 = "tosa.rsqrt"(%309) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %311 = "tosa.mul"(%308, %310) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %312 = "tosa.mul"(%311, %305) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %313 = "tosa.add"(%312, %306) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %314 = "tosa.reshape"(%313) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %315 = "tosa.reshape"(%314) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %316 = "tosa.reshape"(%139) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %317 = "tosa.matmul"(%315, %316) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %318 = "tosa.reshape"(%317) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %319 = tensor.cast %318 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %320 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %321 = "tosa.mul"(%140, %320) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %322 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %323 = "tosa.mul"(%319, %322) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %324 = "tosa.add"(%321, %323) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %325 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %326 = "tosa.reshape"(%324) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %327 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %328 = "tosa.mul"(%326, %327) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %329 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %330 = "tosa.pow"(%326, %329) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %331 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %332 = "tosa.mul"(%330, %331) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %333 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %334 = "tosa.mul"(%332, %333) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %335 = "tosa.add"(%326, %334) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %336 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %337 = "tosa.mul"(%335, %336) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %338 = "tosa.tanh"(%337) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %339 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %340 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %341 = "tosa.mul"(%339, %340) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %342 = "tosa.add"(%338, %341) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %343 = "tosa.mul"(%328, %342) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %344 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
  %345 = "tosa.reshape"(%343) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %346 = "tosa.reshape"(%345) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %347 = "tosa.reshape"(%137) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %348 = "tosa.matmul"(%346, %347) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %349 = "tosa.reshape"(%348) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %350 = tensor.cast %349 : tensor<5x768xf32> to tensor<5x768xf32>
  %351 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %352 = "tosa.mul"(%138, %351) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %353 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %354 = "tosa.mul"(%350, %353) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %355 = "tosa.add"(%352, %354) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %356 = "tosa.reshape"(%355) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %357 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %358 = "tosa.mul"(%356, %357) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %359 = "tosa.add"(%294, %358) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %360 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %361 = "tosa.reciprocal"(%360) : (tensor<1xf32>) -> tensor<1xf32>
  %362 = "tosa.reduce_sum"(%359) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %363 = "tosa.reshape"(%362) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %364 = "tosa.mul"(%363, %361) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %365 = "tosa.sub"(%359, %364) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %366 = "tosa.mul"(%365, %365) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %367 = "tosa.reduce_sum"(%366) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %368 = "tosa.reshape"(%367) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %369 = "tosa.mul"(%368, %361) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %370 = "tosa.reshape"(%136) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %371 = "tosa.reshape"(%135) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %372 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %373 = "tosa.sub"(%359, %364) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %374 = "tosa.add"(%369, %372) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %375 = "tosa.rsqrt"(%374) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %376 = "tosa.mul"(%373, %375) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %377 = "tosa.mul"(%376, %370) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %378 = "tosa.add"(%377, %371) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %379 = "tosa.reshape"(%378) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %380 = "tosa.reshape"(%379) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %381 = "tosa.reshape"(%133) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %382 = "tosa.matmul"(%380, %381) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %383 = "tosa.reshape"(%382) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %384 = tensor.cast %383 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %385 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %386 = "tosa.mul"(%134, %385) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %387 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %388 = "tosa.mul"(%384, %387) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %389 = "tosa.add"(%386, %388) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %390 = "tosa.reshape"(%389) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %391 = "tosa.slice"(%390) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %392 = "tosa.slice"(%390) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %393 = "tosa.slice"(%390) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %394 = "tosa.reshape"(%391) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %395 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %396 = "tosa.transpose"(%394, %395) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %397 = "tosa.reshape"(%392) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %398 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %399 = "tosa.transpose"(%397, %398) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %400 = "tosa.reshape"(%393) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %401 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %402 = "tosa.transpose"(%400, %401) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %403 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %404 = "tosa.transpose"(%399, %403) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %405 = "tosa.reshape"(%396) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %406 = "tosa.reshape"(%404) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %407 = "tosa.matmul"(%405, %406) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %408 = tensor.cast %407 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %409 = "tosa.reshape"(%408) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %410 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %411 = "tosa.reciprocal"(%410) : (tensor<f32>) -> tensor<f32>
  %412 = "tosa.mul"(%409, %411) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %413 = torch_c.from_builtin_tensor %412 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %414 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %415 = "tosa.slice"(%414) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %416 = "tosa.slice"(%415) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %417 = "tosa.slice"(%416) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %418 = torch_c.from_builtin_tensor %417 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %419 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %420 = torch.aten.to.dtype %419, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %421 = torch_c.to_builtin_tensor %420 : !torch.vtensor<[],i1> -> tensor<i1>
  %422 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %423 = torch.valsem.aten.copy %420, %418, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %424 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %425 = torch_c.from_builtin_tensor %424 : tensor<f32> -> !torch.vtensor<[],f32>
  %426 = torch.aten.where.self %423, %413, %425 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %427 = torch_c.to_builtin_tensor %426 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %428 = "tosa.reduce_max"(%427) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %429 = "tosa.argmax"(%427) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %430 = "tosa.reshape"(%429) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %431 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %432 = "tosa.mul"(%428, %431) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %433 = "tosa.sub"(%427, %432) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %434 = "tosa.exp"(%433) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %435 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %436 = "tosa.reduce_sum"(%434) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %437 = "tosa.reciprocal"(%436) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %438 = "tosa.mul"(%434, %437) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %439 = "tosa.reshape"(%438) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %440 = "tosa.reshape"(%402) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %441 = "tosa.matmul"(%439, %440) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %442 = tensor.cast %441 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %443 = "tosa.reshape"(%442) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %444 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %445 = "tosa.transpose"(%443, %444) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %446 = "tosa.cast"(%445) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %447 = "tosa.reshape"(%446) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %448 = "tosa.reshape"(%447) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %449 = "tosa.reshape"(%448) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %450 = "tosa.reshape"(%131) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %451 = "tosa.matmul"(%449, %450) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %452 = "tosa.reshape"(%451) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %453 = tensor.cast %452 : tensor<5x768xf32> to tensor<5x768xf32>
  %454 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %455 = "tosa.mul"(%132, %454) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %456 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %457 = "tosa.mul"(%453, %456) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %458 = "tosa.add"(%455, %457) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %459 = "tosa.reshape"(%458) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %460 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %461 = "tosa.mul"(%359, %460) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %462 = "tosa.add"(%459, %461) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %463 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %464 = "tosa.reciprocal"(%463) : (tensor<1xf32>) -> tensor<1xf32>
  %465 = "tosa.reduce_sum"(%462) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %466 = "tosa.reshape"(%465) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %467 = "tosa.mul"(%466, %464) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %468 = "tosa.sub"(%462, %467) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %469 = "tosa.mul"(%468, %468) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %470 = "tosa.reduce_sum"(%469) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %471 = "tosa.reshape"(%470) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %472 = "tosa.mul"(%471, %464) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %473 = "tosa.reshape"(%130) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %474 = "tosa.reshape"(%129) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %475 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %476 = "tosa.sub"(%462, %467) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %477 = "tosa.add"(%472, %475) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %478 = "tosa.rsqrt"(%477) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %479 = "tosa.mul"(%476, %478) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %480 = "tosa.mul"(%479, %473) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %481 = "tosa.add"(%480, %474) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %482 = "tosa.reshape"(%481) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %483 = "tosa.reshape"(%482) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %484 = "tosa.reshape"(%127) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %485 = "tosa.matmul"(%483, %484) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %486 = "tosa.reshape"(%485) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %487 = tensor.cast %486 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %488 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %489 = "tosa.mul"(%128, %488) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %490 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %491 = "tosa.mul"(%487, %490) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %492 = "tosa.add"(%489, %491) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %493 = "tosa.reshape"(%492) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %494 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %495 = "tosa.mul"(%493, %494) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %496 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %497 = "tosa.pow"(%493, %496) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %498 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %499 = "tosa.mul"(%497, %498) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %500 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %501 = "tosa.mul"(%499, %500) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %502 = "tosa.add"(%493, %501) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %503 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %504 = "tosa.mul"(%502, %503) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %505 = "tosa.tanh"(%504) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %506 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %507 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %508 = "tosa.mul"(%506, %507) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %509 = "tosa.add"(%505, %508) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %510 = "tosa.mul"(%495, %509) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %511 = "tosa.reshape"(%510) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %512 = "tosa.reshape"(%511) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %513 = "tosa.reshape"(%125) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %514 = "tosa.matmul"(%512, %513) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %515 = "tosa.reshape"(%514) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %516 = tensor.cast %515 : tensor<5x768xf32> to tensor<5x768xf32>
  %517 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %518 = "tosa.mul"(%126, %517) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %519 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %520 = "tosa.mul"(%516, %519) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %521 = "tosa.add"(%518, %520) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %522 = "tosa.reshape"(%521) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %523 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %524 = "tosa.mul"(%522, %523) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %525 = "tosa.add"(%462, %524) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %526 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %527 = "tosa.reciprocal"(%526) : (tensor<1xf32>) -> tensor<1xf32>
  %528 = "tosa.reduce_sum"(%525) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %529 = "tosa.reshape"(%528) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %530 = "tosa.mul"(%529, %527) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %531 = "tosa.sub"(%525, %530) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %532 = "tosa.mul"(%531, %531) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %533 = "tosa.reduce_sum"(%532) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %534 = "tosa.reshape"(%533) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %535 = "tosa.mul"(%534, %527) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %536 = "tosa.reshape"(%124) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %537 = "tosa.reshape"(%123) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %538 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %539 = "tosa.sub"(%525, %530) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %540 = "tosa.add"(%535, %538) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %541 = "tosa.rsqrt"(%540) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %542 = "tosa.mul"(%539, %541) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %543 = "tosa.mul"(%542, %536) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %544 = "tosa.add"(%543, %537) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.reshape"(%544) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %546 = "tosa.reshape"(%545) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %547 = "tosa.reshape"(%121) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %548 = "tosa.matmul"(%546, %547) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %549 = "tosa.reshape"(%548) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %550 = tensor.cast %549 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %551 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %552 = "tosa.mul"(%122, %551) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %553 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %554 = "tosa.mul"(%550, %553) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %555 = "tosa.add"(%552, %554) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %556 = "tosa.reshape"(%555) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %557 = "tosa.slice"(%556) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %558 = "tosa.slice"(%556) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %559 = "tosa.slice"(%556) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %560 = "tosa.reshape"(%557) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %561 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %562 = "tosa.transpose"(%560, %561) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %563 = "tosa.reshape"(%558) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %564 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %565 = "tosa.transpose"(%563, %564) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %566 = "tosa.reshape"(%559) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %567 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %568 = "tosa.transpose"(%566, %567) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %569 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %570 = "tosa.transpose"(%565, %569) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %571 = "tosa.reshape"(%562) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %572 = "tosa.reshape"(%570) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %573 = "tosa.matmul"(%571, %572) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %574 = tensor.cast %573 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %575 = "tosa.reshape"(%574) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %576 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %577 = "tosa.reciprocal"(%576) : (tensor<f32>) -> tensor<f32>
  %578 = "tosa.mul"(%575, %577) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %579 = torch_c.from_builtin_tensor %578 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %580 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %581 = "tosa.slice"(%580) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %582 = "tosa.slice"(%581) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %583 = "tosa.slice"(%582) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %584 = torch_c.from_builtin_tensor %583 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %585 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %586 = torch.aten.to.dtype %585, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %587 = torch_c.to_builtin_tensor %586 : !torch.vtensor<[],i1> -> tensor<i1>
  %588 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %589 = torch.valsem.aten.copy %586, %584, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %590 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %591 = torch_c.from_builtin_tensor %590 : tensor<f32> -> !torch.vtensor<[],f32>
  %592 = torch.aten.where.self %589, %579, %591 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %593 = torch_c.to_builtin_tensor %592 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %594 = "tosa.reduce_max"(%593) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %595 = "tosa.argmax"(%593) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %596 = "tosa.reshape"(%595) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %597 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %598 = "tosa.mul"(%594, %597) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %599 = "tosa.sub"(%593, %598) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %600 = "tosa.exp"(%599) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %601 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %602 = "tosa.reduce_sum"(%600) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %603 = "tosa.reciprocal"(%602) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %604 = "tosa.mul"(%600, %603) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %605 = "tosa.reshape"(%604) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %606 = "tosa.reshape"(%568) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %607 = "tosa.matmul"(%605, %606) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %608 = tensor.cast %607 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %609 = "tosa.reshape"(%608) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %610 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %611 = "tosa.transpose"(%609, %610) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %612 = "tosa.cast"(%611) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %613 = "tosa.reshape"(%612) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %614 = "tosa.reshape"(%613) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %615 = "tosa.reshape"(%614) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %616 = "tosa.reshape"(%119) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %617 = "tosa.matmul"(%615, %616) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %618 = "tosa.reshape"(%617) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %619 = tensor.cast %618 : tensor<5x768xf32> to tensor<5x768xf32>
  %620 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %621 = "tosa.mul"(%120, %620) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %622 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %623 = "tosa.mul"(%619, %622) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %624 = "tosa.add"(%621, %623) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %625 = "tosa.reshape"(%624) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %626 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %627 = "tosa.mul"(%525, %626) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %628 = "tosa.add"(%625, %627) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %629 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %630 = "tosa.reciprocal"(%629) : (tensor<1xf32>) -> tensor<1xf32>
  %631 = "tosa.reduce_sum"(%628) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %632 = "tosa.reshape"(%631) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %633 = "tosa.mul"(%632, %630) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %634 = "tosa.sub"(%628, %633) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %635 = "tosa.mul"(%634, %634) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %636 = "tosa.reduce_sum"(%635) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %637 = "tosa.reshape"(%636) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %638 = "tosa.mul"(%637, %630) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %639 = "tosa.reshape"(%118) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %640 = "tosa.reshape"(%117) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %641 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %642 = "tosa.sub"(%628, %633) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %643 = "tosa.add"(%638, %641) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %644 = "tosa.rsqrt"(%643) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %645 = "tosa.mul"(%642, %644) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %646 = "tosa.mul"(%645, %639) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %647 = "tosa.add"(%646, %640) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %648 = "tosa.reshape"(%647) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %649 = "tosa.reshape"(%648) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %650 = "tosa.reshape"(%115) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %651 = "tosa.matmul"(%649, %650) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %652 = "tosa.reshape"(%651) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %653 = tensor.cast %652 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %654 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %655 = "tosa.mul"(%116, %654) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %656 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %657 = "tosa.mul"(%653, %656) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %658 = "tosa.add"(%655, %657) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %659 = "tosa.reshape"(%658) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %660 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %661 = "tosa.mul"(%659, %660) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %662 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %663 = "tosa.pow"(%659, %662) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %664 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %665 = "tosa.mul"(%663, %664) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %666 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %667 = "tosa.mul"(%665, %666) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %668 = "tosa.add"(%659, %667) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %669 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %670 = "tosa.mul"(%668, %669) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %671 = "tosa.tanh"(%670) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %672 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %673 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %674 = "tosa.mul"(%672, %673) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %675 = "tosa.add"(%671, %674) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %676 = "tosa.mul"(%661, %675) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %677 = "tosa.reshape"(%676) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %678 = "tosa.reshape"(%677) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %679 = "tosa.reshape"(%113) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %680 = "tosa.matmul"(%678, %679) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %681 = "tosa.reshape"(%680) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %682 = tensor.cast %681 : tensor<5x768xf32> to tensor<5x768xf32>
  %683 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %684 = "tosa.mul"(%114, %683) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %685 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %686 = "tosa.mul"(%682, %685) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %687 = "tosa.add"(%684, %686) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %688 = "tosa.reshape"(%687) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %689 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %690 = "tosa.mul"(%688, %689) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %691 = "tosa.add"(%628, %690) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %692 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %693 = "tosa.reciprocal"(%692) : (tensor<1xf32>) -> tensor<1xf32>
  %694 = "tosa.reduce_sum"(%691) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %695 = "tosa.reshape"(%694) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %696 = "tosa.mul"(%695, %693) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %697 = "tosa.sub"(%691, %696) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %698 = "tosa.mul"(%697, %697) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %699 = "tosa.reduce_sum"(%698) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %700 = "tosa.reshape"(%699) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %701 = "tosa.mul"(%700, %693) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %702 = "tosa.reshape"(%112) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %703 = "tosa.reshape"(%111) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %704 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %705 = "tosa.sub"(%691, %696) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %706 = "tosa.add"(%701, %704) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %707 = "tosa.rsqrt"(%706) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %708 = "tosa.mul"(%705, %707) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %709 = "tosa.mul"(%708, %702) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %710 = "tosa.add"(%709, %703) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %711 = "tosa.reshape"(%710) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %712 = "tosa.reshape"(%711) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %713 = "tosa.reshape"(%109) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %714 = "tosa.matmul"(%712, %713) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %715 = "tosa.reshape"(%714) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %716 = tensor.cast %715 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %717 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %718 = "tosa.mul"(%110, %717) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %719 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %720 = "tosa.mul"(%716, %719) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %721 = "tosa.add"(%718, %720) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %722 = "tosa.reshape"(%721) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %723 = "tosa.slice"(%722) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %724 = "tosa.slice"(%722) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %725 = "tosa.slice"(%722) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %726 = "tosa.reshape"(%723) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %727 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %728 = "tosa.transpose"(%726, %727) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %729 = "tosa.reshape"(%724) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %730 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %731 = "tosa.transpose"(%729, %730) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %732 = "tosa.reshape"(%725) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %733 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %734 = "tosa.transpose"(%732, %733) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %735 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %736 = "tosa.transpose"(%731, %735) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %737 = "tosa.reshape"(%728) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %738 = "tosa.reshape"(%736) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %739 = "tosa.matmul"(%737, %738) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %740 = tensor.cast %739 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %741 = "tosa.reshape"(%740) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %742 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %743 = "tosa.reciprocal"(%742) : (tensor<f32>) -> tensor<f32>
  %744 = "tosa.mul"(%741, %743) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %745 = torch_c.from_builtin_tensor %744 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %746 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %747 = "tosa.slice"(%746) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %748 = "tosa.slice"(%747) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %749 = "tosa.slice"(%748) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %750 = torch_c.from_builtin_tensor %749 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %751 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %752 = torch.aten.to.dtype %751, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %753 = torch_c.to_builtin_tensor %752 : !torch.vtensor<[],i1> -> tensor<i1>
  %754 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %755 = torch.valsem.aten.copy %752, %750, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %756 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %757 = torch_c.from_builtin_tensor %756 : tensor<f32> -> !torch.vtensor<[],f32>
  %758 = torch.aten.where.self %755, %745, %757 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %759 = torch_c.to_builtin_tensor %758 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %760 = "tosa.reduce_max"(%759) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %761 = "tosa.argmax"(%759) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %762 = "tosa.reshape"(%761) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %763 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %764 = "tosa.mul"(%760, %763) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %765 = "tosa.sub"(%759, %764) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %766 = "tosa.exp"(%765) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %767 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %768 = "tosa.reduce_sum"(%766) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %769 = "tosa.reciprocal"(%768) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %770 = "tosa.mul"(%766, %769) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %771 = "tosa.reshape"(%770) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %772 = "tosa.reshape"(%734) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %773 = "tosa.matmul"(%771, %772) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %774 = tensor.cast %773 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %775 = "tosa.reshape"(%774) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %776 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %777 = "tosa.transpose"(%775, %776) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %778 = "tosa.cast"(%777) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %779 = "tosa.reshape"(%778) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %780 = "tosa.reshape"(%779) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %781 = "tosa.reshape"(%780) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %782 = "tosa.reshape"(%107) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %783 = "tosa.matmul"(%781, %782) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %784 = "tosa.reshape"(%783) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %785 = tensor.cast %784 : tensor<5x768xf32> to tensor<5x768xf32>
  %786 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %787 = "tosa.mul"(%108, %786) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %788 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %789 = "tosa.mul"(%785, %788) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %790 = "tosa.add"(%787, %789) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %791 = "tosa.reshape"(%790) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %792 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %793 = "tosa.mul"(%691, %792) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %794 = "tosa.add"(%791, %793) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %795 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %796 = "tosa.reciprocal"(%795) : (tensor<1xf32>) -> tensor<1xf32>
  %797 = "tosa.reduce_sum"(%794) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %798 = "tosa.reshape"(%797) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %799 = "tosa.mul"(%798, %796) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %800 = "tosa.sub"(%794, %799) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %801 = "tosa.mul"(%800, %800) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %802 = "tosa.reduce_sum"(%801) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %803 = "tosa.reshape"(%802) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %804 = "tosa.mul"(%803, %796) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %805 = "tosa.reshape"(%106) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %806 = "tosa.reshape"(%105) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %807 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %808 = "tosa.sub"(%794, %799) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %809 = "tosa.add"(%804, %807) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %810 = "tosa.rsqrt"(%809) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %811 = "tosa.mul"(%808, %810) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %812 = "tosa.mul"(%811, %805) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %813 = "tosa.add"(%812, %806) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %814 = "tosa.reshape"(%813) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %815 = "tosa.reshape"(%814) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %816 = "tosa.reshape"(%103) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %817 = "tosa.matmul"(%815, %816) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %818 = "tosa.reshape"(%817) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %819 = tensor.cast %818 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %820 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %821 = "tosa.mul"(%104, %820) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %822 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %823 = "tosa.mul"(%819, %822) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %824 = "tosa.add"(%821, %823) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %825 = "tosa.reshape"(%824) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %826 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %827 = "tosa.mul"(%825, %826) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %828 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %829 = "tosa.pow"(%825, %828) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %830 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %831 = "tosa.mul"(%829, %830) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %832 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %833 = "tosa.mul"(%831, %832) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %834 = "tosa.add"(%825, %833) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %835 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %836 = "tosa.mul"(%834, %835) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %837 = "tosa.tanh"(%836) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %838 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %839 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %840 = "tosa.mul"(%838, %839) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %841 = "tosa.add"(%837, %840) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %842 = "tosa.mul"(%827, %841) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %843 = "tosa.reshape"(%842) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %844 = "tosa.reshape"(%843) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %845 = "tosa.reshape"(%101) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %846 = "tosa.matmul"(%844, %845) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %847 = "tosa.reshape"(%846) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %848 = tensor.cast %847 : tensor<5x768xf32> to tensor<5x768xf32>
  %849 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %850 = "tosa.mul"(%102, %849) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %851 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %852 = "tosa.mul"(%848, %851) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %853 = "tosa.add"(%850, %852) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %854 = "tosa.reshape"(%853) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %855 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %856 = "tosa.mul"(%854, %855) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %857 = "tosa.add"(%794, %856) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %858 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %859 = "tosa.reciprocal"(%858) : (tensor<1xf32>) -> tensor<1xf32>
  %860 = "tosa.reduce_sum"(%857) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %861 = "tosa.reshape"(%860) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %862 = "tosa.mul"(%861, %859) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %863 = "tosa.sub"(%857, %862) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %864 = "tosa.mul"(%863, %863) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %865 = "tosa.reduce_sum"(%864) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %866 = "tosa.reshape"(%865) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %867 = "tosa.mul"(%866, %859) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %868 = "tosa.reshape"(%100) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %869 = "tosa.reshape"(%99) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %870 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %871 = "tosa.sub"(%857, %862) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %872 = "tosa.add"(%867, %870) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %873 = "tosa.rsqrt"(%872) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %874 = "tosa.mul"(%871, %873) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %875 = "tosa.mul"(%874, %868) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %876 = "tosa.add"(%875, %869) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %877 = "tosa.reshape"(%876) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %878 = "tosa.reshape"(%877) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %879 = "tosa.reshape"(%97) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %880 = "tosa.matmul"(%878, %879) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %881 = "tosa.reshape"(%880) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %882 = tensor.cast %881 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %883 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %884 = "tosa.mul"(%98, %883) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %885 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %886 = "tosa.mul"(%882, %885) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %887 = "tosa.add"(%884, %886) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %888 = "tosa.reshape"(%887) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %889 = "tosa.slice"(%888) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %890 = "tosa.slice"(%888) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %891 = "tosa.slice"(%888) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %892 = "tosa.reshape"(%889) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %893 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %894 = "tosa.transpose"(%892, %893) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %895 = "tosa.reshape"(%890) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %896 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %897 = "tosa.transpose"(%895, %896) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %898 = "tosa.reshape"(%891) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %899 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %900 = "tosa.transpose"(%898, %899) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %901 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %902 = "tosa.transpose"(%897, %901) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %903 = "tosa.reshape"(%894) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %904 = "tosa.reshape"(%902) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %905 = "tosa.matmul"(%903, %904) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %906 = tensor.cast %905 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %907 = "tosa.reshape"(%906) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %908 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %909 = "tosa.reciprocal"(%908) : (tensor<f32>) -> tensor<f32>
  %910 = "tosa.mul"(%907, %909) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %911 = torch_c.from_builtin_tensor %910 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %912 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %913 = "tosa.slice"(%912) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %914 = "tosa.slice"(%913) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %915 = "tosa.slice"(%914) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %916 = torch_c.from_builtin_tensor %915 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %917 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %918 = torch.aten.to.dtype %917, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %919 = torch_c.to_builtin_tensor %918 : !torch.vtensor<[],i1> -> tensor<i1>
  %920 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %921 = torch.valsem.aten.copy %918, %916, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %922 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %923 = torch_c.from_builtin_tensor %922 : tensor<f32> -> !torch.vtensor<[],f32>
  %924 = torch.aten.where.self %921, %911, %923 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %925 = torch_c.to_builtin_tensor %924 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %926 = "tosa.reduce_max"(%925) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %927 = "tosa.argmax"(%925) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %928 = "tosa.reshape"(%927) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %929 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %930 = "tosa.mul"(%926, %929) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %931 = "tosa.sub"(%925, %930) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %932 = "tosa.exp"(%931) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %933 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %934 = "tosa.reduce_sum"(%932) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %935 = "tosa.reciprocal"(%934) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %936 = "tosa.mul"(%932, %935) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %937 = "tosa.reshape"(%936) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %938 = "tosa.reshape"(%900) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %939 = "tosa.matmul"(%937, %938) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %940 = tensor.cast %939 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %941 = "tosa.reshape"(%940) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %942 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %943 = "tosa.transpose"(%941, %942) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %944 = "tosa.cast"(%943) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %945 = "tosa.reshape"(%944) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %946 = "tosa.reshape"(%945) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %947 = "tosa.reshape"(%946) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %948 = "tosa.reshape"(%95) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %949 = "tosa.matmul"(%947, %948) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %950 = "tosa.reshape"(%949) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %951 = tensor.cast %950 : tensor<5x768xf32> to tensor<5x768xf32>
  %952 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %953 = "tosa.mul"(%96, %952) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %954 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %955 = "tosa.mul"(%951, %954) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %956 = "tosa.add"(%953, %955) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %957 = "tosa.reshape"(%956) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %958 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %959 = "tosa.mul"(%857, %958) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %960 = "tosa.add"(%957, %959) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %961 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %962 = "tosa.reciprocal"(%961) : (tensor<1xf32>) -> tensor<1xf32>
  %963 = "tosa.reduce_sum"(%960) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %964 = "tosa.reshape"(%963) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %965 = "tosa.mul"(%964, %962) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %966 = "tosa.sub"(%960, %965) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %967 = "tosa.mul"(%966, %966) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %968 = "tosa.reduce_sum"(%967) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %969 = "tosa.reshape"(%968) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %970 = "tosa.mul"(%969, %962) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %971 = "tosa.reshape"(%94) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %972 = "tosa.reshape"(%93) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %973 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %974 = "tosa.sub"(%960, %965) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %975 = "tosa.add"(%970, %973) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %976 = "tosa.rsqrt"(%975) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %977 = "tosa.mul"(%974, %976) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %978 = "tosa.mul"(%977, %971) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %979 = "tosa.add"(%978, %972) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %980 = "tosa.reshape"(%979) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %981 = "tosa.reshape"(%980) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %982 = "tosa.reshape"(%91) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %983 = "tosa.matmul"(%981, %982) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %984 = "tosa.reshape"(%983) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %985 = tensor.cast %984 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %986 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %987 = "tosa.mul"(%92, %986) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %988 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %989 = "tosa.mul"(%985, %988) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %990 = "tosa.add"(%987, %989) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %991 = "tosa.reshape"(%990) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %992 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %993 = "tosa.mul"(%991, %992) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %994 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %995 = "tosa.pow"(%991, %994) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %996 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %997 = "tosa.mul"(%995, %996) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %998 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %999 = "tosa.mul"(%997, %998) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1000 = "tosa.add"(%991, %999) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1001 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1002 = "tosa.mul"(%1000, %1001) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1003 = "tosa.tanh"(%1002) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1004 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1005 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1006 = "tosa.mul"(%1004, %1005) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1007 = "tosa.add"(%1003, %1006) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1008 = "tosa.mul"(%993, %1007) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1009 = "tosa.reshape"(%1008) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1010 = "tosa.reshape"(%1009) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1011 = "tosa.reshape"(%89) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1012 = "tosa.matmul"(%1010, %1011) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1013 = "tosa.reshape"(%1012) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1014 = tensor.cast %1013 : tensor<5x768xf32> to tensor<5x768xf32>
  %1015 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1016 = "tosa.mul"(%90, %1015) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1017 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1018 = "tosa.mul"(%1014, %1017) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1019 = "tosa.add"(%1016, %1018) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1020 = "tosa.reshape"(%1019) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1021 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1022 = "tosa.mul"(%1020, %1021) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1023 = "tosa.add"(%960, %1022) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1024 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1025 = "tosa.reciprocal"(%1024) : (tensor<1xf32>) -> tensor<1xf32>
  %1026 = "tosa.reduce_sum"(%1023) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1027 = "tosa.reshape"(%1026) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1028 = "tosa.mul"(%1027, %1025) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1029 = "tosa.sub"(%1023, %1028) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1030 = "tosa.mul"(%1029, %1029) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1031 = "tosa.reduce_sum"(%1030) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1032 = "tosa.reshape"(%1031) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1033 = "tosa.mul"(%1032, %1025) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1034 = "tosa.reshape"(%88) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1035 = "tosa.reshape"(%87) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1036 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1037 = "tosa.sub"(%1023, %1028) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1038 = "tosa.add"(%1033, %1036) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1039 = "tosa.rsqrt"(%1038) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1040 = "tosa.mul"(%1037, %1039) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1041 = "tosa.mul"(%1040, %1034) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1042 = "tosa.add"(%1041, %1035) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1043 = "tosa.reshape"(%1042) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1044 = "tosa.reshape"(%1043) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1045 = "tosa.reshape"(%85) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1046 = "tosa.matmul"(%1044, %1045) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1047 = "tosa.reshape"(%1046) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1048 = tensor.cast %1047 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1049 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1050 = "tosa.mul"(%86, %1049) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1051 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1052 = "tosa.mul"(%1048, %1051) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1053 = "tosa.add"(%1050, %1052) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1054 = "tosa.reshape"(%1053) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1055 = "tosa.slice"(%1054) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1056 = "tosa.slice"(%1054) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1057 = "tosa.slice"(%1054) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1058 = "tosa.reshape"(%1055) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1059 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1060 = "tosa.transpose"(%1058, %1059) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1061 = "tosa.reshape"(%1056) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1062 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1063 = "tosa.transpose"(%1061, %1062) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1064 = "tosa.reshape"(%1057) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1065 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1066 = "tosa.transpose"(%1064, %1065) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1067 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1068 = "tosa.transpose"(%1063, %1067) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1069 = "tosa.reshape"(%1060) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1070 = "tosa.reshape"(%1068) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1071 = "tosa.matmul"(%1069, %1070) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1072 = tensor.cast %1071 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1073 = "tosa.reshape"(%1072) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1074 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1075 = "tosa.reciprocal"(%1074) : (tensor<f32>) -> tensor<f32>
  %1076 = "tosa.mul"(%1073, %1075) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1077 = torch_c.from_builtin_tensor %1076 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1078 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1079 = "tosa.slice"(%1078) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1080 = "tosa.slice"(%1079) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1081 = "tosa.slice"(%1080) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1082 = torch_c.from_builtin_tensor %1081 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1083 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1084 = torch.aten.to.dtype %1083, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1085 = torch_c.to_builtin_tensor %1084 : !torch.vtensor<[],i1> -> tensor<i1>
  %1086 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1087 = torch.valsem.aten.copy %1084, %1082, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1088 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1089 = torch_c.from_builtin_tensor %1088 : tensor<f32> -> !torch.vtensor<[],f32>
  %1090 = torch.aten.where.self %1087, %1077, %1089 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1091 = torch_c.to_builtin_tensor %1090 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1092 = "tosa.reduce_max"(%1091) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1093 = "tosa.argmax"(%1091) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1094 = "tosa.reshape"(%1093) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1095 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1096 = "tosa.mul"(%1092, %1095) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1097 = "tosa.sub"(%1091, %1096) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1098 = "tosa.exp"(%1097) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1099 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1100 = "tosa.reduce_sum"(%1098) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1101 = "tosa.reciprocal"(%1100) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1102 = "tosa.mul"(%1098, %1101) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1103 = "tosa.reshape"(%1102) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1104 = "tosa.reshape"(%1066) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1105 = "tosa.matmul"(%1103, %1104) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1106 = tensor.cast %1105 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1107 = "tosa.reshape"(%1106) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1108 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1109 = "tosa.transpose"(%1107, %1108) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1110 = "tosa.cast"(%1109) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1111 = "tosa.reshape"(%1110) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1112 = "tosa.reshape"(%1111) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1113 = "tosa.reshape"(%1112) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1114 = "tosa.reshape"(%83) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1115 = "tosa.matmul"(%1113, %1114) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1116 = "tosa.reshape"(%1115) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1117 = tensor.cast %1116 : tensor<5x768xf32> to tensor<5x768xf32>
  %1118 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1119 = "tosa.mul"(%84, %1118) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1120 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1121 = "tosa.mul"(%1117, %1120) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1122 = "tosa.add"(%1119, %1121) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1123 = "tosa.reshape"(%1122) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1124 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1125 = "tosa.mul"(%1023, %1124) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1126 = "tosa.add"(%1123, %1125) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1127 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1128 = "tosa.reciprocal"(%1127) : (tensor<1xf32>) -> tensor<1xf32>
  %1129 = "tosa.reduce_sum"(%1126) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1130 = "tosa.reshape"(%1129) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1131 = "tosa.mul"(%1130, %1128) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1132 = "tosa.sub"(%1126, %1131) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1133 = "tosa.mul"(%1132, %1132) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1134 = "tosa.reduce_sum"(%1133) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1135 = "tosa.reshape"(%1134) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1136 = "tosa.mul"(%1135, %1128) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1137 = "tosa.reshape"(%82) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1138 = "tosa.reshape"(%81) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1139 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1140 = "tosa.sub"(%1126, %1131) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1141 = "tosa.add"(%1136, %1139) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1142 = "tosa.rsqrt"(%1141) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1143 = "tosa.mul"(%1140, %1142) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1144 = "tosa.mul"(%1143, %1137) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1145 = "tosa.add"(%1144, %1138) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1146 = "tosa.reshape"(%1145) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1147 = "tosa.reshape"(%1146) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1148 = "tosa.reshape"(%79) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1149 = "tosa.matmul"(%1147, %1148) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1150 = "tosa.reshape"(%1149) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1151 = tensor.cast %1150 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1152 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1153 = "tosa.mul"(%80, %1152) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1154 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1155 = "tosa.mul"(%1151, %1154) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1156 = "tosa.add"(%1153, %1155) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1157 = "tosa.reshape"(%1156) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1158 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1159 = "tosa.mul"(%1157, %1158) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1160 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1161 = "tosa.pow"(%1157, %1160) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1162 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1163 = "tosa.mul"(%1161, %1162) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1164 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1165 = "tosa.mul"(%1163, %1164) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1166 = "tosa.add"(%1157, %1165) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1167 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1168 = "tosa.mul"(%1166, %1167) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1169 = "tosa.tanh"(%1168) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1170 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1171 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1172 = "tosa.mul"(%1170, %1171) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1173 = "tosa.add"(%1169, %1172) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1174 = "tosa.mul"(%1159, %1173) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1175 = "tosa.reshape"(%1174) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1176 = "tosa.reshape"(%1175) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1177 = "tosa.reshape"(%77) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1178 = "tosa.matmul"(%1176, %1177) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1179 = "tosa.reshape"(%1178) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1180 = tensor.cast %1179 : tensor<5x768xf32> to tensor<5x768xf32>
  %1181 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1182 = "tosa.mul"(%78, %1181) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1183 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1184 = "tosa.mul"(%1180, %1183) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1185 = "tosa.add"(%1182, %1184) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1186 = "tosa.reshape"(%1185) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1187 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1188 = "tosa.mul"(%1186, %1187) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1189 = "tosa.add"(%1126, %1188) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1190 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1191 = "tosa.reciprocal"(%1190) : (tensor<1xf32>) -> tensor<1xf32>
  %1192 = "tosa.reduce_sum"(%1189) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1193 = "tosa.reshape"(%1192) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1194 = "tosa.mul"(%1193, %1191) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1195 = "tosa.sub"(%1189, %1194) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1196 = "tosa.mul"(%1195, %1195) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1197 = "tosa.reduce_sum"(%1196) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1198 = "tosa.reshape"(%1197) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1199 = "tosa.mul"(%1198, %1191) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1200 = "tosa.reshape"(%76) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1201 = "tosa.reshape"(%75) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1202 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1203 = "tosa.sub"(%1189, %1194) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1204 = "tosa.add"(%1199, %1202) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1205 = "tosa.rsqrt"(%1204) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1206 = "tosa.mul"(%1203, %1205) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1207 = "tosa.mul"(%1206, %1200) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1208 = "tosa.add"(%1207, %1201) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1209 = "tosa.reshape"(%1208) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1210 = "tosa.reshape"(%1209) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1211 = "tosa.reshape"(%73) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1212 = "tosa.matmul"(%1210, %1211) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1213 = "tosa.reshape"(%1212) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1214 = tensor.cast %1213 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1215 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1216 = "tosa.mul"(%74, %1215) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1217 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1218 = "tosa.mul"(%1214, %1217) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1219 = "tosa.add"(%1216, %1218) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1220 = "tosa.reshape"(%1219) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1221 = "tosa.slice"(%1220) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1222 = "tosa.slice"(%1220) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1223 = "tosa.slice"(%1220) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1224 = "tosa.reshape"(%1221) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1225 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1226 = "tosa.transpose"(%1224, %1225) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1227 = "tosa.reshape"(%1222) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1228 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1229 = "tosa.transpose"(%1227, %1228) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1230 = "tosa.reshape"(%1223) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1231 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1232 = "tosa.transpose"(%1230, %1231) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1233 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1234 = "tosa.transpose"(%1229, %1233) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1235 = "tosa.reshape"(%1226) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1236 = "tosa.reshape"(%1234) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1237 = "tosa.matmul"(%1235, %1236) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1238 = tensor.cast %1237 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1239 = "tosa.reshape"(%1238) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1240 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1241 = "tosa.reciprocal"(%1240) : (tensor<f32>) -> tensor<f32>
  %1242 = "tosa.mul"(%1239, %1241) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1243 = torch_c.from_builtin_tensor %1242 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1244 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1245 = "tosa.slice"(%1244) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1246 = "tosa.slice"(%1245) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1247 = "tosa.slice"(%1246) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1248 = torch_c.from_builtin_tensor %1247 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1249 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1250 = torch.aten.to.dtype %1249, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1251 = torch_c.to_builtin_tensor %1250 : !torch.vtensor<[],i1> -> tensor<i1>
  %1252 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1253 = torch.valsem.aten.copy %1250, %1248, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1254 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1255 = torch_c.from_builtin_tensor %1254 : tensor<f32> -> !torch.vtensor<[],f32>
  %1256 = torch.aten.where.self %1253, %1243, %1255 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1257 = torch_c.to_builtin_tensor %1256 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1258 = "tosa.reduce_max"(%1257) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1259 = "tosa.argmax"(%1257) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1260 = "tosa.reshape"(%1259) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1261 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1262 = "tosa.mul"(%1258, %1261) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1263 = "tosa.sub"(%1257, %1262) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1264 = "tosa.exp"(%1263) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1265 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1266 = "tosa.reduce_sum"(%1264) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1267 = "tosa.reciprocal"(%1266) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1268 = "tosa.mul"(%1264, %1267) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1269 = "tosa.reshape"(%1268) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1270 = "tosa.reshape"(%1232) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1271 = "tosa.matmul"(%1269, %1270) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1272 = tensor.cast %1271 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1273 = "tosa.reshape"(%1272) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1274 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1275 = "tosa.transpose"(%1273, %1274) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1276 = "tosa.cast"(%1275) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1277 = "tosa.reshape"(%1276) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1278 = "tosa.reshape"(%1277) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1279 = "tosa.reshape"(%1278) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1280 = "tosa.reshape"(%71) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1281 = "tosa.matmul"(%1279, %1280) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1282 = "tosa.reshape"(%1281) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1283 = tensor.cast %1282 : tensor<5x768xf32> to tensor<5x768xf32>
  %1284 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1285 = "tosa.mul"(%72, %1284) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1286 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1287 = "tosa.mul"(%1283, %1286) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1288 = "tosa.add"(%1285, %1287) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1289 = "tosa.reshape"(%1288) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1290 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1291 = "tosa.mul"(%1189, %1290) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1292 = "tosa.add"(%1289, %1291) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1293 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1294 = "tosa.reciprocal"(%1293) : (tensor<1xf32>) -> tensor<1xf32>
  %1295 = "tosa.reduce_sum"(%1292) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1296 = "tosa.reshape"(%1295) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1297 = "tosa.mul"(%1296, %1294) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1298 = "tosa.sub"(%1292, %1297) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1299 = "tosa.mul"(%1298, %1298) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1300 = "tosa.reduce_sum"(%1299) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1301 = "tosa.reshape"(%1300) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1302 = "tosa.mul"(%1301, %1294) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1303 = "tosa.reshape"(%70) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1304 = "tosa.reshape"(%69) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1305 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1306 = "tosa.sub"(%1292, %1297) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1307 = "tosa.add"(%1302, %1305) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1308 = "tosa.rsqrt"(%1307) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1309 = "tosa.mul"(%1306, %1308) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1310 = "tosa.mul"(%1309, %1303) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1311 = "tosa.add"(%1310, %1304) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1312 = "tosa.reshape"(%1311) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1313 = "tosa.reshape"(%1312) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1314 = "tosa.reshape"(%67) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1315 = "tosa.matmul"(%1313, %1314) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1316 = "tosa.reshape"(%1315) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1317 = tensor.cast %1316 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1318 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1319 = "tosa.mul"(%68, %1318) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1320 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1321 = "tosa.mul"(%1317, %1320) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1322 = "tosa.add"(%1319, %1321) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1323 = "tosa.reshape"(%1322) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1324 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1325 = "tosa.mul"(%1323, %1324) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1326 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1327 = "tosa.pow"(%1323, %1326) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1328 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1329 = "tosa.mul"(%1327, %1328) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1330 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1331 = "tosa.mul"(%1329, %1330) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1332 = "tosa.add"(%1323, %1331) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1333 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1334 = "tosa.mul"(%1332, %1333) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1335 = "tosa.tanh"(%1334) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1336 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1337 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1338 = "tosa.mul"(%1336, %1337) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1339 = "tosa.add"(%1335, %1338) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1340 = "tosa.mul"(%1325, %1339) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1341 = "tosa.reshape"(%1340) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1342 = "tosa.reshape"(%1341) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1343 = "tosa.reshape"(%65) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1344 = "tosa.matmul"(%1342, %1343) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1345 = "tosa.reshape"(%1344) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1346 = tensor.cast %1345 : tensor<5x768xf32> to tensor<5x768xf32>
  %1347 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1348 = "tosa.mul"(%66, %1347) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1349 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1350 = "tosa.mul"(%1346, %1349) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1351 = "tosa.add"(%1348, %1350) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1352 = "tosa.reshape"(%1351) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1353 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1354 = "tosa.mul"(%1352, %1353) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1355 = "tosa.add"(%1292, %1354) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1356 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1357 = "tosa.reciprocal"(%1356) : (tensor<1xf32>) -> tensor<1xf32>
  %1358 = "tosa.reduce_sum"(%1355) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1359 = "tosa.reshape"(%1358) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1360 = "tosa.mul"(%1359, %1357) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1361 = "tosa.sub"(%1355, %1360) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1362 = "tosa.mul"(%1361, %1361) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1363 = "tosa.reduce_sum"(%1362) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1364 = "tosa.reshape"(%1363) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1365 = "tosa.mul"(%1364, %1357) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1366 = "tosa.reshape"(%64) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1367 = "tosa.reshape"(%63) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1368 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1369 = "tosa.sub"(%1355, %1360) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1370 = "tosa.add"(%1365, %1368) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1371 = "tosa.rsqrt"(%1370) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1372 = "tosa.mul"(%1369, %1371) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1373 = "tosa.mul"(%1372, %1366) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1374 = "tosa.add"(%1373, %1367) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1375 = "tosa.reshape"(%1374) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1376 = "tosa.reshape"(%1375) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1377 = "tosa.reshape"(%61) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1378 = "tosa.matmul"(%1376, %1377) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1379 = "tosa.reshape"(%1378) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1380 = tensor.cast %1379 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1381 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1382 = "tosa.mul"(%62, %1381) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1383 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1384 = "tosa.mul"(%1380, %1383) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1385 = "tosa.add"(%1382, %1384) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1386 = "tosa.reshape"(%1385) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1387 = "tosa.slice"(%1386) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1388 = "tosa.slice"(%1386) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1389 = "tosa.slice"(%1386) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1390 = "tosa.reshape"(%1387) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1391 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1392 = "tosa.transpose"(%1390, %1391) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1393 = "tosa.reshape"(%1388) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1394 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1395 = "tosa.transpose"(%1393, %1394) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1396 = "tosa.reshape"(%1389) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1397 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1398 = "tosa.transpose"(%1396, %1397) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1399 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1400 = "tosa.transpose"(%1395, %1399) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1401 = "tosa.reshape"(%1392) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1402 = "tosa.reshape"(%1400) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1403 = "tosa.matmul"(%1401, %1402) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1404 = tensor.cast %1403 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1405 = "tosa.reshape"(%1404) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1406 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1407 = "tosa.reciprocal"(%1406) : (tensor<f32>) -> tensor<f32>
  %1408 = "tosa.mul"(%1405, %1407) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1409 = torch_c.from_builtin_tensor %1408 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1410 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1411 = "tosa.slice"(%1410) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1412 = "tosa.slice"(%1411) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1413 = "tosa.slice"(%1412) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1414 = torch_c.from_builtin_tensor %1413 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1415 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1416 = torch.aten.to.dtype %1415, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1417 = torch_c.to_builtin_tensor %1416 : !torch.vtensor<[],i1> -> tensor<i1>
  %1418 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1419 = torch.valsem.aten.copy %1416, %1414, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1420 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1421 = torch_c.from_builtin_tensor %1420 : tensor<f32> -> !torch.vtensor<[],f32>
  %1422 = torch.aten.where.self %1419, %1409, %1421 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1423 = torch_c.to_builtin_tensor %1422 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1424 = "tosa.reduce_max"(%1423) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1425 = "tosa.argmax"(%1423) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1426 = "tosa.reshape"(%1425) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1427 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1428 = "tosa.mul"(%1424, %1427) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1429 = "tosa.sub"(%1423, %1428) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1430 = "tosa.exp"(%1429) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1431 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1432 = "tosa.reduce_sum"(%1430) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1433 = "tosa.reciprocal"(%1432) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1434 = "tosa.mul"(%1430, %1433) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1435 = "tosa.reshape"(%1434) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1436 = "tosa.reshape"(%1398) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1437 = "tosa.matmul"(%1435, %1436) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1438 = tensor.cast %1437 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1439 = "tosa.reshape"(%1438) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1440 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1441 = "tosa.transpose"(%1439, %1440) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1442 = "tosa.cast"(%1441) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1443 = "tosa.reshape"(%1442) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1444 = "tosa.reshape"(%1443) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1445 = "tosa.reshape"(%1444) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1446 = "tosa.reshape"(%59) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1447 = "tosa.matmul"(%1445, %1446) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1448 = "tosa.reshape"(%1447) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1449 = tensor.cast %1448 : tensor<5x768xf32> to tensor<5x768xf32>
  %1450 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1451 = "tosa.mul"(%60, %1450) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1452 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1453 = "tosa.mul"(%1449, %1452) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1454 = "tosa.add"(%1451, %1453) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1455 = "tosa.reshape"(%1454) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1456 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1457 = "tosa.mul"(%1355, %1456) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1458 = "tosa.add"(%1455, %1457) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1459 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1460 = "tosa.reciprocal"(%1459) : (tensor<1xf32>) -> tensor<1xf32>
  %1461 = "tosa.reduce_sum"(%1458) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1462 = "tosa.reshape"(%1461) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1463 = "tosa.mul"(%1462, %1460) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1464 = "tosa.sub"(%1458, %1463) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1465 = "tosa.mul"(%1464, %1464) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1466 = "tosa.reduce_sum"(%1465) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1467 = "tosa.reshape"(%1466) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1468 = "tosa.mul"(%1467, %1460) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1469 = "tosa.reshape"(%58) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1470 = "tosa.reshape"(%57) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1471 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1472 = "tosa.sub"(%1458, %1463) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1473 = "tosa.add"(%1468, %1471) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1474 = "tosa.rsqrt"(%1473) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1475 = "tosa.mul"(%1472, %1474) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1476 = "tosa.mul"(%1475, %1469) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1477 = "tosa.add"(%1476, %1470) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1478 = "tosa.reshape"(%1477) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1479 = "tosa.reshape"(%1478) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1480 = "tosa.reshape"(%55) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1481 = "tosa.matmul"(%1479, %1480) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1482 = "tosa.reshape"(%1481) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1483 = tensor.cast %1482 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1484 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1485 = "tosa.mul"(%56, %1484) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1486 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1487 = "tosa.mul"(%1483, %1486) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1488 = "tosa.add"(%1485, %1487) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1489 = "tosa.reshape"(%1488) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1490 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1491 = "tosa.mul"(%1489, %1490) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1492 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1493 = "tosa.pow"(%1489, %1492) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1494 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1495 = "tosa.mul"(%1493, %1494) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1496 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1497 = "tosa.mul"(%1495, %1496) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1498 = "tosa.add"(%1489, %1497) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1499 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1500 = "tosa.mul"(%1498, %1499) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1501 = "tosa.tanh"(%1500) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1502 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1503 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1504 = "tosa.mul"(%1502, %1503) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1505 = "tosa.add"(%1501, %1504) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1506 = "tosa.mul"(%1491, %1505) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1507 = "tosa.reshape"(%1506) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1508 = "tosa.reshape"(%1507) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1509 = "tosa.reshape"(%53) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1510 = "tosa.matmul"(%1508, %1509) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1511 = "tosa.reshape"(%1510) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1512 = tensor.cast %1511 : tensor<5x768xf32> to tensor<5x768xf32>
  %1513 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1514 = "tosa.mul"(%54, %1513) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1515 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1516 = "tosa.mul"(%1512, %1515) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1517 = "tosa.add"(%1514, %1516) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1518 = "tosa.reshape"(%1517) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1519 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1520 = "tosa.mul"(%1518, %1519) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1521 = "tosa.add"(%1458, %1520) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1522 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1523 = "tosa.reciprocal"(%1522) : (tensor<1xf32>) -> tensor<1xf32>
  %1524 = "tosa.reduce_sum"(%1521) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1525 = "tosa.reshape"(%1524) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1526 = "tosa.mul"(%1525, %1523) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1527 = "tosa.sub"(%1521, %1526) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1528 = "tosa.mul"(%1527, %1527) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1529 = "tosa.reduce_sum"(%1528) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1530 = "tosa.reshape"(%1529) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1531 = "tosa.mul"(%1530, %1523) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1532 = "tosa.reshape"(%52) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1533 = "tosa.reshape"(%51) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1534 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1535 = "tosa.sub"(%1521, %1526) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1536 = "tosa.add"(%1531, %1534) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1537 = "tosa.rsqrt"(%1536) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1538 = "tosa.mul"(%1535, %1537) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1539 = "tosa.mul"(%1538, %1532) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1540 = "tosa.add"(%1539, %1533) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1541 = "tosa.reshape"(%1540) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1542 = "tosa.reshape"(%1541) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1543 = "tosa.reshape"(%49) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1544 = "tosa.matmul"(%1542, %1543) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1545 = "tosa.reshape"(%1544) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1546 = tensor.cast %1545 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1547 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1548 = "tosa.mul"(%50, %1547) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1549 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1550 = "tosa.mul"(%1546, %1549) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1551 = "tosa.add"(%1548, %1550) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1552 = "tosa.reshape"(%1551) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1553 = "tosa.slice"(%1552) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1554 = "tosa.slice"(%1552) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1555 = "tosa.slice"(%1552) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1556 = "tosa.reshape"(%1553) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1557 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1558 = "tosa.transpose"(%1556, %1557) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1559 = "tosa.reshape"(%1554) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1560 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1561 = "tosa.transpose"(%1559, %1560) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1562 = "tosa.reshape"(%1555) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1563 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1564 = "tosa.transpose"(%1562, %1563) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1565 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1566 = "tosa.transpose"(%1561, %1565) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1567 = "tosa.reshape"(%1558) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1568 = "tosa.reshape"(%1566) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1569 = "tosa.matmul"(%1567, %1568) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1570 = tensor.cast %1569 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1571 = "tosa.reshape"(%1570) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1572 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1573 = "tosa.reciprocal"(%1572) : (tensor<f32>) -> tensor<f32>
  %1574 = "tosa.mul"(%1571, %1573) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1575 = torch_c.from_builtin_tensor %1574 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1576 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1577 = "tosa.slice"(%1576) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1578 = "tosa.slice"(%1577) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1579 = "tosa.slice"(%1578) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1580 = torch_c.from_builtin_tensor %1579 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1581 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1582 = torch.aten.to.dtype %1581, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1583 = torch_c.to_builtin_tensor %1582 : !torch.vtensor<[],i1> -> tensor<i1>
  %1584 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1585 = torch.valsem.aten.copy %1582, %1580, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1586 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1587 = torch_c.from_builtin_tensor %1586 : tensor<f32> -> !torch.vtensor<[],f32>
  %1588 = torch.aten.where.self %1585, %1575, %1587 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1589 = torch_c.to_builtin_tensor %1588 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1590 = "tosa.reduce_max"(%1589) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1591 = "tosa.argmax"(%1589) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1592 = "tosa.reshape"(%1591) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1593 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1594 = "tosa.mul"(%1590, %1593) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1595 = "tosa.sub"(%1589, %1594) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1596 = "tosa.exp"(%1595) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1597 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1598 = "tosa.reduce_sum"(%1596) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1599 = "tosa.reciprocal"(%1598) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1600 = "tosa.mul"(%1596, %1599) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1601 = "tosa.reshape"(%1600) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1602 = "tosa.reshape"(%1564) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1603 = "tosa.matmul"(%1601, %1602) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1604 = tensor.cast %1603 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1605 = "tosa.reshape"(%1604) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1606 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1607 = "tosa.transpose"(%1605, %1606) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1608 = "tosa.cast"(%1607) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1609 = "tosa.reshape"(%1608) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1610 = "tosa.reshape"(%1609) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1611 = "tosa.reshape"(%1610) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1612 = "tosa.reshape"(%47) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1613 = "tosa.matmul"(%1611, %1612) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1614 = "tosa.reshape"(%1613) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1615 = tensor.cast %1614 : tensor<5x768xf32> to tensor<5x768xf32>
  %1616 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1617 = "tosa.mul"(%48, %1616) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1618 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1619 = "tosa.mul"(%1615, %1618) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1620 = "tosa.add"(%1617, %1619) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1621 = "tosa.reshape"(%1620) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1622 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1623 = "tosa.mul"(%1521, %1622) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1624 = "tosa.add"(%1621, %1623) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1625 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1626 = "tosa.reciprocal"(%1625) : (tensor<1xf32>) -> tensor<1xf32>
  %1627 = "tosa.reduce_sum"(%1624) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1628 = "tosa.reshape"(%1627) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1629 = "tosa.mul"(%1628, %1626) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1630 = "tosa.sub"(%1624, %1629) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1631 = "tosa.mul"(%1630, %1630) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1632 = "tosa.reduce_sum"(%1631) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1633 = "tosa.reshape"(%1632) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1634 = "tosa.mul"(%1633, %1626) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1635 = "tosa.reshape"(%46) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1636 = "tosa.reshape"(%45) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1637 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1638 = "tosa.sub"(%1624, %1629) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1639 = "tosa.add"(%1634, %1637) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1640 = "tosa.rsqrt"(%1639) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1641 = "tosa.mul"(%1638, %1640) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1642 = "tosa.mul"(%1641, %1635) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1643 = "tosa.add"(%1642, %1636) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1644 = "tosa.reshape"(%1643) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1645 = "tosa.reshape"(%1644) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1646 = "tosa.reshape"(%43) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1647 = "tosa.matmul"(%1645, %1646) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1648 = "tosa.reshape"(%1647) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1649 = tensor.cast %1648 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1650 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1651 = "tosa.mul"(%44, %1650) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1652 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1653 = "tosa.mul"(%1649, %1652) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1654 = "tosa.add"(%1651, %1653) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1655 = "tosa.reshape"(%1654) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1656 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1657 = "tosa.mul"(%1655, %1656) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1658 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1659 = "tosa.pow"(%1655, %1658) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1660 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1661 = "tosa.mul"(%1659, %1660) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1662 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1663 = "tosa.mul"(%1661, %1662) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1664 = "tosa.add"(%1655, %1663) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1665 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1666 = "tosa.mul"(%1664, %1665) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1667 = "tosa.tanh"(%1666) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1668 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1669 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1670 = "tosa.mul"(%1668, %1669) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1671 = "tosa.add"(%1667, %1670) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1672 = "tosa.mul"(%1657, %1671) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1673 = "tosa.reshape"(%1672) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1674 = "tosa.reshape"(%1673) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1675 = "tosa.reshape"(%41) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1676 = "tosa.matmul"(%1674, %1675) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1677 = "tosa.reshape"(%1676) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1678 = tensor.cast %1677 : tensor<5x768xf32> to tensor<5x768xf32>
  %1679 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1680 = "tosa.mul"(%42, %1679) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1681 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1682 = "tosa.mul"(%1678, %1681) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1683 = "tosa.add"(%1680, %1682) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1684 = "tosa.reshape"(%1683) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1685 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1686 = "tosa.mul"(%1684, %1685) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1687 = "tosa.add"(%1624, %1686) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1688 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1689 = "tosa.reciprocal"(%1688) : (tensor<1xf32>) -> tensor<1xf32>
  %1690 = "tosa.reduce_sum"(%1687) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1691 = "tosa.reshape"(%1690) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1692 = "tosa.mul"(%1691, %1689) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1693 = "tosa.sub"(%1687, %1692) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1694 = "tosa.mul"(%1693, %1693) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1695 = "tosa.reduce_sum"(%1694) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1696 = "tosa.reshape"(%1695) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1697 = "tosa.mul"(%1696, %1689) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1698 = "tosa.reshape"(%40) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1699 = "tosa.reshape"(%39) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1700 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1701 = "tosa.sub"(%1687, %1692) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1702 = "tosa.add"(%1697, %1700) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1703 = "tosa.rsqrt"(%1702) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1704 = "tosa.mul"(%1701, %1703) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1705 = "tosa.mul"(%1704, %1698) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1706 = "tosa.add"(%1705, %1699) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1707 = "tosa.reshape"(%1706) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1708 = "tosa.reshape"(%1707) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1709 = "tosa.reshape"(%37) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1710 = "tosa.matmul"(%1708, %1709) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1711 = "tosa.reshape"(%1710) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1712 = tensor.cast %1711 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1713 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1714 = "tosa.mul"(%38, %1713) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1715 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1716 = "tosa.mul"(%1712, %1715) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1717 = "tosa.add"(%1714, %1716) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1718 = "tosa.reshape"(%1717) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1719 = "tosa.slice"(%1718) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1720 = "tosa.slice"(%1718) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1721 = "tosa.slice"(%1718) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1722 = "tosa.reshape"(%1719) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1723 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1724 = "tosa.transpose"(%1722, %1723) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1725 = "tosa.reshape"(%1720) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1726 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1727 = "tosa.transpose"(%1725, %1726) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1728 = "tosa.reshape"(%1721) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1729 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1730 = "tosa.transpose"(%1728, %1729) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1731 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1732 = "tosa.transpose"(%1727, %1731) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1733 = "tosa.reshape"(%1724) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1734 = "tosa.reshape"(%1732) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1735 = "tosa.matmul"(%1733, %1734) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1736 = tensor.cast %1735 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1737 = "tosa.reshape"(%1736) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1738 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1739 = "tosa.reciprocal"(%1738) : (tensor<f32>) -> tensor<f32>
  %1740 = "tosa.mul"(%1737, %1739) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1741 = torch_c.from_builtin_tensor %1740 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1742 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1743 = "tosa.slice"(%1742) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1744 = "tosa.slice"(%1743) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1745 = "tosa.slice"(%1744) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1746 = torch_c.from_builtin_tensor %1745 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1747 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1748 = torch.aten.to.dtype %1747, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1749 = torch_c.to_builtin_tensor %1748 : !torch.vtensor<[],i1> -> tensor<i1>
  %1750 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1751 = torch.valsem.aten.copy %1748, %1746, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1752 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1753 = torch_c.from_builtin_tensor %1752 : tensor<f32> -> !torch.vtensor<[],f32>
  %1754 = torch.aten.where.self %1751, %1741, %1753 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1755 = torch_c.to_builtin_tensor %1754 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1756 = "tosa.reduce_max"(%1755) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1757 = "tosa.argmax"(%1755) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1758 = "tosa.reshape"(%1757) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1759 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1760 = "tosa.mul"(%1756, %1759) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1761 = "tosa.sub"(%1755, %1760) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1762 = "tosa.exp"(%1761) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1763 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1764 = "tosa.reduce_sum"(%1762) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1765 = "tosa.reciprocal"(%1764) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1766 = "tosa.mul"(%1762, %1765) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1767 = "tosa.reshape"(%1766) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1768 = "tosa.reshape"(%1730) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1769 = "tosa.matmul"(%1767, %1768) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1770 = tensor.cast %1769 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1771 = "tosa.reshape"(%1770) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1772 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1773 = "tosa.transpose"(%1771, %1772) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1774 = "tosa.cast"(%1773) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1775 = "tosa.reshape"(%1774) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1776 = "tosa.reshape"(%1775) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1777 = "tosa.reshape"(%1776) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1778 = "tosa.reshape"(%35) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1779 = "tosa.matmul"(%1777, %1778) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1780 = "tosa.reshape"(%1779) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1781 = tensor.cast %1780 : tensor<5x768xf32> to tensor<5x768xf32>
  %1782 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1783 = "tosa.mul"(%36, %1782) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1784 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1785 = "tosa.mul"(%1781, %1784) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1786 = "tosa.add"(%1783, %1785) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1787 = "tosa.reshape"(%1786) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1788 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1789 = "tosa.mul"(%1687, %1788) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1790 = "tosa.add"(%1787, %1789) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1791 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1792 = "tosa.reciprocal"(%1791) : (tensor<1xf32>) -> tensor<1xf32>
  %1793 = "tosa.reduce_sum"(%1790) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1794 = "tosa.reshape"(%1793) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1795 = "tosa.mul"(%1794, %1792) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1796 = "tosa.sub"(%1790, %1795) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1797 = "tosa.mul"(%1796, %1796) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1798 = "tosa.reduce_sum"(%1797) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1799 = "tosa.reshape"(%1798) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1800 = "tosa.mul"(%1799, %1792) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1801 = "tosa.reshape"(%34) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1802 = "tosa.reshape"(%33) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1803 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1804 = "tosa.sub"(%1790, %1795) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1805 = "tosa.add"(%1800, %1803) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1806 = "tosa.rsqrt"(%1805) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1807 = "tosa.mul"(%1804, %1806) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1808 = "tosa.mul"(%1807, %1801) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1809 = "tosa.add"(%1808, %1802) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1810 = "tosa.reshape"(%1809) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1811 = "tosa.reshape"(%1810) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1812 = "tosa.reshape"(%31) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1813 = "tosa.matmul"(%1811, %1812) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1814 = "tosa.reshape"(%1813) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1815 = tensor.cast %1814 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1816 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1817 = "tosa.mul"(%32, %1816) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1818 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1819 = "tosa.mul"(%1815, %1818) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1820 = "tosa.add"(%1817, %1819) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1821 = "tosa.reshape"(%1820) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1822 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1823 = "tosa.mul"(%1821, %1822) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1824 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1825 = "tosa.pow"(%1821, %1824) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1826 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1827 = "tosa.mul"(%1825, %1826) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1828 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1829 = "tosa.mul"(%1827, %1828) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1830 = "tosa.add"(%1821, %1829) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1831 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1832 = "tosa.mul"(%1830, %1831) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1833 = "tosa.tanh"(%1832) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1834 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1835 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1836 = "tosa.mul"(%1834, %1835) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %1837 = "tosa.add"(%1833, %1836) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1838 = "tosa.mul"(%1823, %1837) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1839 = "tosa.reshape"(%1838) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1840 = "tosa.reshape"(%1839) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1841 = "tosa.reshape"(%29) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1842 = "tosa.matmul"(%1840, %1841) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1843 = "tosa.reshape"(%1842) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1844 = tensor.cast %1843 : tensor<5x768xf32> to tensor<5x768xf32>
  %1845 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1846 = "tosa.mul"(%30, %1845) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1847 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1848 = "tosa.mul"(%1844, %1847) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1849 = "tosa.add"(%1846, %1848) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1850 = "tosa.reshape"(%1849) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1851 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1852 = "tosa.mul"(%1850, %1851) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1853 = "tosa.add"(%1790, %1852) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1854 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1855 = "tosa.reciprocal"(%1854) : (tensor<1xf32>) -> tensor<1xf32>
  %1856 = "tosa.reduce_sum"(%1853) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1857 = "tosa.reshape"(%1856) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1858 = "tosa.mul"(%1857, %1855) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1859 = "tosa.sub"(%1853, %1858) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1860 = "tosa.mul"(%1859, %1859) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1861 = "tosa.reduce_sum"(%1860) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1862 = "tosa.reshape"(%1861) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1863 = "tosa.mul"(%1862, %1855) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1864 = "tosa.reshape"(%28) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1865 = "tosa.reshape"(%27) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1866 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1867 = "tosa.sub"(%1853, %1858) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1868 = "tosa.add"(%1863, %1866) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1869 = "tosa.rsqrt"(%1868) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1870 = "tosa.mul"(%1867, %1869) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1871 = "tosa.mul"(%1870, %1864) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1872 = "tosa.add"(%1871, %1865) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1873 = "tosa.reshape"(%1872) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1874 = "tosa.reshape"(%1873) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1875 = "tosa.reshape"(%25) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1876 = "tosa.matmul"(%1874, %1875) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1877 = "tosa.reshape"(%1876) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1878 = tensor.cast %1877 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %1879 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1880 = "tosa.mul"(%26, %1879) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %1881 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1882 = "tosa.mul"(%1878, %1881) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %1883 = "tosa.add"(%1880, %1882) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1884 = "tosa.reshape"(%1883) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1885 = "tosa.slice"(%1884) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1886 = "tosa.slice"(%1884) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1887 = "tosa.slice"(%1884) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1888 = "tosa.reshape"(%1885) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1889 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1890 = "tosa.transpose"(%1888, %1889) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1891 = "tosa.reshape"(%1886) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1892 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1893 = "tosa.transpose"(%1891, %1892) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1894 = "tosa.reshape"(%1887) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1895 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1896 = "tosa.transpose"(%1894, %1895) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1897 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %1898 = "tosa.transpose"(%1893, %1897) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1899 = "tosa.reshape"(%1890) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1900 = "tosa.reshape"(%1898) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1901 = "tosa.matmul"(%1899, %1900) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1902 = tensor.cast %1901 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %1903 = "tosa.reshape"(%1902) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1904 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %1905 = "tosa.reciprocal"(%1904) : (tensor<f32>) -> tensor<f32>
  %1906 = "tosa.mul"(%1903, %1905) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %1907 = torch_c.from_builtin_tensor %1906 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1908 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1909 = "tosa.slice"(%1908) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %1910 = "tosa.slice"(%1909) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1911 = "tosa.slice"(%1910) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1912 = torch_c.from_builtin_tensor %1911 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1913 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1914 = torch.aten.to.dtype %1913, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1915 = torch_c.to_builtin_tensor %1914 : !torch.vtensor<[],i1> -> tensor<i1>
  %1916 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %1917 = torch.valsem.aten.copy %1914, %1912, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1918 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %1919 = torch_c.from_builtin_tensor %1918 : tensor<f32> -> !torch.vtensor<[],f32>
  %1920 = torch.aten.where.self %1917, %1907, %1919 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1921 = torch_c.to_builtin_tensor %1920 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1922 = "tosa.reduce_max"(%1921) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1923 = "tosa.argmax"(%1921) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %1924 = "tosa.reshape"(%1923) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %1925 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1926 = "tosa.mul"(%1922, %1925) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %1927 = "tosa.sub"(%1921, %1926) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1928 = "tosa.exp"(%1927) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1929 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %1930 = "tosa.reduce_sum"(%1928) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1931 = "tosa.reciprocal"(%1930) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1932 = "tosa.mul"(%1928, %1931) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1933 = "tosa.reshape"(%1932) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1934 = "tosa.reshape"(%1896) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1935 = "tosa.matmul"(%1933, %1934) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1936 = tensor.cast %1935 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %1937 = "tosa.reshape"(%1936) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1938 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %1939 = "tosa.transpose"(%1937, %1938) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1940 = "tosa.cast"(%1939) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %1941 = "tosa.reshape"(%1940) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1942 = "tosa.reshape"(%1941) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1943 = "tosa.reshape"(%1942) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1944 = "tosa.reshape"(%23) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1945 = "tosa.matmul"(%1943, %1944) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1946 = "tosa.reshape"(%1945) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1947 = tensor.cast %1946 : tensor<5x768xf32> to tensor<5x768xf32>
  %1948 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1949 = "tosa.mul"(%24, %1948) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %1950 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1951 = "tosa.mul"(%1947, %1950) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %1952 = "tosa.add"(%1949, %1951) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1953 = "tosa.reshape"(%1952) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1954 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1955 = "tosa.mul"(%1853, %1954) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %1956 = "tosa.add"(%1953, %1955) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1957 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %1958 = "tosa.reciprocal"(%1957) : (tensor<1xf32>) -> tensor<1xf32>
  %1959 = "tosa.reduce_sum"(%1956) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1960 = "tosa.reshape"(%1959) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1961 = "tosa.mul"(%1960, %1958) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1962 = "tosa.sub"(%1956, %1961) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1963 = "tosa.mul"(%1962, %1962) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1964 = "tosa.reduce_sum"(%1963) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1965 = "tosa.reshape"(%1964) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1966 = "tosa.mul"(%1965, %1958) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %1967 = "tosa.reshape"(%22) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1968 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1969 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %1970 = "tosa.sub"(%1956, %1961) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1971 = "tosa.add"(%1966, %1969) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %1972 = "tosa.rsqrt"(%1971) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1973 = "tosa.mul"(%1970, %1972) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1974 = "tosa.mul"(%1973, %1967) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1975 = "tosa.add"(%1974, %1968) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1976 = "tosa.reshape"(%1975) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1977 = "tosa.reshape"(%1976) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1978 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1979 = "tosa.matmul"(%1977, %1978) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1980 = "tosa.reshape"(%1979) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1981 = tensor.cast %1980 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %1982 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1983 = "tosa.mul"(%20, %1982) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %1984 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1985 = "tosa.mul"(%1981, %1984) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %1986 = "tosa.add"(%1983, %1985) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1987 = "tosa.reshape"(%1986) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1988 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %1989 = "tosa.mul"(%1987, %1988) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1990 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1991 = "tosa.pow"(%1987, %1990) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1992 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %1993 = "tosa.mul"(%1991, %1992) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1994 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %1995 = "tosa.mul"(%1993, %1994) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1996 = "tosa.add"(%1987, %1995) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1997 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %1998 = "tosa.mul"(%1996, %1997) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %1999 = "tosa.tanh"(%1998) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %2000 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2001 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2002 = "tosa.mul"(%2000, %2001) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %2003 = "tosa.add"(%1999, %2002) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2004 = "tosa.mul"(%1989, %2003) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %2005 = "tosa.reshape"(%2004) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %2006 = "tosa.reshape"(%2005) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %2007 = "tosa.reshape"(%17) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %2008 = "tosa.matmul"(%2006, %2007) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %2009 = "tosa.reshape"(%2008) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2010 = tensor.cast %2009 : tensor<5x768xf32> to tensor<5x768xf32>
  %2011 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2012 = "tosa.mul"(%18, %2011) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %2013 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2014 = "tosa.mul"(%2010, %2013) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %2015 = "tosa.add"(%2012, %2014) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %2016 = "tosa.reshape"(%2015) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2017 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2018 = "tosa.mul"(%2016, %2017) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %2019 = "tosa.add"(%1956, %2018) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2020 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %2021 = "tosa.reciprocal"(%2020) : (tensor<1xf32>) -> tensor<1xf32>
  %2022 = "tosa.reduce_sum"(%2019) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2023 = "tosa.reshape"(%2022) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2024 = "tosa.mul"(%2023, %2021) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2025 = "tosa.sub"(%2019, %2024) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2026 = "tosa.mul"(%2025, %2025) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2027 = "tosa.reduce_sum"(%2026) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2028 = "tosa.reshape"(%2027) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2029 = "tosa.mul"(%2028, %2021) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2030 = "tosa.reshape"(%16) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2031 = "tosa.reshape"(%15) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2032 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %2033 = "tosa.sub"(%2019, %2024) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2034 = "tosa.add"(%2029, %2032) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %2035 = "tosa.rsqrt"(%2034) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2036 = "tosa.mul"(%2033, %2035) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2037 = "tosa.mul"(%2036, %2030) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2038 = "tosa.add"(%2037, %2031) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2039 = "tosa.reshape"(%2038) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2040 = "tosa.reshape"(%2039) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2041 = "tosa.reshape"(%13) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %2042 = "tosa.matmul"(%2040, %2041) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %2043 = "tosa.reshape"(%2042) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %2044 = tensor.cast %2043 : tensor<5x2304xf32> to tensor<5x2304xf32>
  %2045 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2046 = "tosa.mul"(%14, %2045) {shift = 0 : i32} : (tensor<2304xf32>, tensor<f32>) -> tensor<2304xf32>
  %2047 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2048 = "tosa.mul"(%2044, %2047) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<f32>) -> tensor<5x2304xf32>
  %2049 = "tosa.add"(%2046, %2048) : (tensor<2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %2050 = "tosa.reshape"(%2049) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %2051 = "tosa.slice"(%2050) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %2052 = "tosa.slice"(%2050) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %2053 = "tosa.slice"(%2050) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %2054 = "tosa.reshape"(%2051) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %2055 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %2056 = "tosa.transpose"(%2054, %2055) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %2057 = "tosa.reshape"(%2052) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %2058 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %2059 = "tosa.transpose"(%2057, %2058) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %2060 = "tosa.reshape"(%2053) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %2061 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %2062 = "tosa.transpose"(%2060, %2061) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %2063 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %2064 = "tosa.transpose"(%2059, %2063) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %2065 = "tosa.reshape"(%2056) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %2066 = "tosa.reshape"(%2064) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %2067 = "tosa.matmul"(%2065, %2066) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %2068 = tensor.cast %2067 : tensor<12x5x5xf32> to tensor<12x5x5xf32>
  %2069 = "tosa.reshape"(%2068) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %2070 = "tosa.cast"(%147) : (tensor<f32>) -> tensor<f32>
  %2071 = "tosa.reciprocal"(%2070) : (tensor<f32>) -> tensor<f32>
  %2072 = "tosa.mul"(%2069, %2071) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<f32>) -> tensor<1x12x5x5xf32>
  %2073 = torch_c.from_builtin_tensor %2072 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %2074 = "tosa.slice"(%146) {size = [9223372036854775807, 1, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %2075 = "tosa.slice"(%2074) {size = [1, 9223372036854775807, 1024, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x1024x1024xi8>
  %2076 = "tosa.slice"(%2075) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %2077 = "tosa.slice"(%2076) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %2078 = torch_c.from_builtin_tensor %2077 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %2079 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %2080 = torch.aten.to.dtype %2079, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %2081 = torch_c.to_builtin_tensor %2080 : !torch.vtensor<[],i1> -> tensor<i1>
  %2082 = torch.prim.ListConstruct %int1, %int1, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %2083 = torch.valsem.aten.copy %2080, %2078, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %2084 = "tosa.cast"(%145) : (tensor<f32>) -> tensor<f32>
  %2085 = torch_c.from_builtin_tensor %2084 : tensor<f32> -> !torch.vtensor<[],f32>
  %2086 = torch.aten.where.self %2083, %2073, %2085 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %2087 = torch_c.to_builtin_tensor %2086 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %2088 = "tosa.reduce_max"(%2087) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %2089 = "tosa.argmax"(%2087) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5xi64>
  %2090 = "tosa.reshape"(%2089) {new_shape = [1, 12, 5, 1]} : (tensor<1x12x5xi64>) -> tensor<1x12x5x1xi64>
  %2091 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2092 = "tosa.mul"(%2088, %2091) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<f32>) -> tensor<1x12x5x1xf32>
  %2093 = "tosa.sub"(%2087, %2092) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %2094 = "tosa.exp"(%2093) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %2095 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
  %2096 = "tosa.reduce_sum"(%2094) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %2097 = "tosa.reciprocal"(%2096) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %2098 = "tosa.mul"(%2094, %2097) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %2099 = "tosa.reshape"(%2098) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %2100 = "tosa.reshape"(%2062) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %2101 = "tosa.matmul"(%2099, %2100) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %2102 = tensor.cast %2101 : tensor<12x5x64xf32> to tensor<12x5x64xf32>
  %2103 = "tosa.reshape"(%2102) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %2104 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %2105 = "tosa.transpose"(%2103, %2104) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %2106 = "tosa.cast"(%2105) : (tensor<1x5x12x64xf32>) -> tensor<1x5x12x64xf32>
  %2107 = "tosa.reshape"(%2106) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %2108 = "tosa.reshape"(%2107) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2109 = "tosa.reshape"(%2108) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2110 = "tosa.reshape"(%11) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %2111 = "tosa.matmul"(%2109, %2110) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %2112 = "tosa.reshape"(%2111) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2113 = tensor.cast %2112 : tensor<5x768xf32> to tensor<5x768xf32>
  %2114 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2115 = "tosa.mul"(%12, %2114) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %2116 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2117 = "tosa.mul"(%2113, %2116) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %2118 = "tosa.add"(%2115, %2117) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %2119 = "tosa.reshape"(%2118) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2120 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2121 = "tosa.mul"(%2019, %2120) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %2122 = "tosa.add"(%2119, %2121) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2123 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %2124 = "tosa.reciprocal"(%2123) : (tensor<1xf32>) -> tensor<1xf32>
  %2125 = "tosa.reduce_sum"(%2122) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2126 = "tosa.reshape"(%2125) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2127 = "tosa.mul"(%2126, %2124) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2128 = "tosa.sub"(%2122, %2127) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2129 = "tosa.mul"(%2128, %2128) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2130 = "tosa.reduce_sum"(%2129) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2131 = "tosa.reshape"(%2130) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2132 = "tosa.mul"(%2131, %2124) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2133 = "tosa.reshape"(%10) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2134 = "tosa.reshape"(%9) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2135 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %2136 = "tosa.sub"(%2122, %2127) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2137 = "tosa.add"(%2132, %2135) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %2138 = "tosa.rsqrt"(%2137) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2139 = "tosa.mul"(%2136, %2138) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2140 = "tosa.mul"(%2139, %2133) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2141 = "tosa.add"(%2140, %2134) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2142 = "tosa.reshape"(%2141) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2143 = "tosa.reshape"(%2142) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2144 = "tosa.reshape"(%7) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %2145 = "tosa.matmul"(%2143, %2144) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %2146 = "tosa.reshape"(%2145) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %2147 = tensor.cast %2146 : tensor<5x3072xf32> to tensor<5x3072xf32>
  %2148 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2149 = "tosa.mul"(%8, %2148) {shift = 0 : i32} : (tensor<3072xf32>, tensor<f32>) -> tensor<3072xf32>
  %2150 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2151 = "tosa.mul"(%2147, %2150) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<f32>) -> tensor<5x3072xf32>
  %2152 = "tosa.add"(%2149, %2151) : (tensor<3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %2153 = "tosa.reshape"(%2152) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %2154 = "tosa.const"() {value = dense<5.000000e-01> : tensor<f32>} : () -> tensor<f32>
  %2155 = "tosa.mul"(%2153, %2154) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2156 = "tosa.const"() {value = dense<3.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2157 = "tosa.pow"(%2153, %2156) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2158 = "tosa.const"() {value = dense<4.471500e-02> : tensor<f32>} : () -> tensor<f32>
  %2159 = "tosa.mul"(%2157, %2158) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2160 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2161 = "tosa.mul"(%2159, %2160) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2162 = "tosa.add"(%2153, %2161) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %2163 = "tosa.const"() {value = dense<0.797884583> : tensor<f32>} : () -> tensor<f32>
  %2164 = "tosa.mul"(%2162, %2163) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2165 = "tosa.tanh"(%2164) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %2166 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2167 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2168 = "tosa.mul"(%2166, %2167) {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
  %2169 = "tosa.add"(%2165, %2168) : (tensor<1x5x3072xf32>, tensor<f32>) -> tensor<1x5x3072xf32>
  %2170 = "tosa.mul"(%2155, %2169) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %2171 = "tosa.reshape"(%2170) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %2172 = "tosa.reshape"(%2171) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %2173 = "tosa.reshape"(%5) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %2174 = "tosa.matmul"(%2172, %2173) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %2175 = "tosa.reshape"(%2174) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2176 = tensor.cast %2175 : tensor<5x768xf32> to tensor<5x768xf32>
  %2177 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2178 = "tosa.mul"(%6, %2177) {shift = 0 : i32} : (tensor<768xf32>, tensor<f32>) -> tensor<768xf32>
  %2179 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2180 = "tosa.mul"(%2176, %2179) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<f32>) -> tensor<5x768xf32>
  %2181 = "tosa.add"(%2178, %2180) : (tensor<768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %2182 = "tosa.reshape"(%2181) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2183 = "tosa.const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %2184 = "tosa.mul"(%2182, %2183) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<f32>) -> tensor<1x5x768xf32>
  %2185 = "tosa.add"(%2122, %2184) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2186 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %2187 = "tosa.reciprocal"(%2186) : (tensor<1xf32>) -> tensor<1xf32>
  %2188 = "tosa.reduce_sum"(%2185) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2189 = "tosa.reshape"(%2188) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2190 = "tosa.mul"(%2189, %2187) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2191 = "tosa.sub"(%2185, %2190) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2192 = "tosa.mul"(%2191, %2191) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2193 = "tosa.reduce_sum"(%2192) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %2194 = "tosa.reshape"(%2193) {new_shape = [1, 5, 1]} : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2195 = "tosa.mul"(%2194, %2187) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1xf32>) -> tensor<1x5x1xf32>
  %2196 = "tosa.reshape"(%4) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2197 = "tosa.reshape"(%3) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %2198 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<f32>} : () -> tensor<f32>
  %2199 = "tosa.sub"(%2185, %2190) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2200 = "tosa.add"(%2195, %2198) : (tensor<1x5x1xf32>, tensor<f32>) -> tensor<1x5x1xf32>
  %2201 = "tosa.rsqrt"(%2200) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %2202 = "tosa.mul"(%2199, %2201) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %2203 = "tosa.mul"(%2202, %2196) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2204 = "tosa.add"(%2203, %2197) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %2205 = "tosa.reshape"(%2204) {new_shape = [1, 5, 768]} : (tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %2206 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %2207 = "tosa.transpose"(%153, %2206) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
  %2208 = torch.prim.ListConstruct %int5, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
  %2209 = "tosa.reshape"(%2205) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %2210 = "tosa.reshape"(%2209) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %2211 = "tosa.reshape"(%2207) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
  %2212 = "tosa.matmul"(%2210, %2211) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
  %2213 = "tosa.reshape"(%2212) {new_shape = [5, 50257]} : (tensor<1x5x50257xf32>) -> tensor<5x50257xf32>
  %2214 = tensor.cast %2213 : tensor<5x50257xf32> to tensor<5x50257xf32>
  %2215 = torch.prim.ListConstruct %int1, %int5, %int50257 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
  %2216 = "tosa.reshape"(%2214) {new_shape = [1, 5, 50257]} : (tensor<5x50257xf32>) -> tensor<1x5x50257xf32>
  %2217 = torch_c.from_builtin_tensor %2216 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
  return %2217 : !torch.vtensor<[1,5,50257],f32>
 }

 // -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
 func.func @forward(%arg0: !torch.vtensor<[1,5],si64>) -> !torch.vtensor<[1,5,50257],f32> {
  %0 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %1 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %2 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32>
  %3 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
  %4 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1x1xf32>} : () -> tensor<1x1x1x1xf32>
  %5 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %6 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %7 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %8 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %9 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %10 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %11 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %12 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %int0 = torch.constant.int 0
  %int4 = torch.constant.int 4
  %int11 = torch.constant.int 11
  %none = torch.constant.none
  %false = torch.constant.bool false
  %13 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %15 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %17 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %19 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %21 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %22 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %23 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %24 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %int1 = torch.constant.int 1
  %int5 = torch.constant.int 5
  %25 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %cpu = torch.constant.device "cpu"
  %26 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %27 = torch_c.to_builtin_tensor %26 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %28 = "tosa.reshape"(%27) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %29 = "tosa.reshape"(%13) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %30 = "tosa.cast"(%25) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %31 = "tosa.gather"(%29, %30) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %32 = "tosa.reshape"(%14) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %33 = "tosa.cast"(%28) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %34 = "tosa.gather"(%32, %33) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %35 = "tosa.mul"(%34, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %36 = "tosa.add"(%31, %35) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %37 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %38 = "tosa.reduce_sum"(%36) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %39 = "tosa.reshape"(%37) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %40 = "tosa.mul"(%38, %39) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %41 = "tosa.sub"(%36, %40) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %42 = "tosa.mul"(%41, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %43 = "tosa.reduce_sum"(%42) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %44 = "tosa.reshape"(%37) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %45 = "tosa.mul"(%43, %44) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %46 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %47 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %48 = "tosa.sub"(%36, %40) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %49 = "tosa.add"(%45, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %50 = "tosa.rsqrt"(%49) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %51 = "tosa.mul"(%48, %50) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %52 = "tosa.mul"(%51, %46) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %53 = "tosa.add"(%52, %47) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %54 = "tosa.reshape"(%53) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %55 = "tosa.reshape"(%54) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %56 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %57 = "tosa.matmul"(%55, %56) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %58 = "tosa.reshape"(%57) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %59 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %60 = "tosa.mul"(%58, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %61 = "tosa.reshape"(%59) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %62 = "tosa.add"(%61, %60) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %63 = "tosa.reshape"(%62) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %64 = "tosa.slice"(%63) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %65 = "tosa.slice"(%63) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %66 = "tosa.slice"(%63) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %67 = "tosa.reshape"(%64) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %68 = "tosa.transpose"(%67, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %69 = "tosa.reshape"(%65) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %70 = "tosa.transpose"(%69, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %71 = "tosa.reshape"(%66) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %72 = "tosa.transpose"(%71, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %73 = "tosa.transpose"(%70, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %74 = "tosa.reshape"(%68) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %75 = "tosa.reshape"(%73) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %76 = "tosa.matmul"(%74, %75) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %77 = "tosa.reshape"(%76) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %78 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %79 = "tosa.reshape"(%78) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %80 = "tosa.mul"(%77, %79) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %81 = torch_c.from_builtin_tensor %80 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %82 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %83 = "tosa.slice"(%82) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %84 = torch_c.from_builtin_tensor %83 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %85 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %86 = torch.aten.to.dtype %85, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %87 = torch.valsem.aten.copy %86, %84, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %88 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %89 = torch.aten.where.self %87, %81, %88 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %90 = torch_c.to_builtin_tensor %89 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %91 = "tosa.reduce_max"(%90) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %92 = "tosa.mul"(%91, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %93 = "tosa.sub"(%90, %92) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %94 = "tosa.exp"(%93) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %95 = "tosa.reduce_sum"(%94) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %96 = "tosa.reciprocal"(%95) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %97 = "tosa.mul"(%94, %96) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %98 = "tosa.reshape"(%97) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %99 = "tosa.reshape"(%72) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %100 = "tosa.matmul"(%98, %99) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %101 = "tosa.reshape"(%100) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %102 = "tosa.transpose"(%101, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %103 = "tosa.reshape"(%102) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %104 = "tosa.reshape"(%103) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %105 = "tosa.reshape"(%104) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %106 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %107 = "tosa.matmul"(%105, %106) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %108 = "tosa.reshape"(%107) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %109 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %110 = "tosa.mul"(%108, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %111 = "tosa.reshape"(%109) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %112 = "tosa.add"(%111, %110) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %113 = "tosa.reshape"(%112) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %114 = "tosa.mul"(%36, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %115 = "tosa.add"(%113, %114) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %116 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %117 = "tosa.reduce_sum"(%115) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %118 = "tosa.reshape"(%116) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %119 = "tosa.mul"(%117, %118) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %120 = "tosa.sub"(%115, %119) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %121 = "tosa.mul"(%120, %120) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %122 = "tosa.reduce_sum"(%121) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %123 = "tosa.reshape"(%116) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %124 = "tosa.mul"(%122, %123) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %125 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %126 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %127 = "tosa.sub"(%115, %119) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %128 = "tosa.add"(%124, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %129 = "tosa.rsqrt"(%128) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %130 = "tosa.mul"(%127, %129) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %131 = "tosa.mul"(%130, %125) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %132 = "tosa.add"(%131, %126) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %133 = "tosa.reshape"(%132) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %134 = "tosa.reshape"(%133) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %135 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %136 = "tosa.matmul"(%134, %135) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %137 = "tosa.reshape"(%136) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %138 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %139 = "tosa.mul"(%137, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %140 = "tosa.reshape"(%138) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %141 = "tosa.add"(%140, %139) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %142 = "tosa.reshape"(%141) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %143 = "tosa.mul"(%142, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %144 = "tosa.pow"(%142, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %145 = "tosa.mul"(%144, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %146 = "tosa.mul"(%145, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %147 = "tosa.add"(%142, %146) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %148 = "tosa.mul"(%147, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %149 = "tosa.tanh"(%148) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %150 = "tosa.add"(%149, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %151 = "tosa.mul"(%143, %150) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %152 = "tosa.reshape"(%151) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %153 = "tosa.reshape"(%152) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %154 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %155 = "tosa.matmul"(%153, %154) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %156 = "tosa.reshape"(%155) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %157 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %158 = "tosa.mul"(%156, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %159 = "tosa.reshape"(%157) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %160 = "tosa.add"(%159, %158) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %161 = "tosa.reshape"(%160) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %162 = "tosa.mul"(%161, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %163 = "tosa.add"(%115, %162) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %164 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %165 = "tosa.reduce_sum"(%163) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %166 = "tosa.reshape"(%164) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %167 = "tosa.mul"(%165, %166) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %168 = "tosa.sub"(%163, %167) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %169 = "tosa.mul"(%168, %168) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %170 = "tosa.reduce_sum"(%169) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %171 = "tosa.reshape"(%164) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %172 = "tosa.mul"(%170, %171) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %173 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %174 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %175 = "tosa.sub"(%163, %167) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %176 = "tosa.add"(%172, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %177 = "tosa.rsqrt"(%176) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %178 = "tosa.mul"(%175, %177) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %179 = "tosa.mul"(%178, %173) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %180 = "tosa.add"(%179, %174) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %181 = "tosa.reshape"(%180) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %182 = "tosa.reshape"(%181) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %183 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %184 = "tosa.matmul"(%182, %183) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %185 = "tosa.reshape"(%184) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %186 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %187 = "tosa.mul"(%185, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %188 = "tosa.reshape"(%186) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %189 = "tosa.add"(%188, %187) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %190 = "tosa.reshape"(%189) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %191 = "tosa.slice"(%190) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %192 = "tosa.slice"(%190) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %193 = "tosa.slice"(%190) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %194 = "tosa.reshape"(%191) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %195 = "tosa.transpose"(%194, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %196 = "tosa.reshape"(%192) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %197 = "tosa.transpose"(%196, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %198 = "tosa.reshape"(%193) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %199 = "tosa.transpose"(%198, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %200 = "tosa.transpose"(%197, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %201 = "tosa.reshape"(%195) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %202 = "tosa.reshape"(%200) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %203 = "tosa.matmul"(%201, %202) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %204 = "tosa.reshape"(%203) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %205 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %206 = "tosa.reshape"(%205) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %207 = "tosa.mul"(%204, %206) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %208 = torch_c.from_builtin_tensor %207 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %209 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %210 = "tosa.slice"(%209) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %211 = torch_c.from_builtin_tensor %210 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %212 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %213 = torch.aten.to.dtype %212, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %214 = torch.valsem.aten.copy %213, %211, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %215 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %216 = torch.aten.where.self %214, %208, %215 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %217 = torch_c.to_builtin_tensor %216 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %218 = "tosa.reduce_max"(%217) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %219 = "tosa.mul"(%218, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %220 = "tosa.sub"(%217, %219) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %221 = "tosa.exp"(%220) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %222 = "tosa.reduce_sum"(%221) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %223 = "tosa.reciprocal"(%222) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %224 = "tosa.mul"(%221, %223) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %225 = "tosa.reshape"(%224) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %226 = "tosa.reshape"(%199) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %227 = "tosa.matmul"(%225, %226) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %228 = "tosa.reshape"(%227) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %229 = "tosa.transpose"(%228, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %230 = "tosa.reshape"(%229) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %231 = "tosa.reshape"(%230) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %232 = "tosa.reshape"(%231) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %233 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %234 = "tosa.matmul"(%232, %233) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %235 = "tosa.reshape"(%234) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %236 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %237 = "tosa.mul"(%235, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %238 = "tosa.reshape"(%236) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %239 = "tosa.add"(%238, %237) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %240 = "tosa.reshape"(%239) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %241 = "tosa.mul"(%163, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %242 = "tosa.add"(%240, %241) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %243 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %244 = "tosa.reduce_sum"(%242) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %245 = "tosa.reshape"(%243) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %246 = "tosa.mul"(%244, %245) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %247 = "tosa.sub"(%242, %246) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %248 = "tosa.mul"(%247, %247) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %249 = "tosa.reduce_sum"(%248) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %250 = "tosa.reshape"(%243) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %251 = "tosa.mul"(%249, %250) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %252 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %253 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %254 = "tosa.sub"(%242, %246) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %255 = "tosa.add"(%251, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %256 = "tosa.rsqrt"(%255) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %257 = "tosa.mul"(%254, %256) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %258 = "tosa.mul"(%257, %252) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %259 = "tosa.add"(%258, %253) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %260 = "tosa.reshape"(%259) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %261 = "tosa.reshape"(%260) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %262 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %263 = "tosa.matmul"(%261, %262) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %264 = "tosa.reshape"(%263) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %265 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %266 = "tosa.mul"(%264, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %267 = "tosa.reshape"(%265) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %268 = "tosa.add"(%267, %266) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %269 = "tosa.reshape"(%268) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %270 = "tosa.mul"(%269, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %271 = "tosa.pow"(%269, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %272 = "tosa.mul"(%271, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %273 = "tosa.mul"(%272, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %274 = "tosa.add"(%269, %273) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %275 = "tosa.mul"(%274, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %276 = "tosa.tanh"(%275) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %277 = "tosa.add"(%276, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %278 = "tosa.mul"(%270, %277) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %279 = "tosa.reshape"(%278) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %280 = "tosa.reshape"(%279) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %281 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %282 = "tosa.matmul"(%280, %281) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %283 = "tosa.reshape"(%282) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %284 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %285 = "tosa.mul"(%283, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %286 = "tosa.reshape"(%284) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %287 = "tosa.add"(%286, %285) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %288 = "tosa.reshape"(%287) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %289 = "tosa.mul"(%288, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %290 = "tosa.add"(%242, %289) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %291 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %292 = "tosa.reduce_sum"(%290) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %293 = "tosa.reshape"(%291) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %294 = "tosa.mul"(%292, %293) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %295 = "tosa.sub"(%290, %294) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %296 = "tosa.mul"(%295, %295) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %297 = "tosa.reduce_sum"(%296) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %298 = "tosa.reshape"(%291) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %299 = "tosa.mul"(%297, %298) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %300 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %301 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %302 = "tosa.sub"(%290, %294) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %303 = "tosa.add"(%299, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %304 = "tosa.rsqrt"(%303) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %305 = "tosa.mul"(%302, %304) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %306 = "tosa.mul"(%305, %300) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %307 = "tosa.add"(%306, %301) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %308 = "tosa.reshape"(%307) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %309 = "tosa.reshape"(%308) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %310 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %311 = "tosa.matmul"(%309, %310) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %312 = "tosa.reshape"(%311) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %313 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %314 = "tosa.mul"(%312, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %315 = "tosa.reshape"(%313) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %316 = "tosa.add"(%315, %314) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %317 = "tosa.reshape"(%316) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %318 = "tosa.slice"(%317) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %319 = "tosa.slice"(%317) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %320 = "tosa.slice"(%317) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %321 = "tosa.reshape"(%318) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %322 = "tosa.transpose"(%321, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %323 = "tosa.reshape"(%319) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %324 = "tosa.transpose"(%323, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %325 = "tosa.reshape"(%320) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %326 = "tosa.transpose"(%325, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %327 = "tosa.transpose"(%324, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %328 = "tosa.reshape"(%322) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %329 = "tosa.reshape"(%327) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %330 = "tosa.matmul"(%328, %329) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %331 = "tosa.reshape"(%330) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %332 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %333 = "tosa.reshape"(%332) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %334 = "tosa.mul"(%331, %333) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %335 = torch_c.from_builtin_tensor %334 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %336 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %337 = "tosa.slice"(%336) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %338 = torch_c.from_builtin_tensor %337 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %339 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %340 = torch.aten.to.dtype %339, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %341 = torch.valsem.aten.copy %340, %338, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %342 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %343 = torch.aten.where.self %341, %335, %342 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %344 = torch_c.to_builtin_tensor %343 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %345 = "tosa.reduce_max"(%344) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %346 = "tosa.mul"(%345, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %347 = "tosa.sub"(%344, %346) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %348 = "tosa.exp"(%347) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %349 = "tosa.reduce_sum"(%348) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %350 = "tosa.reciprocal"(%349) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %351 = "tosa.mul"(%348, %350) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %352 = "tosa.reshape"(%351) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %353 = "tosa.reshape"(%326) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %354 = "tosa.matmul"(%352, %353) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %355 = "tosa.reshape"(%354) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %356 = "tosa.transpose"(%355, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %357 = "tosa.reshape"(%356) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %358 = "tosa.reshape"(%357) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %359 = "tosa.reshape"(%358) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %360 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %361 = "tosa.matmul"(%359, %360) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %362 = "tosa.reshape"(%361) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %363 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %364 = "tosa.mul"(%362, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %365 = "tosa.reshape"(%363) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %366 = "tosa.add"(%365, %364) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %367 = "tosa.reshape"(%366) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %368 = "tosa.mul"(%290, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %369 = "tosa.add"(%367, %368) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %370 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %371 = "tosa.reduce_sum"(%369) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %372 = "tosa.reshape"(%370) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %373 = "tosa.mul"(%371, %372) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %374 = "tosa.sub"(%369, %373) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %375 = "tosa.mul"(%374, %374) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %376 = "tosa.reduce_sum"(%375) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %377 = "tosa.reshape"(%370) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %378 = "tosa.mul"(%376, %377) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %379 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %380 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %381 = "tosa.sub"(%369, %373) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %382 = "tosa.add"(%378, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %383 = "tosa.rsqrt"(%382) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %384 = "tosa.mul"(%381, %383) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %385 = "tosa.mul"(%384, %379) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %386 = "tosa.add"(%385, %380) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %387 = "tosa.reshape"(%386) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %388 = "tosa.reshape"(%387) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %389 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %390 = "tosa.matmul"(%388, %389) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %391 = "tosa.reshape"(%390) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %392 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %393 = "tosa.mul"(%391, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %394 = "tosa.reshape"(%392) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %395 = "tosa.add"(%394, %393) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %396 = "tosa.reshape"(%395) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %397 = "tosa.mul"(%396, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %398 = "tosa.pow"(%396, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %399 = "tosa.mul"(%398, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %400 = "tosa.mul"(%399, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %401 = "tosa.add"(%396, %400) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %402 = "tosa.mul"(%401, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %403 = "tosa.tanh"(%402) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %404 = "tosa.add"(%403, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %405 = "tosa.mul"(%397, %404) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %406 = "tosa.reshape"(%405) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %407 = "tosa.reshape"(%406) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %408 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %409 = "tosa.matmul"(%407, %408) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %410 = "tosa.reshape"(%409) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %411 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %412 = "tosa.mul"(%410, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %413 = "tosa.reshape"(%411) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %414 = "tosa.add"(%413, %412) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %415 = "tosa.reshape"(%414) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %416 = "tosa.mul"(%415, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %417 = "tosa.add"(%369, %416) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %418 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %419 = "tosa.reduce_sum"(%417) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %420 = "tosa.reshape"(%418) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %421 = "tosa.mul"(%419, %420) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %422 = "tosa.sub"(%417, %421) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %423 = "tosa.mul"(%422, %422) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %424 = "tosa.reduce_sum"(%423) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %425 = "tosa.reshape"(%418) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %426 = "tosa.mul"(%424, %425) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %427 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %428 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %429 = "tosa.sub"(%417, %421) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %430 = "tosa.add"(%426, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %431 = "tosa.rsqrt"(%430) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %432 = "tosa.mul"(%429, %431) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %433 = "tosa.mul"(%432, %427) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %434 = "tosa.add"(%433, %428) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %435 = "tosa.reshape"(%434) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %436 = "tosa.reshape"(%435) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %437 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %438 = "tosa.matmul"(%436, %437) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %439 = "tosa.reshape"(%438) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %440 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %441 = "tosa.mul"(%439, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %442 = "tosa.reshape"(%440) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %443 = "tosa.add"(%442, %441) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %444 = "tosa.reshape"(%443) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %445 = "tosa.slice"(%444) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %446 = "tosa.slice"(%444) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %447 = "tosa.slice"(%444) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %448 = "tosa.reshape"(%445) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %449 = "tosa.transpose"(%448, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %450 = "tosa.reshape"(%446) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %451 = "tosa.transpose"(%450, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %452 = "tosa.reshape"(%447) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %453 = "tosa.transpose"(%452, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %454 = "tosa.transpose"(%451, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %455 = "tosa.reshape"(%449) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %456 = "tosa.reshape"(%454) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %457 = "tosa.matmul"(%455, %456) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %458 = "tosa.reshape"(%457) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %459 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %460 = "tosa.reshape"(%459) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %461 = "tosa.mul"(%458, %460) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %462 = torch_c.from_builtin_tensor %461 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %463 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %464 = "tosa.slice"(%463) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %465 = torch_c.from_builtin_tensor %464 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %466 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %467 = torch.aten.to.dtype %466, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %468 = torch.valsem.aten.copy %467, %465, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %469 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %470 = torch.aten.where.self %468, %462, %469 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %471 = torch_c.to_builtin_tensor %470 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %472 = "tosa.reduce_max"(%471) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %473 = "tosa.mul"(%472, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %474 = "tosa.sub"(%471, %473) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %475 = "tosa.exp"(%474) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %476 = "tosa.reduce_sum"(%475) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %477 = "tosa.reciprocal"(%476) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %478 = "tosa.mul"(%475, %477) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %479 = "tosa.reshape"(%478) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %480 = "tosa.reshape"(%453) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %481 = "tosa.matmul"(%479, %480) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %482 = "tosa.reshape"(%481) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %483 = "tosa.transpose"(%482, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %484 = "tosa.reshape"(%483) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %485 = "tosa.reshape"(%484) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %486 = "tosa.reshape"(%485) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %487 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %488 = "tosa.matmul"(%486, %487) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %489 = "tosa.reshape"(%488) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %490 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %491 = "tosa.mul"(%489, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %492 = "tosa.reshape"(%490) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %493 = "tosa.add"(%492, %491) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %494 = "tosa.reshape"(%493) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %495 = "tosa.mul"(%417, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %496 = "tosa.add"(%494, %495) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %497 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %498 = "tosa.reduce_sum"(%496) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %499 = "tosa.reshape"(%497) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %500 = "tosa.mul"(%498, %499) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %501 = "tosa.sub"(%496, %500) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %502 = "tosa.mul"(%501, %501) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %503 = "tosa.reduce_sum"(%502) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %504 = "tosa.reshape"(%497) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %505 = "tosa.mul"(%503, %504) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %506 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %507 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %508 = "tosa.sub"(%496, %500) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %509 = "tosa.add"(%505, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %510 = "tosa.rsqrt"(%509) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %511 = "tosa.mul"(%508, %510) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %512 = "tosa.mul"(%511, %506) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %513 = "tosa.add"(%512, %507) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %514 = "tosa.reshape"(%513) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %515 = "tosa.reshape"(%514) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %516 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %517 = "tosa.matmul"(%515, %516) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %518 = "tosa.reshape"(%517) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %519 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %520 = "tosa.mul"(%518, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %521 = "tosa.reshape"(%519) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %522 = "tosa.add"(%521, %520) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %523 = "tosa.reshape"(%522) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %524 = "tosa.mul"(%523, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %525 = "tosa.pow"(%523, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %526 = "tosa.mul"(%525, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %527 = "tosa.mul"(%526, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %528 = "tosa.add"(%523, %527) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %529 = "tosa.mul"(%528, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %530 = "tosa.tanh"(%529) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %531 = "tosa.add"(%530, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %532 = "tosa.mul"(%524, %531) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %533 = "tosa.reshape"(%532) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %534 = "tosa.reshape"(%533) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %535 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %536 = "tosa.matmul"(%534, %535) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %537 = "tosa.reshape"(%536) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %538 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %539 = "tosa.mul"(%537, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %540 = "tosa.reshape"(%538) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %541 = "tosa.add"(%540, %539) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %542 = "tosa.reshape"(%541) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %543 = "tosa.mul"(%542, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %544 = "tosa.add"(%496, %543) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %546 = "tosa.reduce_sum"(%544) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %547 = "tosa.reshape"(%545) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %548 = "tosa.mul"(%546, %547) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %549 = "tosa.sub"(%544, %548) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %550 = "tosa.mul"(%549, %549) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %551 = "tosa.reduce_sum"(%550) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %552 = "tosa.reshape"(%545) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %553 = "tosa.mul"(%551, %552) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %554 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %555 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %556 = "tosa.sub"(%544, %548) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %557 = "tosa.add"(%553, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %558 = "tosa.rsqrt"(%557) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %559 = "tosa.mul"(%556, %558) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %560 = "tosa.mul"(%559, %554) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %561 = "tosa.add"(%560, %555) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %562 = "tosa.reshape"(%561) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %563 = "tosa.reshape"(%562) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %564 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %565 = "tosa.matmul"(%563, %564) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %566 = "tosa.reshape"(%565) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %567 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %568 = "tosa.mul"(%566, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %569 = "tosa.reshape"(%567) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %570 = "tosa.add"(%569, %568) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %571 = "tosa.reshape"(%570) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %572 = "tosa.slice"(%571) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %573 = "tosa.slice"(%571) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %574 = "tosa.slice"(%571) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %575 = "tosa.reshape"(%572) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %576 = "tosa.transpose"(%575, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %577 = "tosa.reshape"(%573) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %578 = "tosa.transpose"(%577, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %579 = "tosa.reshape"(%574) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %580 = "tosa.transpose"(%579, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %581 = "tosa.transpose"(%578, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %582 = "tosa.reshape"(%576) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %583 = "tosa.reshape"(%581) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %584 = "tosa.matmul"(%582, %583) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %585 = "tosa.reshape"(%584) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %586 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %587 = "tosa.reshape"(%586) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %588 = "tosa.mul"(%585, %587) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %589 = torch_c.from_builtin_tensor %588 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %590 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %591 = "tosa.slice"(%590) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %592 = torch_c.from_builtin_tensor %591 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %593 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %594 = torch.aten.to.dtype %593, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %595 = torch.valsem.aten.copy %594, %592, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %596 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %597 = torch.aten.where.self %595, %589, %596 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %598 = torch_c.to_builtin_tensor %597 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %599 = "tosa.reduce_max"(%598) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %600 = "tosa.mul"(%599, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %601 = "tosa.sub"(%598, %600) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %602 = "tosa.exp"(%601) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %603 = "tosa.reduce_sum"(%602) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %604 = "tosa.reciprocal"(%603) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %605 = "tosa.mul"(%602, %604) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %606 = "tosa.reshape"(%605) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %607 = "tosa.reshape"(%580) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %608 = "tosa.matmul"(%606, %607) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %609 = "tosa.reshape"(%608) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %610 = "tosa.transpose"(%609, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %611 = "tosa.reshape"(%610) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %612 = "tosa.reshape"(%611) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %613 = "tosa.reshape"(%612) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %614 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %615 = "tosa.matmul"(%613, %614) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %616 = "tosa.reshape"(%615) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %617 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %618 = "tosa.mul"(%616, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %619 = "tosa.reshape"(%617) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %620 = "tosa.add"(%619, %618) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %621 = "tosa.reshape"(%620) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %622 = "tosa.mul"(%544, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %623 = "tosa.add"(%621, %622) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %624 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %625 = "tosa.reduce_sum"(%623) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %626 = "tosa.reshape"(%624) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %627 = "tosa.mul"(%625, %626) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %628 = "tosa.sub"(%623, %627) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %629 = "tosa.mul"(%628, %628) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %630 = "tosa.reduce_sum"(%629) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %631 = "tosa.reshape"(%624) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %632 = "tosa.mul"(%630, %631) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %633 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %634 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %635 = "tosa.sub"(%623, %627) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %636 = "tosa.add"(%632, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %637 = "tosa.rsqrt"(%636) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %638 = "tosa.mul"(%635, %637) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %639 = "tosa.mul"(%638, %633) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %640 = "tosa.add"(%639, %634) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %641 = "tosa.reshape"(%640) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %642 = "tosa.reshape"(%641) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %643 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %644 = "tosa.matmul"(%642, %643) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %645 = "tosa.reshape"(%644) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %646 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %647 = "tosa.mul"(%645, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %648 = "tosa.reshape"(%646) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %649 = "tosa.add"(%648, %647) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %650 = "tosa.reshape"(%649) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %651 = "tosa.mul"(%650, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %652 = "tosa.pow"(%650, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %653 = "tosa.mul"(%652, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %654 = "tosa.mul"(%653, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %655 = "tosa.add"(%650, %654) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %656 = "tosa.mul"(%655, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %657 = "tosa.tanh"(%656) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %658 = "tosa.add"(%657, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %659 = "tosa.mul"(%651, %658) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %660 = "tosa.reshape"(%659) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %661 = "tosa.reshape"(%660) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %662 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %663 = "tosa.matmul"(%661, %662) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %664 = "tosa.reshape"(%663) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %665 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %666 = "tosa.mul"(%664, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %667 = "tosa.reshape"(%665) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %668 = "tosa.add"(%667, %666) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %669 = "tosa.reshape"(%668) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %670 = "tosa.mul"(%669, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %671 = "tosa.add"(%623, %670) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %672 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %673 = "tosa.reduce_sum"(%671) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %674 = "tosa.reshape"(%672) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %675 = "tosa.mul"(%673, %674) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %676 = "tosa.sub"(%671, %675) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %677 = "tosa.mul"(%676, %676) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %678 = "tosa.reduce_sum"(%677) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %679 = "tosa.reshape"(%672) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %680 = "tosa.mul"(%678, %679) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %681 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %682 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %683 = "tosa.sub"(%671, %675) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %684 = "tosa.add"(%680, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %685 = "tosa.rsqrt"(%684) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %686 = "tosa.mul"(%683, %685) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %687 = "tosa.mul"(%686, %681) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %688 = "tosa.add"(%687, %682) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %689 = "tosa.reshape"(%688) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %690 = "tosa.reshape"(%689) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %691 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %692 = "tosa.matmul"(%690, %691) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %693 = "tosa.reshape"(%692) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %694 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %695 = "tosa.mul"(%693, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %696 = "tosa.reshape"(%694) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %697 = "tosa.add"(%696, %695) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %698 = "tosa.reshape"(%697) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %699 = "tosa.slice"(%698) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %700 = "tosa.slice"(%698) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %701 = "tosa.slice"(%698) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %702 = "tosa.reshape"(%699) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %703 = "tosa.transpose"(%702, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %704 = "tosa.reshape"(%700) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %705 = "tosa.transpose"(%704, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %706 = "tosa.reshape"(%701) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %707 = "tosa.transpose"(%706, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %708 = "tosa.transpose"(%705, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %709 = "tosa.reshape"(%703) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %710 = "tosa.reshape"(%708) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %711 = "tosa.matmul"(%709, %710) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %712 = "tosa.reshape"(%711) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %713 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %714 = "tosa.reshape"(%713) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %715 = "tosa.mul"(%712, %714) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %716 = torch_c.from_builtin_tensor %715 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %717 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %718 = "tosa.slice"(%717) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %719 = torch_c.from_builtin_tensor %718 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %720 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %721 = torch.aten.to.dtype %720, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %722 = torch.valsem.aten.copy %721, %719, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %723 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %724 = torch.aten.where.self %722, %716, %723 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %725 = torch_c.to_builtin_tensor %724 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %726 = "tosa.reduce_max"(%725) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %727 = "tosa.mul"(%726, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %728 = "tosa.sub"(%725, %727) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %729 = "tosa.exp"(%728) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %730 = "tosa.reduce_sum"(%729) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %731 = "tosa.reciprocal"(%730) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %732 = "tosa.mul"(%729, %731) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %733 = "tosa.reshape"(%732) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %734 = "tosa.reshape"(%707) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %735 = "tosa.matmul"(%733, %734) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %736 = "tosa.reshape"(%735) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %737 = "tosa.transpose"(%736, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %738 = "tosa.reshape"(%737) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %739 = "tosa.reshape"(%738) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %740 = "tosa.reshape"(%739) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %741 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %742 = "tosa.matmul"(%740, %741) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %743 = "tosa.reshape"(%742) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %744 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %745 = "tosa.mul"(%743, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %746 = "tosa.reshape"(%744) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %747 = "tosa.add"(%746, %745) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %748 = "tosa.reshape"(%747) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %749 = "tosa.mul"(%671, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %750 = "tosa.add"(%748, %749) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %751 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %752 = "tosa.reduce_sum"(%750) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %753 = "tosa.reshape"(%751) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %754 = "tosa.mul"(%752, %753) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %755 = "tosa.sub"(%750, %754) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %756 = "tosa.mul"(%755, %755) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %757 = "tosa.reduce_sum"(%756) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %758 = "tosa.reshape"(%751) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %759 = "tosa.mul"(%757, %758) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %760 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %761 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %762 = "tosa.sub"(%750, %754) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %763 = "tosa.add"(%759, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %764 = "tosa.rsqrt"(%763) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %765 = "tosa.mul"(%762, %764) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %766 = "tosa.mul"(%765, %760) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %767 = "tosa.add"(%766, %761) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %768 = "tosa.reshape"(%767) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %769 = "tosa.reshape"(%768) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %770 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %771 = "tosa.matmul"(%769, %770) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %772 = "tosa.reshape"(%771) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %773 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %774 = "tosa.mul"(%772, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %775 = "tosa.reshape"(%773) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %776 = "tosa.add"(%775, %774) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %777 = "tosa.reshape"(%776) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %778 = "tosa.mul"(%777, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %779 = "tosa.pow"(%777, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %780 = "tosa.mul"(%779, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %781 = "tosa.mul"(%780, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %782 = "tosa.add"(%777, %781) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %783 = "tosa.mul"(%782, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %784 = "tosa.tanh"(%783) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %785 = "tosa.add"(%784, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %786 = "tosa.mul"(%778, %785) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %787 = "tosa.reshape"(%786) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %788 = "tosa.reshape"(%787) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %789 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %790 = "tosa.matmul"(%788, %789) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %791 = "tosa.reshape"(%790) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %792 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %793 = "tosa.mul"(%791, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %794 = "tosa.reshape"(%792) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %795 = "tosa.add"(%794, %793) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %796 = "tosa.reshape"(%795) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %797 = "tosa.mul"(%796, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %798 = "tosa.add"(%750, %797) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %799 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %800 = "tosa.reduce_sum"(%798) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %801 = "tosa.reshape"(%799) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %802 = "tosa.mul"(%800, %801) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %803 = "tosa.sub"(%798, %802) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %804 = "tosa.mul"(%803, %803) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %805 = "tosa.reduce_sum"(%804) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %806 = "tosa.reshape"(%799) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %807 = "tosa.mul"(%805, %806) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %808 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %809 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %810 = "tosa.sub"(%798, %802) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %811 = "tosa.add"(%807, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %812 = "tosa.rsqrt"(%811) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %813 = "tosa.mul"(%810, %812) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %814 = "tosa.mul"(%813, %808) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %815 = "tosa.add"(%814, %809) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %816 = "tosa.reshape"(%815) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %817 = "tosa.reshape"(%816) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %818 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %819 = "tosa.matmul"(%817, %818) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %820 = "tosa.reshape"(%819) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %821 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %822 = "tosa.mul"(%820, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %823 = "tosa.reshape"(%821) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %824 = "tosa.add"(%823, %822) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %825 = "tosa.reshape"(%824) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %826 = "tosa.slice"(%825) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %827 = "tosa.slice"(%825) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %828 = "tosa.slice"(%825) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %829 = "tosa.reshape"(%826) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %830 = "tosa.transpose"(%829, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %831 = "tosa.reshape"(%827) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %832 = "tosa.transpose"(%831, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %833 = "tosa.reshape"(%828) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %834 = "tosa.transpose"(%833, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %835 = "tosa.transpose"(%832, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %836 = "tosa.reshape"(%830) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %837 = "tosa.reshape"(%835) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %838 = "tosa.matmul"(%836, %837) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %839 = "tosa.reshape"(%838) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %840 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %841 = "tosa.reshape"(%840) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %842 = "tosa.mul"(%839, %841) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %843 = torch_c.from_builtin_tensor %842 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %844 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %845 = "tosa.slice"(%844) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %846 = torch_c.from_builtin_tensor %845 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %847 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %848 = torch.aten.to.dtype %847, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %849 = torch.valsem.aten.copy %848, %846, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %850 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %851 = torch.aten.where.self %849, %843, %850 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %852 = torch_c.to_builtin_tensor %851 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %853 = "tosa.reduce_max"(%852) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %854 = "tosa.mul"(%853, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %855 = "tosa.sub"(%852, %854) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %856 = "tosa.exp"(%855) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %857 = "tosa.reduce_sum"(%856) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %858 = "tosa.reciprocal"(%857) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %859 = "tosa.mul"(%856, %858) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %860 = "tosa.reshape"(%859) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %861 = "tosa.reshape"(%834) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %862 = "tosa.matmul"(%860, %861) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %863 = "tosa.reshape"(%862) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %864 = "tosa.transpose"(%863, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %865 = "tosa.reshape"(%864) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %866 = "tosa.reshape"(%865) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %867 = "tosa.reshape"(%866) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %868 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %869 = "tosa.matmul"(%867, %868) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %870 = "tosa.reshape"(%869) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %871 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %872 = "tosa.mul"(%870, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %873 = "tosa.reshape"(%871) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %874 = "tosa.add"(%873, %872) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %875 = "tosa.reshape"(%874) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %876 = "tosa.mul"(%798, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %877 = "tosa.add"(%875, %876) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %878 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %879 = "tosa.reduce_sum"(%877) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %880 = "tosa.reshape"(%878) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %881 = "tosa.mul"(%879, %880) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %882 = "tosa.sub"(%877, %881) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %883 = "tosa.mul"(%882, %882) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %884 = "tosa.reduce_sum"(%883) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %885 = "tosa.reshape"(%878) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %886 = "tosa.mul"(%884, %885) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %887 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %888 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %889 = "tosa.sub"(%877, %881) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %890 = "tosa.add"(%886, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %891 = "tosa.rsqrt"(%890) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %892 = "tosa.mul"(%889, %891) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %893 = "tosa.mul"(%892, %887) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %894 = "tosa.add"(%893, %888) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %895 = "tosa.reshape"(%894) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %896 = "tosa.reshape"(%895) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %897 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %898 = "tosa.matmul"(%896, %897) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %899 = "tosa.reshape"(%898) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %900 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %901 = "tosa.mul"(%899, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %902 = "tosa.reshape"(%900) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %903 = "tosa.add"(%902, %901) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %904 = "tosa.reshape"(%903) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %905 = "tosa.mul"(%904, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %906 = "tosa.pow"(%904, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %907 = "tosa.mul"(%906, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %908 = "tosa.mul"(%907, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %909 = "tosa.add"(%904, %908) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %910 = "tosa.mul"(%909, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %911 = "tosa.tanh"(%910) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %912 = "tosa.add"(%911, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %913 = "tosa.mul"(%905, %912) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %914 = "tosa.reshape"(%913) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %915 = "tosa.reshape"(%914) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %916 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %917 = "tosa.matmul"(%915, %916) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %918 = "tosa.reshape"(%917) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %919 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %920 = "tosa.mul"(%918, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %921 = "tosa.reshape"(%919) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %922 = "tosa.add"(%921, %920) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %923 = "tosa.reshape"(%922) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %924 = "tosa.mul"(%923, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %925 = "tosa.add"(%877, %924) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %926 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %927 = "tosa.reduce_sum"(%925) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %928 = "tosa.reshape"(%926) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %929 = "tosa.mul"(%927, %928) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %930 = "tosa.sub"(%925, %929) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %931 = "tosa.mul"(%930, %930) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %932 = "tosa.reduce_sum"(%931) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %933 = "tosa.reshape"(%926) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %934 = "tosa.mul"(%932, %933) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %935 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %936 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %937 = "tosa.sub"(%925, %929) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %938 = "tosa.add"(%934, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %939 = "tosa.rsqrt"(%938) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %940 = "tosa.mul"(%937, %939) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %941 = "tosa.mul"(%940, %935) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %942 = "tosa.add"(%941, %936) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %943 = "tosa.reshape"(%942) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %944 = "tosa.reshape"(%943) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %945 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %946 = "tosa.matmul"(%944, %945) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %947 = "tosa.reshape"(%946) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %948 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %949 = "tosa.mul"(%947, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %950 = "tosa.reshape"(%948) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %951 = "tosa.add"(%950, %949) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %952 = "tosa.reshape"(%951) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %953 = "tosa.slice"(%952) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %954 = "tosa.slice"(%952) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %955 = "tosa.slice"(%952) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %956 = "tosa.reshape"(%953) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %957 = "tosa.transpose"(%956, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %958 = "tosa.reshape"(%954) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %959 = "tosa.transpose"(%958, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %960 = "tosa.reshape"(%955) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %961 = "tosa.transpose"(%960, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %962 = "tosa.transpose"(%959, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %963 = "tosa.reshape"(%957) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %964 = "tosa.reshape"(%962) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %965 = "tosa.matmul"(%963, %964) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %966 = "tosa.reshape"(%965) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %967 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %968 = "tosa.reshape"(%967) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %969 = "tosa.mul"(%966, %968) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %970 = torch_c.from_builtin_tensor %969 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %971 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %972 = "tosa.slice"(%971) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %973 = torch_c.from_builtin_tensor %972 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %974 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %975 = torch.aten.to.dtype %974, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %976 = torch.valsem.aten.copy %975, %973, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %977 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %978 = torch.aten.where.self %976, %970, %977 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %979 = torch_c.to_builtin_tensor %978 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %980 = "tosa.reduce_max"(%979) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %981 = "tosa.mul"(%980, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %982 = "tosa.sub"(%979, %981) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %983 = "tosa.exp"(%982) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %984 = "tosa.reduce_sum"(%983) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %985 = "tosa.reciprocal"(%984) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %986 = "tosa.mul"(%983, %985) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %987 = "tosa.reshape"(%986) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %988 = "tosa.reshape"(%961) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %989 = "tosa.matmul"(%987, %988) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %990 = "tosa.reshape"(%989) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %991 = "tosa.transpose"(%990, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %992 = "tosa.reshape"(%991) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %993 = "tosa.reshape"(%992) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %994 = "tosa.reshape"(%993) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %995 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %996 = "tosa.matmul"(%994, %995) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %997 = "tosa.reshape"(%996) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %998 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %999 = "tosa.mul"(%997, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1000 = "tosa.reshape"(%998) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1001 = "tosa.add"(%1000, %999) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1002 = "tosa.reshape"(%1001) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1003 = "tosa.mul"(%925, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1004 = "tosa.add"(%1002, %1003) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1005 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1006 = "tosa.reduce_sum"(%1004) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1007 = "tosa.reshape"(%1005) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1008 = "tosa.mul"(%1006, %1007) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1009 = "tosa.sub"(%1004, %1008) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1010 = "tosa.mul"(%1009, %1009) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1011 = "tosa.reduce_sum"(%1010) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1012 = "tosa.reshape"(%1005) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1013 = "tosa.mul"(%1011, %1012) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1014 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1015 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1016 = "tosa.sub"(%1004, %1008) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1017 = "tosa.add"(%1013, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1018 = "tosa.rsqrt"(%1017) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1019 = "tosa.mul"(%1016, %1018) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1020 = "tosa.mul"(%1019, %1014) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1021 = "tosa.add"(%1020, %1015) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1022 = "tosa.reshape"(%1021) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1023 = "tosa.reshape"(%1022) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1024 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1025 = "tosa.matmul"(%1023, %1024) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1026 = "tosa.reshape"(%1025) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1027 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %1028 = "tosa.mul"(%1026, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %1029 = "tosa.reshape"(%1027) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1030 = "tosa.add"(%1029, %1028) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1031 = "tosa.reshape"(%1030) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1032 = "tosa.mul"(%1031, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1033 = "tosa.pow"(%1031, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1034 = "tosa.mul"(%1033, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1035 = "tosa.mul"(%1034, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1036 = "tosa.add"(%1031, %1035) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1037 = "tosa.mul"(%1036, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1038 = "tosa.tanh"(%1037) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1039 = "tosa.add"(%1038, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1040 = "tosa.mul"(%1032, %1039) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1041 = "tosa.reshape"(%1040) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1042 = "tosa.reshape"(%1041) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1043 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1044 = "tosa.matmul"(%1042, %1043) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1045 = "tosa.reshape"(%1044) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1046 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1047 = "tosa.mul"(%1045, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1048 = "tosa.reshape"(%1046) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1049 = "tosa.add"(%1048, %1047) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1050 = "tosa.reshape"(%1049) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1051 = "tosa.mul"(%1050, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1052 = "tosa.add"(%1004, %1051) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1053 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1054 = "tosa.reduce_sum"(%1052) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1055 = "tosa.reshape"(%1053) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1056 = "tosa.mul"(%1054, %1055) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1057 = "tosa.sub"(%1052, %1056) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1058 = "tosa.mul"(%1057, %1057) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1059 = "tosa.reduce_sum"(%1058) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1060 = "tosa.reshape"(%1053) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1061 = "tosa.mul"(%1059, %1060) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1062 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1063 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1064 = "tosa.sub"(%1052, %1056) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1065 = "tosa.add"(%1061, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1066 = "tosa.rsqrt"(%1065) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1067 = "tosa.mul"(%1064, %1066) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1068 = "tosa.mul"(%1067, %1062) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1069 = "tosa.add"(%1068, %1063) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1070 = "tosa.reshape"(%1069) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1071 = "tosa.reshape"(%1070) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1072 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1073 = "tosa.matmul"(%1071, %1072) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1074 = "tosa.reshape"(%1073) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1075 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %1076 = "tosa.mul"(%1074, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %1077 = "tosa.reshape"(%1075) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1078 = "tosa.add"(%1077, %1076) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1079 = "tosa.reshape"(%1078) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1080 = "tosa.slice"(%1079) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1081 = "tosa.slice"(%1079) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1082 = "tosa.slice"(%1079) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1083 = "tosa.reshape"(%1080) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1084 = "tosa.transpose"(%1083, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1085 = "tosa.reshape"(%1081) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1086 = "tosa.transpose"(%1085, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1087 = "tosa.reshape"(%1082) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1088 = "tosa.transpose"(%1087, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1089 = "tosa.transpose"(%1086, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1090 = "tosa.reshape"(%1084) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1091 = "tosa.reshape"(%1089) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1092 = "tosa.matmul"(%1090, %1091) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1093 = "tosa.reshape"(%1092) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1094 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %1095 = "tosa.reshape"(%1094) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1096 = "tosa.mul"(%1093, %1095) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1097 = torch_c.from_builtin_tensor %1096 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1098 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1099 = "tosa.slice"(%1098) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1100 = torch_c.from_builtin_tensor %1099 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1101 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1102 = torch.aten.to.dtype %1101, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1103 = torch.valsem.aten.copy %1102, %1100, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1104 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %1105 = torch.aten.where.self %1103, %1097, %1104 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1106 = torch_c.to_builtin_tensor %1105 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1107 = "tosa.reduce_max"(%1106) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1108 = "tosa.mul"(%1107, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %1109 = "tosa.sub"(%1106, %1108) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1110 = "tosa.exp"(%1109) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1111 = "tosa.reduce_sum"(%1110) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1112 = "tosa.reciprocal"(%1111) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1113 = "tosa.mul"(%1110, %1112) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1114 = "tosa.reshape"(%1113) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1115 = "tosa.reshape"(%1088) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1116 = "tosa.matmul"(%1114, %1115) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1117 = "tosa.reshape"(%1116) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1118 = "tosa.transpose"(%1117, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1119 = "tosa.reshape"(%1118) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1120 = "tosa.reshape"(%1119) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1121 = "tosa.reshape"(%1120) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1122 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1123 = "tosa.matmul"(%1121, %1122) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1124 = "tosa.reshape"(%1123) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1125 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1126 = "tosa.mul"(%1124, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1127 = "tosa.reshape"(%1125) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1128 = "tosa.add"(%1127, %1126) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1129 = "tosa.reshape"(%1128) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1130 = "tosa.mul"(%1052, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1131 = "tosa.add"(%1129, %1130) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1132 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1133 = "tosa.reduce_sum"(%1131) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1134 = "tosa.reshape"(%1132) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1135 = "tosa.mul"(%1133, %1134) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1136 = "tosa.sub"(%1131, %1135) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1137 = "tosa.mul"(%1136, %1136) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1138 = "tosa.reduce_sum"(%1137) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1139 = "tosa.reshape"(%1132) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1140 = "tosa.mul"(%1138, %1139) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1141 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1142 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1143 = "tosa.sub"(%1131, %1135) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1144 = "tosa.add"(%1140, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1145 = "tosa.rsqrt"(%1144) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1146 = "tosa.mul"(%1143, %1145) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1147 = "tosa.mul"(%1146, %1141) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1148 = "tosa.add"(%1147, %1142) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1149 = "tosa.reshape"(%1148) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1150 = "tosa.reshape"(%1149) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1151 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1152 = "tosa.matmul"(%1150, %1151) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1153 = "tosa.reshape"(%1152) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1154 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %1155 = "tosa.mul"(%1153, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %1156 = "tosa.reshape"(%1154) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1157 = "tosa.add"(%1156, %1155) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1158 = "tosa.reshape"(%1157) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1159 = "tosa.mul"(%1158, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1160 = "tosa.pow"(%1158, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1161 = "tosa.mul"(%1160, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1162 = "tosa.mul"(%1161, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1163 = "tosa.add"(%1158, %1162) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1164 = "tosa.mul"(%1163, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1165 = "tosa.tanh"(%1164) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1166 = "tosa.add"(%1165, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1167 = "tosa.mul"(%1159, %1166) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1168 = "tosa.reshape"(%1167) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1169 = "tosa.reshape"(%1168) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1170 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1171 = "tosa.matmul"(%1169, %1170) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1172 = "tosa.reshape"(%1171) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1173 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1174 = "tosa.mul"(%1172, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1175 = "tosa.reshape"(%1173) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1176 = "tosa.add"(%1175, %1174) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1177 = "tosa.reshape"(%1176) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1178 = "tosa.mul"(%1177, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1179 = "tosa.add"(%1131, %1178) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1180 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1181 = "tosa.reduce_sum"(%1179) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1182 = "tosa.reshape"(%1180) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1183 = "tosa.mul"(%1181, %1182) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1184 = "tosa.sub"(%1179, %1183) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1185 = "tosa.mul"(%1184, %1184) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1186 = "tosa.reduce_sum"(%1185) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1187 = "tosa.reshape"(%1180) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1188 = "tosa.mul"(%1186, %1187) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1189 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1190 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1191 = "tosa.sub"(%1179, %1183) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1192 = "tosa.add"(%1188, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1193 = "tosa.rsqrt"(%1192) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1194 = "tosa.mul"(%1191, %1193) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1195 = "tosa.mul"(%1194, %1189) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1196 = "tosa.add"(%1195, %1190) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1197 = "tosa.reshape"(%1196) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1198 = "tosa.reshape"(%1197) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1199 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1200 = "tosa.matmul"(%1198, %1199) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1201 = "tosa.reshape"(%1200) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1202 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %1203 = "tosa.mul"(%1201, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %1204 = "tosa.reshape"(%1202) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1205 = "tosa.add"(%1204, %1203) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1206 = "tosa.reshape"(%1205) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1207 = "tosa.slice"(%1206) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1208 = "tosa.slice"(%1206) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1209 = "tosa.slice"(%1206) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1210 = "tosa.reshape"(%1207) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1211 = "tosa.transpose"(%1210, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1212 = "tosa.reshape"(%1208) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1213 = "tosa.transpose"(%1212, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1214 = "tosa.reshape"(%1209) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1215 = "tosa.transpose"(%1214, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1216 = "tosa.transpose"(%1213, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1217 = "tosa.reshape"(%1211) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1218 = "tosa.reshape"(%1216) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1219 = "tosa.matmul"(%1217, %1218) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1220 = "tosa.reshape"(%1219) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1221 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %1222 = "tosa.reshape"(%1221) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1223 = "tosa.mul"(%1220, %1222) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1224 = torch_c.from_builtin_tensor %1223 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1225 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1226 = "tosa.slice"(%1225) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1227 = torch_c.from_builtin_tensor %1226 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1228 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1229 = torch.aten.to.dtype %1228, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1230 = torch.valsem.aten.copy %1229, %1227, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1231 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %1232 = torch.aten.where.self %1230, %1224, %1231 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1233 = torch_c.to_builtin_tensor %1232 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1234 = "tosa.reduce_max"(%1233) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1235 = "tosa.mul"(%1234, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %1236 = "tosa.sub"(%1233, %1235) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1237 = "tosa.exp"(%1236) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1238 = "tosa.reduce_sum"(%1237) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1239 = "tosa.reciprocal"(%1238) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1240 = "tosa.mul"(%1237, %1239) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1241 = "tosa.reshape"(%1240) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1242 = "tosa.reshape"(%1215) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1243 = "tosa.matmul"(%1241, %1242) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1244 = "tosa.reshape"(%1243) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1245 = "tosa.transpose"(%1244, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1246 = "tosa.reshape"(%1245) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1247 = "tosa.reshape"(%1246) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1248 = "tosa.reshape"(%1247) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1249 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1250 = "tosa.matmul"(%1248, %1249) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1251 = "tosa.reshape"(%1250) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1252 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1253 = "tosa.mul"(%1251, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1254 = "tosa.reshape"(%1252) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1255 = "tosa.add"(%1254, %1253) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1256 = "tosa.reshape"(%1255) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1257 = "tosa.mul"(%1179, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1258 = "tosa.add"(%1256, %1257) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1259 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1260 = "tosa.reduce_sum"(%1258) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1261 = "tosa.reshape"(%1259) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1262 = "tosa.mul"(%1260, %1261) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1263 = "tosa.sub"(%1258, %1262) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1264 = "tosa.mul"(%1263, %1263) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1265 = "tosa.reduce_sum"(%1264) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1266 = "tosa.reshape"(%1259) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1267 = "tosa.mul"(%1265, %1266) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1268 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1269 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1270 = "tosa.sub"(%1258, %1262) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1271 = "tosa.add"(%1267, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1272 = "tosa.rsqrt"(%1271) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1273 = "tosa.mul"(%1270, %1272) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1274 = "tosa.mul"(%1273, %1268) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1275 = "tosa.add"(%1274, %1269) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1276 = "tosa.reshape"(%1275) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1277 = "tosa.reshape"(%1276) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1278 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1279 = "tosa.matmul"(%1277, %1278) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1280 = "tosa.reshape"(%1279) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1281 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %1282 = "tosa.mul"(%1280, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %1283 = "tosa.reshape"(%1281) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1284 = "tosa.add"(%1283, %1282) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1285 = "tosa.reshape"(%1284) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1286 = "tosa.mul"(%1285, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1287 = "tosa.pow"(%1285, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1288 = "tosa.mul"(%1287, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1289 = "tosa.mul"(%1288, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1290 = "tosa.add"(%1285, %1289) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1291 = "tosa.mul"(%1290, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1292 = "tosa.tanh"(%1291) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1293 = "tosa.add"(%1292, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1294 = "tosa.mul"(%1286, %1293) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1295 = "tosa.reshape"(%1294) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1296 = "tosa.reshape"(%1295) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1297 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1298 = "tosa.matmul"(%1296, %1297) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1299 = "tosa.reshape"(%1298) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1300 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1301 = "tosa.mul"(%1299, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1302 = "tosa.reshape"(%1300) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1303 = "tosa.add"(%1302, %1301) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1304 = "tosa.reshape"(%1303) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1305 = "tosa.mul"(%1304, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1306 = "tosa.add"(%1258, %1305) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1307 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1308 = "tosa.reduce_sum"(%1306) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1309 = "tosa.reshape"(%1307) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1310 = "tosa.mul"(%1308, %1309) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1311 = "tosa.sub"(%1306, %1310) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1312 = "tosa.mul"(%1311, %1311) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1313 = "tosa.reduce_sum"(%1312) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1314 = "tosa.reshape"(%1307) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1315 = "tosa.mul"(%1313, %1314) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1316 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1317 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1318 = "tosa.sub"(%1306, %1310) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1319 = "tosa.add"(%1315, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1320 = "tosa.rsqrt"(%1319) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1321 = "tosa.mul"(%1318, %1320) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1322 = "tosa.mul"(%1321, %1316) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1323 = "tosa.add"(%1322, %1317) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1324 = "tosa.reshape"(%1323) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1325 = "tosa.reshape"(%1324) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1326 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1327 = "tosa.matmul"(%1325, %1326) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1328 = "tosa.reshape"(%1327) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1329 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %1330 = "tosa.mul"(%1328, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %1331 = "tosa.reshape"(%1329) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1332 = "tosa.add"(%1331, %1330) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1333 = "tosa.reshape"(%1332) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1334 = "tosa.slice"(%1333) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1335 = "tosa.slice"(%1333) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1336 = "tosa.slice"(%1333) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1337 = "tosa.reshape"(%1334) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1338 = "tosa.transpose"(%1337, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1339 = "tosa.reshape"(%1335) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1340 = "tosa.transpose"(%1339, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1341 = "tosa.reshape"(%1336) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1342 = "tosa.transpose"(%1341, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1343 = "tosa.transpose"(%1340, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1344 = "tosa.reshape"(%1338) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1345 = "tosa.reshape"(%1343) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1346 = "tosa.matmul"(%1344, %1345) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1347 = "tosa.reshape"(%1346) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1348 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %1349 = "tosa.reshape"(%1348) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1350 = "tosa.mul"(%1347, %1349) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1351 = torch_c.from_builtin_tensor %1350 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1352 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1353 = "tosa.slice"(%1352) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1354 = torch_c.from_builtin_tensor %1353 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1355 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1356 = torch.aten.to.dtype %1355, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1357 = torch.valsem.aten.copy %1356, %1354, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1358 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %1359 = torch.aten.where.self %1357, %1351, %1358 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1360 = torch_c.to_builtin_tensor %1359 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1361 = "tosa.reduce_max"(%1360) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1362 = "tosa.mul"(%1361, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %1363 = "tosa.sub"(%1360, %1362) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1364 = "tosa.exp"(%1363) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1365 = "tosa.reduce_sum"(%1364) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1366 = "tosa.reciprocal"(%1365) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1367 = "tosa.mul"(%1364, %1366) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1368 = "tosa.reshape"(%1367) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1369 = "tosa.reshape"(%1342) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1370 = "tosa.matmul"(%1368, %1369) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1371 = "tosa.reshape"(%1370) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1372 = "tosa.transpose"(%1371, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1373 = "tosa.reshape"(%1372) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1374 = "tosa.reshape"(%1373) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1375 = "tosa.reshape"(%1374) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1376 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1377 = "tosa.matmul"(%1375, %1376) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1378 = "tosa.reshape"(%1377) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1379 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1380 = "tosa.mul"(%1378, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1381 = "tosa.reshape"(%1379) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1382 = "tosa.add"(%1381, %1380) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1383 = "tosa.reshape"(%1382) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1384 = "tosa.mul"(%1306, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1385 = "tosa.add"(%1383, %1384) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1386 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1387 = "tosa.reduce_sum"(%1385) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1388 = "tosa.reshape"(%1386) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1389 = "tosa.mul"(%1387, %1388) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1390 = "tosa.sub"(%1385, %1389) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1391 = "tosa.mul"(%1390, %1390) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1392 = "tosa.reduce_sum"(%1391) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1393 = "tosa.reshape"(%1386) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1394 = "tosa.mul"(%1392, %1393) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1395 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1396 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1397 = "tosa.sub"(%1385, %1389) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1398 = "tosa.add"(%1394, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1399 = "tosa.rsqrt"(%1398) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1400 = "tosa.mul"(%1397, %1399) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1401 = "tosa.mul"(%1400, %1395) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1402 = "tosa.add"(%1401, %1396) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1403 = "tosa.reshape"(%1402) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1404 = "tosa.reshape"(%1403) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1405 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1406 = "tosa.matmul"(%1404, %1405) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1407 = "tosa.reshape"(%1406) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1408 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %1409 = "tosa.mul"(%1407, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %1410 = "tosa.reshape"(%1408) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1411 = "tosa.add"(%1410, %1409) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1412 = "tosa.reshape"(%1411) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1413 = "tosa.mul"(%1412, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1414 = "tosa.pow"(%1412, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1415 = "tosa.mul"(%1414, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1416 = "tosa.mul"(%1415, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1417 = "tosa.add"(%1412, %1416) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1418 = "tosa.mul"(%1417, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1419 = "tosa.tanh"(%1418) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1420 = "tosa.add"(%1419, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1421 = "tosa.mul"(%1413, %1420) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1422 = "tosa.reshape"(%1421) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1423 = "tosa.reshape"(%1422) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1424 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1425 = "tosa.matmul"(%1423, %1424) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1426 = "tosa.reshape"(%1425) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1427 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1428 = "tosa.mul"(%1426, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1429 = "tosa.reshape"(%1427) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1430 = "tosa.add"(%1429, %1428) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1431 = "tosa.reshape"(%1430) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1432 = "tosa.mul"(%1431, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1433 = "tosa.add"(%1385, %1432) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1434 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1435 = "tosa.reduce_sum"(%1433) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1436 = "tosa.reshape"(%1434) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1437 = "tosa.mul"(%1435, %1436) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1438 = "tosa.sub"(%1433, %1437) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1439 = "tosa.mul"(%1438, %1438) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1440 = "tosa.reduce_sum"(%1439) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1441 = "tosa.reshape"(%1434) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1442 = "tosa.mul"(%1440, %1441) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1443 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1444 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1445 = "tosa.sub"(%1433, %1437) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1446 = "tosa.add"(%1442, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1447 = "tosa.rsqrt"(%1446) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1448 = "tosa.mul"(%1445, %1447) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1449 = "tosa.mul"(%1448, %1443) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1450 = "tosa.add"(%1449, %1444) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1451 = "tosa.reshape"(%1450) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1452 = "tosa.reshape"(%1451) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1453 = "tosa.reshape"(%19) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1454 = "tosa.matmul"(%1452, %1453) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1455 = "tosa.reshape"(%1454) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1456 = "tosa.mul"(%2, %18) {shift = 0 : i32} : (tensor<1xf32>, tensor<2304xf32>) -> tensor<2304xf32>
  %1457 = "tosa.mul"(%1455, %3) {shift = 0 : i32} : (tensor<5x2304xf32>, tensor<1x1xf32>) -> tensor<5x2304xf32>
  %1458 = "tosa.reshape"(%1456) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1459 = "tosa.add"(%1458, %1457) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1460 = "tosa.reshape"(%1459) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1461 = "tosa.slice"(%1460) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1462 = "tosa.slice"(%1460) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1463 = "tosa.slice"(%1460) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1464 = "tosa.reshape"(%1461) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1465 = "tosa.transpose"(%1464, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1466 = "tosa.reshape"(%1462) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1467 = "tosa.transpose"(%1466, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1468 = "tosa.reshape"(%1463) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1469 = "tosa.transpose"(%1468, %11) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1470 = "tosa.transpose"(%1467, %10) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1471 = "tosa.reshape"(%1465) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1472 = "tosa.reshape"(%1470) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1473 = "tosa.matmul"(%1471, %1472) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1474 = "tosa.reshape"(%1473) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1475 = "tosa.reciprocal"(%15) : (tensor<f32>) -> tensor<f32>
  %1476 = "tosa.reshape"(%1475) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1477 = "tosa.mul"(%1474, %1476) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1478 = torch_c.from_builtin_tensor %1477 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1479 = "tosa.slice"(%16) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1480 = "tosa.slice"(%1479) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1481 = torch_c.from_builtin_tensor %1480 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1482 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1483 = torch.aten.to.dtype %1482, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1484 = torch.valsem.aten.copy %1483, %1481, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1485 = torch_c.from_builtin_tensor %17 : tensor<f32> -> !torch.vtensor<[],f32>
  %1486 = torch.aten.where.self %1484, %1478, %1485 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1487 = torch_c.to_builtin_tensor %1486 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1488 = "tosa.reduce_max"(%1487) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1489 = "tosa.mul"(%1488, %4) {shift = 0 : i32} : (tensor<1x12x5x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x1xf32>
  %1490 = "tosa.sub"(%1487, %1489) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1491 = "tosa.exp"(%1490) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1492 = "tosa.reduce_sum"(%1491) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1493 = "tosa.reciprocal"(%1492) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1494 = "tosa.mul"(%1491, %1493) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1495 = "tosa.reshape"(%1494) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1496 = "tosa.reshape"(%1469) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1497 = "tosa.matmul"(%1495, %1496) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1498 = "tosa.reshape"(%1497) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1499 = "tosa.transpose"(%1498, %11) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1500 = "tosa.reshape"(%1499) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1501 = "tosa.reshape"(%1500) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1502 = "tosa.reshape"(%1501) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1503 = "tosa.reshape"(%20) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1504 = "tosa.matmul"(%1502, %1503) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1505 = "tosa.reshape"(%1504) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1506 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1507 = "tosa.mul"(%1505, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1508 = "tosa.reshape"(%1506) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1509 = "tosa.add"(%1508, %1507) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1510 = "tosa.reshape"(%1509) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1511 = "tosa.mul"(%1433, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1512 = "tosa.add"(%1510, %1511) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1513 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1514 = "tosa.reduce_sum"(%1512) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1515 = "tosa.reshape"(%1513) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1516 = "tosa.mul"(%1514, %1515) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1517 = "tosa.sub"(%1512, %1516) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1518 = "tosa.mul"(%1517, %1517) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1519 = "tosa.reduce_sum"(%1518) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1520 = "tosa.reshape"(%1513) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1521 = "tosa.mul"(%1519, %1520) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1522 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1523 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1524 = "tosa.sub"(%1512, %1516) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1525 = "tosa.add"(%1521, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1526 = "tosa.rsqrt"(%1525) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1527 = "tosa.mul"(%1524, %1526) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1528 = "tosa.mul"(%1527, %1522) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1529 = "tosa.add"(%1528, %1523) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1530 = "tosa.reshape"(%1529) {new_shape = [-1, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1531 = "tosa.reshape"(%1530) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1532 = "tosa.reshape"(%22) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1533 = "tosa.matmul"(%1531, %1532) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1534 = "tosa.reshape"(%1533) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1535 = "tosa.mul"(%2, %21) {shift = 0 : i32} : (tensor<1xf32>, tensor<3072xf32>) -> tensor<3072xf32>
  %1536 = "tosa.mul"(%1534, %3) {shift = 0 : i32} : (tensor<5x3072xf32>, tensor<1x1xf32>) -> tensor<5x3072xf32>
  %1537 = "tosa.reshape"(%1535) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1538 = "tosa.add"(%1537, %1536) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1539 = "tosa.reshape"(%1538) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1540 = "tosa.mul"(%1539, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1541 = "tosa.pow"(%1539, %6) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1542 = "tosa.mul"(%1541, %7) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1543 = "tosa.mul"(%1542, %0) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1544 = "tosa.add"(%1539, %1543) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1545 = "tosa.mul"(%1544, %8) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1546 = "tosa.tanh"(%1545) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1547 = "tosa.add"(%1546, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1548 = "tosa.mul"(%1540, %1547) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1549 = "tosa.reshape"(%1548) {new_shape = [-1, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1550 = "tosa.reshape"(%1549) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1551 = "tosa.reshape"(%23) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1552 = "tosa.matmul"(%1550, %1551) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1553 = "tosa.reshape"(%1552) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1554 = "tosa.mul"(%2, %24) {shift = 0 : i32} : (tensor<1xf32>, tensor<768xf32>) -> tensor<768xf32>
  %1555 = "tosa.mul"(%1553, %3) {shift = 0 : i32} : (tensor<5x768xf32>, tensor<1x1xf32>) -> tensor<5x768xf32>
  %1556 = "tosa.reshape"(%1554) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1557 = "tosa.add"(%1556, %1555) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1558 = "tosa.reshape"(%1557) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1559 = "tosa.mul"(%1558, %0) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x1xf32>) -> tensor<1x5x768xf32>
  %1560 = "tosa.add"(%1512, %1559) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1561 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %1562 = "tosa.reduce_sum"(%1560) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1563 = "tosa.reshape"(%1561) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1564 = "tosa.mul"(%1562, %1563) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1565 = "tosa.sub"(%1560, %1564) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1566 = "tosa.mul"(%1565, %1565) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1567 = "tosa.reduce_sum"(%1566) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1568 = "tosa.reshape"(%1561) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1569 = "tosa.mul"(%1567, %1568) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1570 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1571 = "tosa.reshape"(%24) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1572 = "tosa.sub"(%1560, %1564) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1573 = "tosa.add"(%1569, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1574 = "tosa.rsqrt"(%1573) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1575 = "tosa.mul"(%1572, %1574) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1576 = "tosa.mul"(%1575, %1570) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1577 = "tosa.add"(%1576, %1571) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1578 = "tosa.transpose"(%13, %9) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
  %1579 = "tosa.reshape"(%1577) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1580 = "tosa.reshape"(%1579) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1581 = "tosa.reshape"(%1578) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
  %1582 = "tosa.matmul"(%1580, %1581) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
  %1583 = "tosa.reshape"(%1582) {new_shape = [5, 50257]} : (tensor<1x5x50257xf32>) -> tensor<5x50257xf32>
  %1584 = "tosa.reshape"(%1583) {new_shape = [1, 5, 50257]} : (tensor<5x50257xf32>) -> tensor<1x5x50257xf32>
  %1585 = torch_c.from_builtin_tensor %1584 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
  return %1585 : !torch.vtensor<[1,5,50257],f32>
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @forward(%arg0: !torch.vtensor<[1,5],si64>) -> !torch.vtensor<[1,5,50257],f32> {
  %0 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %1 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %2 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %3 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %4 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %5 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %6 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %7 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %8 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %9 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %int0 = torch.constant.int 0
  %int4 = torch.constant.int 4
  %int11 = torch.constant.int 11
  %none = torch.constant.none
  %false = torch.constant.bool false
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %12 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %13 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %14 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %15 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %17 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %19 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %21 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %int1 = torch.constant.int 1
  %int5 = torch.constant.int 5
  %22 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %cpu = torch.constant.device "cpu"
  %23 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %24 = torch_c.to_builtin_tensor %23 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %25 = "tosa.reshape"(%24) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %26 = "tosa.reshape"(%10) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %27 = "tosa.cast"(%22) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %28 = "tosa.gather"(%26, %27) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %29 = "tosa.reshape"(%11) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %30 = "tosa.cast"(%25) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %31 = "tosa.gather"(%29, %30) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %32 = "tosa.add"(%28, %31) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %33 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %34 = "tosa.reduce_sum"(%32) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %35 = "tosa.reshape"(%33) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %36 = "tosa.mul"(%34, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %37 = "tosa.sub"(%32, %36) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %38 = "tosa.mul"(%37, %37) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %39 = "tosa.reduce_sum"(%38) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %40 = "tosa.reshape"(%33) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %41 = "tosa.mul"(%39, %40) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %42 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %43 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %44 = "tosa.sub"(%32, %36) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %45 = "tosa.add"(%41, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %46 = "tosa.rsqrt"(%45) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %47 = "tosa.mul"(%44, %46) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %48 = "tosa.mul"(%47, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %49 = "tosa.add"(%48, %43) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %50 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %51 = "tosa.matmul"(%49, %50) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %52 = "tosa.reshape"(%51) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %53 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %54 = "tosa.add"(%53, %52) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %55 = "tosa.reshape"(%54) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %56 = "tosa.slice"(%55) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %57 = "tosa.slice"(%55) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %58 = "tosa.slice"(%55) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %59 = "tosa.reshape"(%56) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %60 = "tosa.transpose"(%59, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %61 = "tosa.reshape"(%57) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %62 = "tosa.transpose"(%61, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %63 = "tosa.reshape"(%58) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %64 = "tosa.transpose"(%63, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %65 = "tosa.transpose"(%62, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %66 = "tosa.reshape"(%60) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %67 = "tosa.reshape"(%65) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %68 = "tosa.matmul"(%66, %67) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %69 = "tosa.reshape"(%68) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %70 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %71 = "tosa.reshape"(%70) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %72 = "tosa.mul"(%69, %71) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %73 = torch_c.from_builtin_tensor %72 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %74 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %75 = "tosa.slice"(%74) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %76 = torch_c.from_builtin_tensor %75 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %77 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %78 = torch.aten.to.dtype %77, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %79 = torch.valsem.aten.copy %78, %76, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %80 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %81 = torch.aten.where.self %79, %73, %80 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %82 = torch_c.to_builtin_tensor %81 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %83 = "tosa.reduce_max"(%82) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %84 = "tosa.sub"(%82, %83) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %85 = "tosa.exp"(%84) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %86 = "tosa.reduce_sum"(%85) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %87 = "tosa.reciprocal"(%86) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %88 = "tosa.mul"(%85, %87) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %89 = "tosa.reshape"(%88) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %90 = "tosa.reshape"(%64) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %91 = "tosa.matmul"(%89, %90) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %92 = "tosa.reshape"(%91) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %93 = "tosa.transpose"(%92, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %94 = "tosa.reshape"(%93) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %95 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %96 = "tosa.matmul"(%94, %95) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %97 = "tosa.reshape"(%96) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %98 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %99 = "tosa.add"(%98, %97) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %100 = "tosa.reshape"(%99) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %101 = "tosa.add"(%100, %32) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %102 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %103 = "tosa.reduce_sum"(%101) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %104 = "tosa.reshape"(%102) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %105 = "tosa.mul"(%103, %104) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %106 = "tosa.sub"(%101, %105) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %107 = "tosa.mul"(%106, %106) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %108 = "tosa.reduce_sum"(%107) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %109 = "tosa.reshape"(%102) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %110 = "tosa.mul"(%108, %109) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %111 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %112 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %113 = "tosa.sub"(%101, %105) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %114 = "tosa.add"(%110, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %115 = "tosa.rsqrt"(%114) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %116 = "tosa.mul"(%113, %115) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %117 = "tosa.mul"(%116, %111) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %118 = "tosa.add"(%117, %112) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %119 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %120 = "tosa.matmul"(%118, %119) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %121 = "tosa.reshape"(%120) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %122 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %123 = "tosa.add"(%122, %121) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %124 = "tosa.reshape"(%123) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %125 = "tosa.mul"(%124, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %126 = "tosa.pow"(%124, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %127 = "tosa.mul"(%126, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %128 = "tosa.add"(%124, %127) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %129 = "tosa.mul"(%128, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %130 = "tosa.tanh"(%129) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %131 = "tosa.add"(%130, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %132 = "tosa.mul"(%125, %131) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %133 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %134 = "tosa.matmul"(%132, %133) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %135 = "tosa.reshape"(%134) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %136 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %137 = "tosa.add"(%136, %135) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %138 = "tosa.reshape"(%137) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %139 = "tosa.add"(%101, %138) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %140 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %141 = "tosa.reduce_sum"(%139) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %142 = "tosa.reshape"(%140) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %143 = "tosa.mul"(%141, %142) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %144 = "tosa.sub"(%139, %143) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %145 = "tosa.mul"(%144, %144) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %146 = "tosa.reduce_sum"(%145) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %147 = "tosa.reshape"(%140) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %148 = "tosa.mul"(%146, %147) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %149 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %150 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %151 = "tosa.sub"(%139, %143) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %152 = "tosa.add"(%148, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %153 = "tosa.rsqrt"(%152) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %154 = "tosa.mul"(%151, %153) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %155 = "tosa.mul"(%154, %149) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %156 = "tosa.add"(%155, %150) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %157 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %158 = "tosa.matmul"(%156, %157) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %159 = "tosa.reshape"(%158) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %160 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %161 = "tosa.add"(%160, %159) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %162 = "tosa.reshape"(%161) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %163 = "tosa.slice"(%162) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %164 = "tosa.slice"(%162) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %165 = "tosa.slice"(%162) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %166 = "tosa.reshape"(%163) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %167 = "tosa.transpose"(%166, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %168 = "tosa.reshape"(%164) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %169 = "tosa.transpose"(%168, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %170 = "tosa.reshape"(%165) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %171 = "tosa.transpose"(%170, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %172 = "tosa.transpose"(%169, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %173 = "tosa.reshape"(%167) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %174 = "tosa.reshape"(%172) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %175 = "tosa.matmul"(%173, %174) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %176 = "tosa.reshape"(%175) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %177 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %178 = "tosa.reshape"(%177) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %179 = "tosa.mul"(%176, %178) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %180 = torch_c.from_builtin_tensor %179 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %181 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %182 = "tosa.slice"(%181) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %183 = torch_c.from_builtin_tensor %182 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %184 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %185 = torch.aten.to.dtype %184, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %186 = torch.valsem.aten.copy %185, %183, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %187 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %188 = torch.aten.where.self %186, %180, %187 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %189 = torch_c.to_builtin_tensor %188 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %190 = "tosa.reduce_max"(%189) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %191 = "tosa.sub"(%189, %190) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %192 = "tosa.exp"(%191) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %193 = "tosa.reduce_sum"(%192) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %194 = "tosa.reciprocal"(%193) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %195 = "tosa.mul"(%192, %194) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %196 = "tosa.reshape"(%195) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %197 = "tosa.reshape"(%171) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %198 = "tosa.matmul"(%196, %197) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %199 = "tosa.reshape"(%198) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %200 = "tosa.transpose"(%199, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %201 = "tosa.reshape"(%200) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %202 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %203 = "tosa.matmul"(%201, %202) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %204 = "tosa.reshape"(%203) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %205 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %206 = "tosa.add"(%205, %204) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %207 = "tosa.reshape"(%206) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %208 = "tosa.add"(%207, %139) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %209 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %210 = "tosa.reduce_sum"(%208) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %211 = "tosa.reshape"(%209) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %212 = "tosa.mul"(%210, %211) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %213 = "tosa.sub"(%208, %212) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %214 = "tosa.mul"(%213, %213) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %215 = "tosa.reduce_sum"(%214) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %216 = "tosa.reshape"(%209) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %217 = "tosa.mul"(%215, %216) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %218 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %219 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %220 = "tosa.sub"(%208, %212) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %221 = "tosa.add"(%217, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %222 = "tosa.rsqrt"(%221) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %223 = "tosa.mul"(%220, %222) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %224 = "tosa.mul"(%223, %218) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %225 = "tosa.add"(%224, %219) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %226 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %227 = "tosa.matmul"(%225, %226) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %228 = "tosa.reshape"(%227) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %229 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %230 = "tosa.add"(%229, %228) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %231 = "tosa.reshape"(%230) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %232 = "tosa.mul"(%231, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %233 = "tosa.pow"(%231, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %234 = "tosa.mul"(%233, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %235 = "tosa.add"(%231, %234) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %236 = "tosa.mul"(%235, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %237 = "tosa.tanh"(%236) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %238 = "tosa.add"(%237, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %239 = "tosa.mul"(%232, %238) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %240 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %241 = "tosa.matmul"(%239, %240) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %242 = "tosa.reshape"(%241) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %243 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %244 = "tosa.add"(%243, %242) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %245 = "tosa.reshape"(%244) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %246 = "tosa.add"(%208, %245) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %247 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %248 = "tosa.reduce_sum"(%246) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %249 = "tosa.reshape"(%247) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %250 = "tosa.mul"(%248, %249) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %251 = "tosa.sub"(%246, %250) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %252 = "tosa.mul"(%251, %251) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %253 = "tosa.reduce_sum"(%252) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %254 = "tosa.reshape"(%247) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %255 = "tosa.mul"(%253, %254) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %256 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %257 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %258 = "tosa.sub"(%246, %250) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %259 = "tosa.add"(%255, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %260 = "tosa.rsqrt"(%259) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %261 = "tosa.mul"(%258, %260) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %262 = "tosa.mul"(%261, %256) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %263 = "tosa.add"(%262, %257) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %264 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %265 = "tosa.matmul"(%263, %264) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %266 = "tosa.reshape"(%265) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %267 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %268 = "tosa.add"(%267, %266) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %269 = "tosa.reshape"(%268) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %270 = "tosa.slice"(%269) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %271 = "tosa.slice"(%269) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %272 = "tosa.slice"(%269) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %273 = "tosa.reshape"(%270) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %274 = "tosa.transpose"(%273, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %275 = "tosa.reshape"(%271) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %276 = "tosa.transpose"(%275, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %277 = "tosa.reshape"(%272) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %278 = "tosa.transpose"(%277, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %279 = "tosa.transpose"(%276, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %280 = "tosa.reshape"(%274) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %281 = "tosa.reshape"(%279) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %282 = "tosa.matmul"(%280, %281) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %283 = "tosa.reshape"(%282) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %284 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %285 = "tosa.reshape"(%284) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %286 = "tosa.mul"(%283, %285) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %287 = torch_c.from_builtin_tensor %286 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %288 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %289 = "tosa.slice"(%288) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %290 = torch_c.from_builtin_tensor %289 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %291 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %292 = torch.aten.to.dtype %291, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %293 = torch.valsem.aten.copy %292, %290, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %294 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %295 = torch.aten.where.self %293, %287, %294 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %296 = torch_c.to_builtin_tensor %295 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %297 = "tosa.reduce_max"(%296) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %298 = "tosa.sub"(%296, %297) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %299 = "tosa.exp"(%298) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %300 = "tosa.reduce_sum"(%299) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %301 = "tosa.reciprocal"(%300) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %302 = "tosa.mul"(%299, %301) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %303 = "tosa.reshape"(%302) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %304 = "tosa.reshape"(%278) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %305 = "tosa.matmul"(%303, %304) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %306 = "tosa.reshape"(%305) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %307 = "tosa.transpose"(%306, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %308 = "tosa.reshape"(%307) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %309 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %310 = "tosa.matmul"(%308, %309) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %311 = "tosa.reshape"(%310) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %312 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %313 = "tosa.add"(%312, %311) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %314 = "tosa.reshape"(%313) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %315 = "tosa.add"(%314, %246) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %316 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %317 = "tosa.reduce_sum"(%315) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %318 = "tosa.reshape"(%316) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %319 = "tosa.mul"(%317, %318) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %320 = "tosa.sub"(%315, %319) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %321 = "tosa.mul"(%320, %320) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %322 = "tosa.reduce_sum"(%321) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %323 = "tosa.reshape"(%316) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %324 = "tosa.mul"(%322, %323) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %325 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %326 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %327 = "tosa.sub"(%315, %319) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %328 = "tosa.add"(%324, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %329 = "tosa.rsqrt"(%328) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %330 = "tosa.mul"(%327, %329) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %331 = "tosa.mul"(%330, %325) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %332 = "tosa.add"(%331, %326) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %333 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %334 = "tosa.matmul"(%332, %333) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %335 = "tosa.reshape"(%334) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %336 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %337 = "tosa.add"(%336, %335) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %338 = "tosa.reshape"(%337) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %339 = "tosa.mul"(%338, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %340 = "tosa.pow"(%338, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %341 = "tosa.mul"(%340, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %342 = "tosa.add"(%338, %341) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %343 = "tosa.mul"(%342, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %344 = "tosa.tanh"(%343) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %345 = "tosa.add"(%344, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %346 = "tosa.mul"(%339, %345) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %347 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %348 = "tosa.matmul"(%346, %347) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %349 = "tosa.reshape"(%348) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %350 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %351 = "tosa.add"(%350, %349) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %352 = "tosa.reshape"(%351) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %353 = "tosa.add"(%315, %352) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %354 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %355 = "tosa.reduce_sum"(%353) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %356 = "tosa.reshape"(%354) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %357 = "tosa.mul"(%355, %356) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %358 = "tosa.sub"(%353, %357) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %359 = "tosa.mul"(%358, %358) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %360 = "tosa.reduce_sum"(%359) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %361 = "tosa.reshape"(%354) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %362 = "tosa.mul"(%360, %361) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %363 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %364 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %365 = "tosa.sub"(%353, %357) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %366 = "tosa.add"(%362, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %367 = "tosa.rsqrt"(%366) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %368 = "tosa.mul"(%365, %367) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %369 = "tosa.mul"(%368, %363) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %370 = "tosa.add"(%369, %364) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %371 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %372 = "tosa.matmul"(%370, %371) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %373 = "tosa.reshape"(%372) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %374 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %375 = "tosa.add"(%374, %373) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %376 = "tosa.reshape"(%375) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %377 = "tosa.slice"(%376) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %378 = "tosa.slice"(%376) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %379 = "tosa.slice"(%376) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %380 = "tosa.reshape"(%377) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %381 = "tosa.transpose"(%380, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %382 = "tosa.reshape"(%378) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %383 = "tosa.transpose"(%382, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %384 = "tosa.reshape"(%379) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %385 = "tosa.transpose"(%384, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %386 = "tosa.transpose"(%383, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %387 = "tosa.reshape"(%381) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %388 = "tosa.reshape"(%386) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %389 = "tosa.matmul"(%387, %388) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %390 = "tosa.reshape"(%389) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %391 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %392 = "tosa.reshape"(%391) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %393 = "tosa.mul"(%390, %392) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %394 = torch_c.from_builtin_tensor %393 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %395 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %396 = "tosa.slice"(%395) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %397 = torch_c.from_builtin_tensor %396 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %398 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %399 = torch.aten.to.dtype %398, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %400 = torch.valsem.aten.copy %399, %397, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %401 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %402 = torch.aten.where.self %400, %394, %401 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %403 = torch_c.to_builtin_tensor %402 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %404 = "tosa.reduce_max"(%403) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %405 = "tosa.sub"(%403, %404) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %406 = "tosa.exp"(%405) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %407 = "tosa.reduce_sum"(%406) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %408 = "tosa.reciprocal"(%407) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %409 = "tosa.mul"(%406, %408) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %410 = "tosa.reshape"(%409) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %411 = "tosa.reshape"(%385) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %412 = "tosa.matmul"(%410, %411) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %413 = "tosa.reshape"(%412) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %414 = "tosa.transpose"(%413, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %415 = "tosa.reshape"(%414) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %416 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %417 = "tosa.matmul"(%415, %416) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %418 = "tosa.reshape"(%417) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %419 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %420 = "tosa.add"(%419, %418) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %421 = "tosa.reshape"(%420) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %422 = "tosa.add"(%421, %353) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %423 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %424 = "tosa.reduce_sum"(%422) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %425 = "tosa.reshape"(%423) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %426 = "tosa.mul"(%424, %425) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %427 = "tosa.sub"(%422, %426) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %428 = "tosa.mul"(%427, %427) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %429 = "tosa.reduce_sum"(%428) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %430 = "tosa.reshape"(%423) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %431 = "tosa.mul"(%429, %430) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %432 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %433 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %434 = "tosa.sub"(%422, %426) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %435 = "tosa.add"(%431, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %436 = "tosa.rsqrt"(%435) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %437 = "tosa.mul"(%434, %436) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %438 = "tosa.mul"(%437, %432) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %439 = "tosa.add"(%438, %433) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %440 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %441 = "tosa.matmul"(%439, %440) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %442 = "tosa.reshape"(%441) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %443 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %444 = "tosa.add"(%443, %442) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %445 = "tosa.reshape"(%444) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %446 = "tosa.mul"(%445, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %447 = "tosa.pow"(%445, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %448 = "tosa.mul"(%447, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %449 = "tosa.add"(%445, %448) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %450 = "tosa.mul"(%449, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %451 = "tosa.tanh"(%450) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %452 = "tosa.add"(%451, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %453 = "tosa.mul"(%446, %452) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %454 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %455 = "tosa.matmul"(%453, %454) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %456 = "tosa.reshape"(%455) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %457 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %458 = "tosa.add"(%457, %456) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %459 = "tosa.reshape"(%458) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %460 = "tosa.add"(%422, %459) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %461 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %462 = "tosa.reduce_sum"(%460) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %463 = "tosa.reshape"(%461) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %464 = "tosa.mul"(%462, %463) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %465 = "tosa.sub"(%460, %464) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %466 = "tosa.mul"(%465, %465) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %467 = "tosa.reduce_sum"(%466) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %468 = "tosa.reshape"(%461) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %469 = "tosa.mul"(%467, %468) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %470 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %471 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %472 = "tosa.sub"(%460, %464) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %473 = "tosa.add"(%469, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %474 = "tosa.rsqrt"(%473) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %475 = "tosa.mul"(%472, %474) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %476 = "tosa.mul"(%475, %470) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %477 = "tosa.add"(%476, %471) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %478 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %479 = "tosa.matmul"(%477, %478) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %480 = "tosa.reshape"(%479) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %481 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %482 = "tosa.add"(%481, %480) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %483 = "tosa.reshape"(%482) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %484 = "tosa.slice"(%483) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %485 = "tosa.slice"(%483) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %486 = "tosa.slice"(%483) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %487 = "tosa.reshape"(%484) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %488 = "tosa.transpose"(%487, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %489 = "tosa.reshape"(%485) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %490 = "tosa.transpose"(%489, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %491 = "tosa.reshape"(%486) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %492 = "tosa.transpose"(%491, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %493 = "tosa.transpose"(%490, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %494 = "tosa.reshape"(%488) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %495 = "tosa.reshape"(%493) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %496 = "tosa.matmul"(%494, %495) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %497 = "tosa.reshape"(%496) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %498 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %499 = "tosa.reshape"(%498) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %500 = "tosa.mul"(%497, %499) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %501 = torch_c.from_builtin_tensor %500 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %502 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %503 = "tosa.slice"(%502) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %504 = torch_c.from_builtin_tensor %503 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %505 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %506 = torch.aten.to.dtype %505, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %507 = torch.valsem.aten.copy %506, %504, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %508 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %509 = torch.aten.where.self %507, %501, %508 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %510 = torch_c.to_builtin_tensor %509 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %511 = "tosa.reduce_max"(%510) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %512 = "tosa.sub"(%510, %511) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %513 = "tosa.exp"(%512) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %514 = "tosa.reduce_sum"(%513) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %515 = "tosa.reciprocal"(%514) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %516 = "tosa.mul"(%513, %515) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %517 = "tosa.reshape"(%516) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %518 = "tosa.reshape"(%492) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %519 = "tosa.matmul"(%517, %518) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %520 = "tosa.reshape"(%519) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %521 = "tosa.transpose"(%520, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %522 = "tosa.reshape"(%521) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %523 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %524 = "tosa.matmul"(%522, %523) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %525 = "tosa.reshape"(%524) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %526 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %527 = "tosa.add"(%526, %525) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %528 = "tosa.reshape"(%527) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %529 = "tosa.add"(%528, %460) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %530 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %531 = "tosa.reduce_sum"(%529) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %532 = "tosa.reshape"(%530) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %533 = "tosa.mul"(%531, %532) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %534 = "tosa.sub"(%529, %533) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %535 = "tosa.mul"(%534, %534) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %536 = "tosa.reduce_sum"(%535) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %537 = "tosa.reshape"(%530) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %538 = "tosa.mul"(%536, %537) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %539 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %540 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %541 = "tosa.sub"(%529, %533) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %542 = "tosa.add"(%538, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %543 = "tosa.rsqrt"(%542) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %544 = "tosa.mul"(%541, %543) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.mul"(%544, %539) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %546 = "tosa.add"(%545, %540) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %547 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %548 = "tosa.matmul"(%546, %547) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %549 = "tosa.reshape"(%548) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %550 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %551 = "tosa.add"(%550, %549) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %552 = "tosa.reshape"(%551) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %553 = "tosa.mul"(%552, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %554 = "tosa.pow"(%552, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %555 = "tosa.mul"(%554, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %556 = "tosa.add"(%552, %555) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %557 = "tosa.mul"(%556, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %558 = "tosa.tanh"(%557) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %559 = "tosa.add"(%558, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %560 = "tosa.mul"(%553, %559) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %561 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %562 = "tosa.matmul"(%560, %561) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %563 = "tosa.reshape"(%562) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %564 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %565 = "tosa.add"(%564, %563) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %566 = "tosa.reshape"(%565) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %567 = "tosa.add"(%529, %566) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %568 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %569 = "tosa.reduce_sum"(%567) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %570 = "tosa.reshape"(%568) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %571 = "tosa.mul"(%569, %570) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %572 = "tosa.sub"(%567, %571) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %573 = "tosa.mul"(%572, %572) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %574 = "tosa.reduce_sum"(%573) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %575 = "tosa.reshape"(%568) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %576 = "tosa.mul"(%574, %575) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %577 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %578 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %579 = "tosa.sub"(%567, %571) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %580 = "tosa.add"(%576, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %581 = "tosa.rsqrt"(%580) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %582 = "tosa.mul"(%579, %581) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %583 = "tosa.mul"(%582, %577) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %584 = "tosa.add"(%583, %578) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %585 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %586 = "tosa.matmul"(%584, %585) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %587 = "tosa.reshape"(%586) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %588 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %589 = "tosa.add"(%588, %587) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %590 = "tosa.reshape"(%589) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %591 = "tosa.slice"(%590) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %592 = "tosa.slice"(%590) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %593 = "tosa.slice"(%590) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %594 = "tosa.reshape"(%591) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %595 = "tosa.transpose"(%594, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %596 = "tosa.reshape"(%592) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %597 = "tosa.transpose"(%596, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %598 = "tosa.reshape"(%593) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %599 = "tosa.transpose"(%598, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %600 = "tosa.transpose"(%597, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %601 = "tosa.reshape"(%595) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %602 = "tosa.reshape"(%600) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %603 = "tosa.matmul"(%601, %602) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %604 = "tosa.reshape"(%603) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %605 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %606 = "tosa.reshape"(%605) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %607 = "tosa.mul"(%604, %606) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %608 = torch_c.from_builtin_tensor %607 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %609 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %610 = "tosa.slice"(%609) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %611 = torch_c.from_builtin_tensor %610 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %612 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %613 = torch.aten.to.dtype %612, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %614 = torch.valsem.aten.copy %613, %611, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %615 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %616 = torch.aten.where.self %614, %608, %615 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %617 = torch_c.to_builtin_tensor %616 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %618 = "tosa.reduce_max"(%617) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %619 = "tosa.sub"(%617, %618) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %620 = "tosa.exp"(%619) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %621 = "tosa.reduce_sum"(%620) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %622 = "tosa.reciprocal"(%621) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %623 = "tosa.mul"(%620, %622) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %624 = "tosa.reshape"(%623) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %625 = "tosa.reshape"(%599) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %626 = "tosa.matmul"(%624, %625) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %627 = "tosa.reshape"(%626) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %628 = "tosa.transpose"(%627, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %629 = "tosa.reshape"(%628) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %630 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %631 = "tosa.matmul"(%629, %630) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %632 = "tosa.reshape"(%631) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %633 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %634 = "tosa.add"(%633, %632) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %635 = "tosa.reshape"(%634) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %636 = "tosa.add"(%635, %567) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %637 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %638 = "tosa.reduce_sum"(%636) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %639 = "tosa.reshape"(%637) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %640 = "tosa.mul"(%638, %639) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %641 = "tosa.sub"(%636, %640) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %642 = "tosa.mul"(%641, %641) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %643 = "tosa.reduce_sum"(%642) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %644 = "tosa.reshape"(%637) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %645 = "tosa.mul"(%643, %644) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %646 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %647 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %648 = "tosa.sub"(%636, %640) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %649 = "tosa.add"(%645, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %650 = "tosa.rsqrt"(%649) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %651 = "tosa.mul"(%648, %650) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %652 = "tosa.mul"(%651, %646) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %653 = "tosa.add"(%652, %647) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %654 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %655 = "tosa.matmul"(%653, %654) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %656 = "tosa.reshape"(%655) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %657 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %658 = "tosa.add"(%657, %656) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %659 = "tosa.reshape"(%658) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %660 = "tosa.mul"(%659, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %661 = "tosa.pow"(%659, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %662 = "tosa.mul"(%661, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %663 = "tosa.add"(%659, %662) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %664 = "tosa.mul"(%663, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %665 = "tosa.tanh"(%664) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %666 = "tosa.add"(%665, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %667 = "tosa.mul"(%660, %666) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %668 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %669 = "tosa.matmul"(%667, %668) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %670 = "tosa.reshape"(%669) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %671 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %672 = "tosa.add"(%671, %670) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %673 = "tosa.reshape"(%672) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %674 = "tosa.add"(%636, %673) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %675 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %676 = "tosa.reduce_sum"(%674) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %677 = "tosa.reshape"(%675) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %678 = "tosa.mul"(%676, %677) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %679 = "tosa.sub"(%674, %678) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %680 = "tosa.mul"(%679, %679) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %681 = "tosa.reduce_sum"(%680) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %682 = "tosa.reshape"(%675) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %683 = "tosa.mul"(%681, %682) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %684 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %685 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %686 = "tosa.sub"(%674, %678) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %687 = "tosa.add"(%683, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %688 = "tosa.rsqrt"(%687) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %689 = "tosa.mul"(%686, %688) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %690 = "tosa.mul"(%689, %684) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %691 = "tosa.add"(%690, %685) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %692 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %693 = "tosa.matmul"(%691, %692) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %694 = "tosa.reshape"(%693) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %695 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %696 = "tosa.add"(%695, %694) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %697 = "tosa.reshape"(%696) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %698 = "tosa.slice"(%697) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %699 = "tosa.slice"(%697) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %700 = "tosa.slice"(%697) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %701 = "tosa.reshape"(%698) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %702 = "tosa.transpose"(%701, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %703 = "tosa.reshape"(%699) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %704 = "tosa.transpose"(%703, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %705 = "tosa.reshape"(%700) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %706 = "tosa.transpose"(%705, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %707 = "tosa.transpose"(%704, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %708 = "tosa.reshape"(%702) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %709 = "tosa.reshape"(%707) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %710 = "tosa.matmul"(%708, %709) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %711 = "tosa.reshape"(%710) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %712 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %713 = "tosa.reshape"(%712) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %714 = "tosa.mul"(%711, %713) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %715 = torch_c.from_builtin_tensor %714 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %716 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %717 = "tosa.slice"(%716) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %718 = torch_c.from_builtin_tensor %717 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %719 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %720 = torch.aten.to.dtype %719, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %721 = torch.valsem.aten.copy %720, %718, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %722 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %723 = torch.aten.where.self %721, %715, %722 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %724 = torch_c.to_builtin_tensor %723 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %725 = "tosa.reduce_max"(%724) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %726 = "tosa.sub"(%724, %725) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %727 = "tosa.exp"(%726) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %728 = "tosa.reduce_sum"(%727) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %729 = "tosa.reciprocal"(%728) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %730 = "tosa.mul"(%727, %729) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %731 = "tosa.reshape"(%730) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %732 = "tosa.reshape"(%706) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %733 = "tosa.matmul"(%731, %732) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %734 = "tosa.reshape"(%733) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %735 = "tosa.transpose"(%734, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %736 = "tosa.reshape"(%735) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %737 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %738 = "tosa.matmul"(%736, %737) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %739 = "tosa.reshape"(%738) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %740 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %741 = "tosa.add"(%740, %739) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %742 = "tosa.reshape"(%741) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %743 = "tosa.add"(%742, %674) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %744 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %745 = "tosa.reduce_sum"(%743) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %746 = "tosa.reshape"(%744) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %747 = "tosa.mul"(%745, %746) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %748 = "tosa.sub"(%743, %747) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %749 = "tosa.mul"(%748, %748) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %750 = "tosa.reduce_sum"(%749) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %751 = "tosa.reshape"(%744) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %752 = "tosa.mul"(%750, %751) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %753 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %754 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %755 = "tosa.sub"(%743, %747) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %756 = "tosa.add"(%752, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %757 = "tosa.rsqrt"(%756) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %758 = "tosa.mul"(%755, %757) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %759 = "tosa.mul"(%758, %753) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %760 = "tosa.add"(%759, %754) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %761 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %762 = "tosa.matmul"(%760, %761) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %763 = "tosa.reshape"(%762) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %764 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %765 = "tosa.add"(%764, %763) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %766 = "tosa.reshape"(%765) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %767 = "tosa.mul"(%766, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %768 = "tosa.pow"(%766, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %769 = "tosa.mul"(%768, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %770 = "tosa.add"(%766, %769) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %771 = "tosa.mul"(%770, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %772 = "tosa.tanh"(%771) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %773 = "tosa.add"(%772, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %774 = "tosa.mul"(%767, %773) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %775 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %776 = "tosa.matmul"(%774, %775) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %777 = "tosa.reshape"(%776) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %778 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %779 = "tosa.add"(%778, %777) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %780 = "tosa.reshape"(%779) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %781 = "tosa.add"(%743, %780) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %782 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %783 = "tosa.reduce_sum"(%781) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %784 = "tosa.reshape"(%782) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %785 = "tosa.mul"(%783, %784) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %786 = "tosa.sub"(%781, %785) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %787 = "tosa.mul"(%786, %786) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %788 = "tosa.reduce_sum"(%787) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %789 = "tosa.reshape"(%782) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %790 = "tosa.mul"(%788, %789) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %791 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %792 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %793 = "tosa.sub"(%781, %785) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %794 = "tosa.add"(%790, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %795 = "tosa.rsqrt"(%794) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %796 = "tosa.mul"(%793, %795) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %797 = "tosa.mul"(%796, %791) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %798 = "tosa.add"(%797, %792) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %799 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %800 = "tosa.matmul"(%798, %799) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %801 = "tosa.reshape"(%800) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %802 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %803 = "tosa.add"(%802, %801) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %804 = "tosa.reshape"(%803) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %805 = "tosa.slice"(%804) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %806 = "tosa.slice"(%804) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %807 = "tosa.slice"(%804) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %808 = "tosa.reshape"(%805) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %809 = "tosa.transpose"(%808, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %810 = "tosa.reshape"(%806) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %811 = "tosa.transpose"(%810, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %812 = "tosa.reshape"(%807) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %813 = "tosa.transpose"(%812, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %814 = "tosa.transpose"(%811, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %815 = "tosa.reshape"(%809) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %816 = "tosa.reshape"(%814) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %817 = "tosa.matmul"(%815, %816) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %818 = "tosa.reshape"(%817) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %819 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %820 = "tosa.reshape"(%819) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %821 = "tosa.mul"(%818, %820) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %822 = torch_c.from_builtin_tensor %821 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %823 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %824 = "tosa.slice"(%823) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %825 = torch_c.from_builtin_tensor %824 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %826 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %827 = torch.aten.to.dtype %826, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %828 = torch.valsem.aten.copy %827, %825, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %829 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %830 = torch.aten.where.self %828, %822, %829 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %831 = torch_c.to_builtin_tensor %830 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %832 = "tosa.reduce_max"(%831) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %833 = "tosa.sub"(%831, %832) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %834 = "tosa.exp"(%833) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %835 = "tosa.reduce_sum"(%834) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %836 = "tosa.reciprocal"(%835) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %837 = "tosa.mul"(%834, %836) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %838 = "tosa.reshape"(%837) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %839 = "tosa.reshape"(%813) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %840 = "tosa.matmul"(%838, %839) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %841 = "tosa.reshape"(%840) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %842 = "tosa.transpose"(%841, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %843 = "tosa.reshape"(%842) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %844 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %845 = "tosa.matmul"(%843, %844) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %846 = "tosa.reshape"(%845) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %847 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %848 = "tosa.add"(%847, %846) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %849 = "tosa.reshape"(%848) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %850 = "tosa.add"(%849, %781) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %851 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %852 = "tosa.reduce_sum"(%850) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %853 = "tosa.reshape"(%851) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %854 = "tosa.mul"(%852, %853) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %855 = "tosa.sub"(%850, %854) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %856 = "tosa.mul"(%855, %855) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %857 = "tosa.reduce_sum"(%856) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %858 = "tosa.reshape"(%851) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %859 = "tosa.mul"(%857, %858) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %860 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %861 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %862 = "tosa.sub"(%850, %854) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %863 = "tosa.add"(%859, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %864 = "tosa.rsqrt"(%863) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %865 = "tosa.mul"(%862, %864) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %866 = "tosa.mul"(%865, %860) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %867 = "tosa.add"(%866, %861) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %868 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %869 = "tosa.matmul"(%867, %868) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %870 = "tosa.reshape"(%869) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %871 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %872 = "tosa.add"(%871, %870) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %873 = "tosa.reshape"(%872) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %874 = "tosa.mul"(%873, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %875 = "tosa.pow"(%873, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %876 = "tosa.mul"(%875, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %877 = "tosa.add"(%873, %876) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %878 = "tosa.mul"(%877, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %879 = "tosa.tanh"(%878) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %880 = "tosa.add"(%879, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %881 = "tosa.mul"(%874, %880) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %882 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %883 = "tosa.matmul"(%881, %882) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %884 = "tosa.reshape"(%883) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %885 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %886 = "tosa.add"(%885, %884) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %887 = "tosa.reshape"(%886) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %888 = "tosa.add"(%850, %887) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %889 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %890 = "tosa.reduce_sum"(%888) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %891 = "tosa.reshape"(%889) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %892 = "tosa.mul"(%890, %891) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %893 = "tosa.sub"(%888, %892) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %894 = "tosa.mul"(%893, %893) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %895 = "tosa.reduce_sum"(%894) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %896 = "tosa.reshape"(%889) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %897 = "tosa.mul"(%895, %896) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %898 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %899 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %900 = "tosa.sub"(%888, %892) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %901 = "tosa.add"(%897, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %902 = "tosa.rsqrt"(%901) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %903 = "tosa.mul"(%900, %902) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %904 = "tosa.mul"(%903, %898) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %905 = "tosa.add"(%904, %899) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %906 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %907 = "tosa.matmul"(%905, %906) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %908 = "tosa.reshape"(%907) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %909 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %910 = "tosa.add"(%909, %908) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %911 = "tosa.reshape"(%910) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %912 = "tosa.slice"(%911) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %913 = "tosa.slice"(%911) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %914 = "tosa.slice"(%911) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %915 = "tosa.reshape"(%912) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %916 = "tosa.transpose"(%915, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %917 = "tosa.reshape"(%913) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %918 = "tosa.transpose"(%917, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %919 = "tosa.reshape"(%914) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %920 = "tosa.transpose"(%919, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %921 = "tosa.transpose"(%918, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %922 = "tosa.reshape"(%916) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %923 = "tosa.reshape"(%921) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %924 = "tosa.matmul"(%922, %923) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %925 = "tosa.reshape"(%924) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %926 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %927 = "tosa.reshape"(%926) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %928 = "tosa.mul"(%925, %927) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %929 = torch_c.from_builtin_tensor %928 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %930 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %931 = "tosa.slice"(%930) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %932 = torch_c.from_builtin_tensor %931 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %933 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %934 = torch.aten.to.dtype %933, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %935 = torch.valsem.aten.copy %934, %932, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %936 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %937 = torch.aten.where.self %935, %929, %936 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %938 = torch_c.to_builtin_tensor %937 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %939 = "tosa.reduce_max"(%938) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %940 = "tosa.sub"(%938, %939) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %941 = "tosa.exp"(%940) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %942 = "tosa.reduce_sum"(%941) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %943 = "tosa.reciprocal"(%942) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %944 = "tosa.mul"(%941, %943) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %945 = "tosa.reshape"(%944) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %946 = "tosa.reshape"(%920) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %947 = "tosa.matmul"(%945, %946) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %948 = "tosa.reshape"(%947) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %949 = "tosa.transpose"(%948, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %950 = "tosa.reshape"(%949) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %951 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %952 = "tosa.matmul"(%950, %951) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %953 = "tosa.reshape"(%952) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %954 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %955 = "tosa.add"(%954, %953) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %956 = "tosa.reshape"(%955) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %957 = "tosa.add"(%956, %888) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %958 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %959 = "tosa.reduce_sum"(%957) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %960 = "tosa.reshape"(%958) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %961 = "tosa.mul"(%959, %960) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %962 = "tosa.sub"(%957, %961) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %963 = "tosa.mul"(%962, %962) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %964 = "tosa.reduce_sum"(%963) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %965 = "tosa.reshape"(%958) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %966 = "tosa.mul"(%964, %965) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %967 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %968 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %969 = "tosa.sub"(%957, %961) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %970 = "tosa.add"(%966, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %971 = "tosa.rsqrt"(%970) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %972 = "tosa.mul"(%969, %971) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %973 = "tosa.mul"(%972, %967) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %974 = "tosa.add"(%973, %968) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %975 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %976 = "tosa.matmul"(%974, %975) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %977 = "tosa.reshape"(%976) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %978 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %979 = "tosa.add"(%978, %977) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %980 = "tosa.reshape"(%979) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %981 = "tosa.mul"(%980, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %982 = "tosa.pow"(%980, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %983 = "tosa.mul"(%982, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %984 = "tosa.add"(%980, %983) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %985 = "tosa.mul"(%984, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %986 = "tosa.tanh"(%985) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %987 = "tosa.add"(%986, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %988 = "tosa.mul"(%981, %987) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %989 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %990 = "tosa.matmul"(%988, %989) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %991 = "tosa.reshape"(%990) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %992 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %993 = "tosa.add"(%992, %991) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %994 = "tosa.reshape"(%993) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %995 = "tosa.add"(%957, %994) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %996 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %997 = "tosa.reduce_sum"(%995) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %998 = "tosa.reshape"(%996) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %999 = "tosa.mul"(%997, %998) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1000 = "tosa.sub"(%995, %999) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1001 = "tosa.mul"(%1000, %1000) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1002 = "tosa.reduce_sum"(%1001) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1003 = "tosa.reshape"(%996) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1004 = "tosa.mul"(%1002, %1003) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1005 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1006 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1007 = "tosa.sub"(%995, %999) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1008 = "tosa.add"(%1004, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1009 = "tosa.rsqrt"(%1008) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1010 = "tosa.mul"(%1007, %1009) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1011 = "tosa.mul"(%1010, %1005) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1012 = "tosa.add"(%1011, %1006) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1013 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1014 = "tosa.matmul"(%1012, %1013) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1015 = "tosa.reshape"(%1014) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1016 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1017 = "tosa.add"(%1016, %1015) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1018 = "tosa.reshape"(%1017) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1019 = "tosa.slice"(%1018) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1020 = "tosa.slice"(%1018) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1021 = "tosa.slice"(%1018) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1022 = "tosa.reshape"(%1019) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1023 = "tosa.transpose"(%1022, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1024 = "tosa.reshape"(%1020) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1025 = "tosa.transpose"(%1024, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1026 = "tosa.reshape"(%1021) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1027 = "tosa.transpose"(%1026, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1028 = "tosa.transpose"(%1025, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1029 = "tosa.reshape"(%1023) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1030 = "tosa.reshape"(%1028) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1031 = "tosa.matmul"(%1029, %1030) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1032 = "tosa.reshape"(%1031) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1033 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %1034 = "tosa.reshape"(%1033) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1035 = "tosa.mul"(%1032, %1034) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1036 = torch_c.from_builtin_tensor %1035 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1037 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1038 = "tosa.slice"(%1037) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1039 = torch_c.from_builtin_tensor %1038 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1040 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1041 = torch.aten.to.dtype %1040, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1042 = torch.valsem.aten.copy %1041, %1039, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1043 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %1044 = torch.aten.where.self %1042, %1036, %1043 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1045 = torch_c.to_builtin_tensor %1044 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1046 = "tosa.reduce_max"(%1045) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1047 = "tosa.sub"(%1045, %1046) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1048 = "tosa.exp"(%1047) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1049 = "tosa.reduce_sum"(%1048) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1050 = "tosa.reciprocal"(%1049) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1051 = "tosa.mul"(%1048, %1050) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1052 = "tosa.reshape"(%1051) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1053 = "tosa.reshape"(%1027) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1054 = "tosa.matmul"(%1052, %1053) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1055 = "tosa.reshape"(%1054) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1056 = "tosa.transpose"(%1055, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1057 = "tosa.reshape"(%1056) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1058 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1059 = "tosa.matmul"(%1057, %1058) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1060 = "tosa.reshape"(%1059) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1061 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1062 = "tosa.add"(%1061, %1060) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1063 = "tosa.reshape"(%1062) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1064 = "tosa.add"(%1063, %995) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1065 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1066 = "tosa.reduce_sum"(%1064) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1067 = "tosa.reshape"(%1065) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1068 = "tosa.mul"(%1066, %1067) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1069 = "tosa.sub"(%1064, %1068) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1070 = "tosa.mul"(%1069, %1069) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1071 = "tosa.reduce_sum"(%1070) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1072 = "tosa.reshape"(%1065) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1073 = "tosa.mul"(%1071, %1072) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1074 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1075 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1076 = "tosa.sub"(%1064, %1068) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1077 = "tosa.add"(%1073, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1078 = "tosa.rsqrt"(%1077) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1079 = "tosa.mul"(%1076, %1078) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1080 = "tosa.mul"(%1079, %1074) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1081 = "tosa.add"(%1080, %1075) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1082 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1083 = "tosa.matmul"(%1081, %1082) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1084 = "tosa.reshape"(%1083) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1085 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1086 = "tosa.add"(%1085, %1084) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1087 = "tosa.reshape"(%1086) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1088 = "tosa.mul"(%1087, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1089 = "tosa.pow"(%1087, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1090 = "tosa.mul"(%1089, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1091 = "tosa.add"(%1087, %1090) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1092 = "tosa.mul"(%1091, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1093 = "tosa.tanh"(%1092) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1094 = "tosa.add"(%1093, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1095 = "tosa.mul"(%1088, %1094) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1096 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1097 = "tosa.matmul"(%1095, %1096) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1098 = "tosa.reshape"(%1097) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1099 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1100 = "tosa.add"(%1099, %1098) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1101 = "tosa.reshape"(%1100) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1102 = "tosa.add"(%1064, %1101) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1103 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1104 = "tosa.reduce_sum"(%1102) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1105 = "tosa.reshape"(%1103) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1106 = "tosa.mul"(%1104, %1105) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1107 = "tosa.sub"(%1102, %1106) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1108 = "tosa.mul"(%1107, %1107) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1109 = "tosa.reduce_sum"(%1108) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1110 = "tosa.reshape"(%1103) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1111 = "tosa.mul"(%1109, %1110) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1112 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1113 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1114 = "tosa.sub"(%1102, %1106) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1115 = "tosa.add"(%1111, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1116 = "tosa.rsqrt"(%1115) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1117 = "tosa.mul"(%1114, %1116) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1118 = "tosa.mul"(%1117, %1112) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1119 = "tosa.add"(%1118, %1113) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1120 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1121 = "tosa.matmul"(%1119, %1120) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1122 = "tosa.reshape"(%1121) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1123 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1124 = "tosa.add"(%1123, %1122) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1125 = "tosa.reshape"(%1124) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1126 = "tosa.slice"(%1125) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1127 = "tosa.slice"(%1125) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1128 = "tosa.slice"(%1125) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1129 = "tosa.reshape"(%1126) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1130 = "tosa.transpose"(%1129, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1131 = "tosa.reshape"(%1127) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1132 = "tosa.transpose"(%1131, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1133 = "tosa.reshape"(%1128) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1134 = "tosa.transpose"(%1133, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1135 = "tosa.transpose"(%1132, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1136 = "tosa.reshape"(%1130) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1137 = "tosa.reshape"(%1135) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1138 = "tosa.matmul"(%1136, %1137) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1139 = "tosa.reshape"(%1138) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1140 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %1141 = "tosa.reshape"(%1140) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1142 = "tosa.mul"(%1139, %1141) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1143 = torch_c.from_builtin_tensor %1142 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1144 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1145 = "tosa.slice"(%1144) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1146 = torch_c.from_builtin_tensor %1145 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1147 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1148 = torch.aten.to.dtype %1147, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1149 = torch.valsem.aten.copy %1148, %1146, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1150 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %1151 = torch.aten.where.self %1149, %1143, %1150 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1152 = torch_c.to_builtin_tensor %1151 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1153 = "tosa.reduce_max"(%1152) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1154 = "tosa.sub"(%1152, %1153) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1155 = "tosa.exp"(%1154) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1156 = "tosa.reduce_sum"(%1155) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1157 = "tosa.reciprocal"(%1156) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1158 = "tosa.mul"(%1155, %1157) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1159 = "tosa.reshape"(%1158) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1160 = "tosa.reshape"(%1134) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1161 = "tosa.matmul"(%1159, %1160) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1162 = "tosa.reshape"(%1161) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1163 = "tosa.transpose"(%1162, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1164 = "tosa.reshape"(%1163) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1165 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1166 = "tosa.matmul"(%1164, %1165) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1167 = "tosa.reshape"(%1166) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1168 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1169 = "tosa.add"(%1168, %1167) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1170 = "tosa.reshape"(%1169) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1171 = "tosa.add"(%1170, %1102) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1172 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1173 = "tosa.reduce_sum"(%1171) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1174 = "tosa.reshape"(%1172) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1175 = "tosa.mul"(%1173, %1174) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1176 = "tosa.sub"(%1171, %1175) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1177 = "tosa.mul"(%1176, %1176) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1178 = "tosa.reduce_sum"(%1177) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1179 = "tosa.reshape"(%1172) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1180 = "tosa.mul"(%1178, %1179) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1181 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1182 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1183 = "tosa.sub"(%1171, %1175) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1184 = "tosa.add"(%1180, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1185 = "tosa.rsqrt"(%1184) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1186 = "tosa.mul"(%1183, %1185) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1187 = "tosa.mul"(%1186, %1181) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1188 = "tosa.add"(%1187, %1182) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1189 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1190 = "tosa.matmul"(%1188, %1189) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1191 = "tosa.reshape"(%1190) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1192 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1193 = "tosa.add"(%1192, %1191) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1194 = "tosa.reshape"(%1193) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1195 = "tosa.mul"(%1194, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1196 = "tosa.pow"(%1194, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1197 = "tosa.mul"(%1196, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1198 = "tosa.add"(%1194, %1197) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1199 = "tosa.mul"(%1198, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1200 = "tosa.tanh"(%1199) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1201 = "tosa.add"(%1200, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1202 = "tosa.mul"(%1195, %1201) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1203 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1204 = "tosa.matmul"(%1202, %1203) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1205 = "tosa.reshape"(%1204) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1206 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1207 = "tosa.add"(%1206, %1205) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1208 = "tosa.reshape"(%1207) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1209 = "tosa.add"(%1171, %1208) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1210 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1211 = "tosa.reduce_sum"(%1209) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1212 = "tosa.reshape"(%1210) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1213 = "tosa.mul"(%1211, %1212) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1214 = "tosa.sub"(%1209, %1213) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1215 = "tosa.mul"(%1214, %1214) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1216 = "tosa.reduce_sum"(%1215) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1217 = "tosa.reshape"(%1210) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1218 = "tosa.mul"(%1216, %1217) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1219 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1220 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1221 = "tosa.sub"(%1209, %1213) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1222 = "tosa.add"(%1218, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1223 = "tosa.rsqrt"(%1222) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1224 = "tosa.mul"(%1221, %1223) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1225 = "tosa.mul"(%1224, %1219) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1226 = "tosa.add"(%1225, %1220) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1227 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %1228 = "tosa.matmul"(%1226, %1227) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %1229 = "tosa.reshape"(%1228) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %1230 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %1231 = "tosa.add"(%1230, %1229) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %1232 = "tosa.reshape"(%1231) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %1233 = "tosa.slice"(%1232) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1234 = "tosa.slice"(%1232) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1235 = "tosa.slice"(%1232) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %1236 = "tosa.reshape"(%1233) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1237 = "tosa.transpose"(%1236, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1238 = "tosa.reshape"(%1234) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1239 = "tosa.transpose"(%1238, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1240 = "tosa.reshape"(%1235) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %1241 = "tosa.transpose"(%1240, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %1242 = "tosa.transpose"(%1239, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %1243 = "tosa.reshape"(%1237) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1244 = "tosa.reshape"(%1242) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %1245 = "tosa.matmul"(%1243, %1244) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %1246 = "tosa.reshape"(%1245) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1247 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %1248 = "tosa.reshape"(%1247) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %1249 = "tosa.mul"(%1246, %1248) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %1250 = torch_c.from_builtin_tensor %1249 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %1251 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %1252 = "tosa.slice"(%1251) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %1253 = torch_c.from_builtin_tensor %1252 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %1254 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %1255 = torch.aten.to.dtype %1254, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %1256 = torch.valsem.aten.copy %1255, %1253, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %1257 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %1258 = torch.aten.where.self %1256, %1250, %1257 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %1259 = torch_c.to_builtin_tensor %1258 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %1260 = "tosa.reduce_max"(%1259) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1261 = "tosa.sub"(%1259, %1260) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1262 = "tosa.exp"(%1261) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %1263 = "tosa.reduce_sum"(%1262) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %1264 = "tosa.reciprocal"(%1263) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %1265 = "tosa.mul"(%1262, %1264) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %1266 = "tosa.reshape"(%1265) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %1267 = "tosa.reshape"(%1241) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %1268 = "tosa.matmul"(%1266, %1267) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %1269 = "tosa.reshape"(%1268) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %1270 = "tosa.transpose"(%1269, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %1271 = "tosa.reshape"(%1270) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %1272 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %1273 = "tosa.matmul"(%1271, %1272) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %1274 = "tosa.reshape"(%1273) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1275 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1276 = "tosa.add"(%1275, %1274) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1277 = "tosa.reshape"(%1276) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1278 = "tosa.add"(%1277, %1209) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1279 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1280 = "tosa.reduce_sum"(%1278) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1281 = "tosa.reshape"(%1279) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1282 = "tosa.mul"(%1280, %1281) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1283 = "tosa.sub"(%1278, %1282) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1284 = "tosa.mul"(%1283, %1283) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1285 = "tosa.reduce_sum"(%1284) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1286 = "tosa.reshape"(%1279) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1287 = "tosa.mul"(%1285, %1286) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1288 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1289 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1290 = "tosa.sub"(%1278, %1282) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1291 = "tosa.add"(%1287, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1292 = "tosa.rsqrt"(%1291) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1293 = "tosa.mul"(%1290, %1292) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1294 = "tosa.mul"(%1293, %1288) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1295 = "tosa.add"(%1294, %1289) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1296 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %1297 = "tosa.matmul"(%1295, %1296) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1298 = "tosa.reshape"(%1297) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1299 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %1300 = "tosa.add"(%1299, %1298) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1301 = "tosa.reshape"(%1300) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1302 = "tosa.mul"(%1301, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1303 = "tosa.pow"(%1301, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1304 = "tosa.mul"(%1303, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1305 = "tosa.add"(%1301, %1304) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1306 = "tosa.mul"(%1305, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1307 = "tosa.tanh"(%1306) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1308 = "tosa.add"(%1307, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1309 = "tosa.mul"(%1302, %1308) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1310 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %1311 = "tosa.matmul"(%1309, %1310) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1312 = "tosa.reshape"(%1311) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1313 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %1314 = "tosa.add"(%1313, %1312) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1315 = "tosa.reshape"(%1314) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1316 = "tosa.add"(%1278, %1315) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1317 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %1318 = "tosa.reduce_sum"(%1316) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1319 = "tosa.reshape"(%1317) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1320 = "tosa.mul"(%1318, %1319) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1321 = "tosa.sub"(%1316, %1320) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1322 = "tosa.mul"(%1321, %1321) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1323 = "tosa.reduce_sum"(%1322) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1324 = "tosa.reshape"(%1317) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %1325 = "tosa.mul"(%1323, %1324) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1326 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1327 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %1328 = "tosa.sub"(%1316, %1320) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1329 = "tosa.add"(%1325, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1330 = "tosa.rsqrt"(%1329) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1331 = "tosa.mul"(%1328, %1330) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1332 = "tosa.mul"(%1331, %1326) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1333 = "tosa.add"(%1332, %1327) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1334 = "tosa.transpose"(%10, %6) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
  %1335 = "tosa.reshape"(%1334) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
  %1336 = "tosa.matmul"(%1333, %1335) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
  %1337 = torch_c.from_builtin_tensor %1336 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
  return %1337 : !torch.vtensor<[1,5,50257],f32>
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @forward(%arg0: !torch.vtensor<[1,5],si64>) -> !torch.vtensor<[1,5,50257],f32> {
  %0 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %1 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %2 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %3 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %4 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %5 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %6 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %7 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %8 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %9 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %int0 = torch.constant.int 0
  %int4 = torch.constant.int 4
  %int11 = torch.constant.int 11
  %none = torch.constant.none
  %false = torch.constant.bool false
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %12 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %13 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %14 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %15 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %17 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %19 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %21 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %int1 = torch.constant.int 1
  %int5 = torch.constant.int 5
  %22 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %cpu = torch.constant.device "cpu"
  %23 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %24 = torch_c.to_builtin_tensor %23 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %25 = "tosa.reshape"(%24) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %26 = "tosa.reshape"(%10) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %27 = "tosa.cast"(%22) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %28 = "tosa.gather"(%26, %27) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %29 = "tosa.reshape"(%11) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %30 = "tosa.cast"(%25) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %31 = "tosa.gather"(%29, %30) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %32 = "tosa.add"(%28, %31) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %33 = "tosa.reciprocal"(%9) : (tensor<1xf32>) -> tensor<1xf32>
  %34 = "tosa.reduce_sum"(%32) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %35 = "tosa.reshape"(%33) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %36 = "tosa.mul"(%34, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %37 = "tosa.sub"(%32, %36) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %38 = "tosa.mul"(%37, %37) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %39 = "tosa.reduce_sum"(%38) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %40 = "tosa.mul"(%39, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %41 = "tosa.reshape"(%21) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %42 = "tosa.add"(%40, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %43 = "tosa.rsqrt"(%42) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %44 = "tosa.mul"(%37, %43) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %45 = "tosa.mul"(%44, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %46 = "tosa.add"(%45, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %47 = "tosa.reshape"(%16) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %48 = "tosa.matmul"(%46, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %49 = "tosa.reshape"(%48) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %50 = "tosa.reshape"(%15) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %51 = "tosa.add"(%50, %49) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %52 = "tosa.reshape"(%51) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %53 = "tosa.slice"(%52) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %54 = "tosa.slice"(%52) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %55 = "tosa.slice"(%52) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %56 = "tosa.reshape"(%53) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %57 = "tosa.transpose"(%56, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %58 = "tosa.reshape"(%54) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %59 = "tosa.transpose"(%58, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %60 = "tosa.reshape"(%55) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %61 = "tosa.transpose"(%60, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %62 = "tosa.transpose"(%59, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %63 = "tosa.reshape"(%57) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %64 = "tosa.reshape"(%62) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %65 = "tosa.matmul"(%63, %64) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %66 = "tosa.reshape"(%65) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %67 = "tosa.reciprocal"(%12) : (tensor<f32>) -> tensor<f32>
  %68 = "tosa.reshape"(%67) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %69 = "tosa.mul"(%66, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %70 = torch_c.from_builtin_tensor %69 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %71 = "tosa.slice"(%13) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %72 = "tosa.slice"(%71) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %73 = torch_c.from_builtin_tensor %72 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %74 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %75 = torch.aten.to.dtype %74, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %76 = torch.valsem.aten.copy %75, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %77 = torch_c.from_builtin_tensor %14 : tensor<f32> -> !torch.vtensor<[],f32>
  %78 = torch.aten.where.self %76, %70, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %79 = torch_c.to_builtin_tensor %78 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %80 = "tosa.reduce_max"(%79) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %81 = "tosa.sub"(%79, %80) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %82 = "tosa.exp"(%81) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %83 = "tosa.reduce_sum"(%82) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %84 = "tosa.reciprocal"(%83) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %85 = "tosa.mul"(%82, %84) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %86 = "tosa.reshape"(%85) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %87 = "tosa.reshape"(%61) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %88 = "tosa.matmul"(%86, %87) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %89 = "tosa.reshape"(%88) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %90 = "tosa.transpose"(%89, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %91 = "tosa.reshape"(%90) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %92 = "tosa.reshape"(%17) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %93 = "tosa.matmul"(%91, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %94 = "tosa.reshape"(%93) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %95 = "tosa.reshape"(%21) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %96 = "tosa.add"(%95, %94) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %97 = "tosa.reshape"(%96) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %98 = "tosa.add"(%97, %32) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %99 = "tosa.reduce_sum"(%98) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %100 = "tosa.mul"(%99, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %101 = "tosa.sub"(%98, %100) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %102 = "tosa.mul"(%101, %101) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %103 = "tosa.reduce_sum"(%102) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %104 = "tosa.mul"(%103, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %105 = "tosa.add"(%104, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %106 = "tosa.rsqrt"(%105) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %107 = "tosa.mul"(%101, %106) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %108 = "tosa.mul"(%107, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %109 = "tosa.add"(%108, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %110 = "tosa.reshape"(%19) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %111 = "tosa.matmul"(%109, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %112 = "tosa.reshape"(%111) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %113 = "tosa.reshape"(%18) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %114 = "tosa.add"(%113, %112) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %115 = "tosa.reshape"(%114) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %116 = "tosa.mul"(%115, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %117 = "tosa.pow"(%115, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %118 = "tosa.mul"(%117, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %119 = "tosa.add"(%115, %118) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %120 = "tosa.mul"(%119, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %121 = "tosa.tanh"(%120) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %122 = "tosa.add"(%121, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %123 = "tosa.mul"(%116, %122) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %124 = "tosa.reshape"(%20) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %125 = "tosa.matmul"(%123, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %126 = "tosa.reshape"(%125) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %127 = "tosa.add"(%95, %126) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %128 = "tosa.reshape"(%127) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %129 = "tosa.add"(%98, %128) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %130 = "tosa.reduce_sum"(%129) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %131 = "tosa.mul"(%130, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %132 = "tosa.sub"(%129, %131) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %133 = "tosa.mul"(%132, %132) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %134 = "tosa.reduce_sum"(%133) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %135 = "tosa.mul"(%134, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %136 = "tosa.add"(%135, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %137 = "tosa.rsqrt"(%136) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %138 = "tosa.mul"(%132, %137) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %139 = "tosa.mul"(%138, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %140 = "tosa.add"(%139, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %141 = "tosa.matmul"(%140, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %142 = "tosa.reshape"(%141) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %143 = "tosa.add"(%50, %142) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %144 = "tosa.reshape"(%143) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %145 = "tosa.slice"(%144) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %146 = "tosa.slice"(%144) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %147 = "tosa.slice"(%144) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %148 = "tosa.reshape"(%145) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %149 = "tosa.transpose"(%148, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %150 = "tosa.reshape"(%146) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %151 = "tosa.transpose"(%150, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %152 = "tosa.reshape"(%147) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %153 = "tosa.transpose"(%152, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %154 = "tosa.transpose"(%151, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %155 = "tosa.reshape"(%149) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %156 = "tosa.reshape"(%154) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %157 = "tosa.matmul"(%155, %156) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %158 = "tosa.reshape"(%157) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %159 = "tosa.mul"(%158, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %160 = torch_c.from_builtin_tensor %159 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %161 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %162 = torch.aten.to.dtype %161, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %163 = torch.valsem.aten.copy %162, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %164 = torch.aten.where.self %163, %160, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %165 = torch_c.to_builtin_tensor %164 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %166 = "tosa.reduce_max"(%165) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %167 = "tosa.sub"(%165, %166) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %168 = "tosa.exp"(%167) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %169 = "tosa.reduce_sum"(%168) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %170 = "tosa.reciprocal"(%169) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %171 = "tosa.mul"(%168, %170) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %172 = "tosa.reshape"(%171) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %173 = "tosa.reshape"(%153) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %174 = "tosa.matmul"(%172, %173) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %175 = "tosa.reshape"(%174) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %176 = "tosa.transpose"(%175, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %177 = "tosa.reshape"(%176) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %178 = "tosa.matmul"(%177, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %179 = "tosa.reshape"(%178) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %180 = "tosa.add"(%95, %179) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %181 = "tosa.reshape"(%180) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %182 = "tosa.add"(%181, %129) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %183 = "tosa.reduce_sum"(%182) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %184 = "tosa.mul"(%183, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %185 = "tosa.sub"(%182, %184) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %186 = "tosa.mul"(%185, %185) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %187 = "tosa.reduce_sum"(%186) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %188 = "tosa.mul"(%187, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %189 = "tosa.add"(%188, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %190 = "tosa.rsqrt"(%189) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %191 = "tosa.mul"(%185, %190) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %192 = "tosa.mul"(%191, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %193 = "tosa.add"(%192, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %194 = "tosa.matmul"(%193, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %195 = "tosa.reshape"(%194) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %196 = "tosa.add"(%113, %195) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %197 = "tosa.reshape"(%196) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %198 = "tosa.mul"(%197, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %199 = "tosa.pow"(%197, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %200 = "tosa.mul"(%199, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %201 = "tosa.add"(%197, %200) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %202 = "tosa.mul"(%201, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %203 = "tosa.tanh"(%202) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %204 = "tosa.add"(%203, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %205 = "tosa.mul"(%198, %204) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %206 = "tosa.matmul"(%205, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %207 = "tosa.reshape"(%206) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %208 = "tosa.add"(%95, %207) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %209 = "tosa.reshape"(%208) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %210 = "tosa.add"(%182, %209) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %211 = "tosa.reduce_sum"(%210) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %212 = "tosa.mul"(%211, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %213 = "tosa.sub"(%210, %212) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %214 = "tosa.mul"(%213, %213) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %215 = "tosa.reduce_sum"(%214) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %216 = "tosa.mul"(%215, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %217 = "tosa.add"(%216, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %218 = "tosa.rsqrt"(%217) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %219 = "tosa.mul"(%213, %218) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %220 = "tosa.mul"(%219, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %221 = "tosa.add"(%220, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %222 = "tosa.matmul"(%221, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %223 = "tosa.reshape"(%222) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %224 = "tosa.add"(%50, %223) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %225 = "tosa.reshape"(%224) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %226 = "tosa.slice"(%225) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %227 = "tosa.slice"(%225) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %228 = "tosa.slice"(%225) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %229 = "tosa.reshape"(%226) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %230 = "tosa.transpose"(%229, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %231 = "tosa.reshape"(%227) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %232 = "tosa.transpose"(%231, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %233 = "tosa.reshape"(%228) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %234 = "tosa.transpose"(%233, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %235 = "tosa.transpose"(%232, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %236 = "tosa.reshape"(%230) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %237 = "tosa.reshape"(%235) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %238 = "tosa.matmul"(%236, %237) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %239 = "tosa.reshape"(%238) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %240 = "tosa.mul"(%239, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %241 = torch_c.from_builtin_tensor %240 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %242 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %243 = torch.aten.to.dtype %242, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %244 = torch.valsem.aten.copy %243, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %245 = torch.aten.where.self %244, %241, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %246 = torch_c.to_builtin_tensor %245 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %247 = "tosa.reduce_max"(%246) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %248 = "tosa.sub"(%246, %247) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %249 = "tosa.exp"(%248) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %250 = "tosa.reduce_sum"(%249) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %251 = "tosa.reciprocal"(%250) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %252 = "tosa.mul"(%249, %251) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %253 = "tosa.reshape"(%252) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %254 = "tosa.reshape"(%234) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %255 = "tosa.matmul"(%253, %254) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %256 = "tosa.reshape"(%255) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %257 = "tosa.transpose"(%256, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %258 = "tosa.reshape"(%257) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %259 = "tosa.matmul"(%258, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %260 = "tosa.reshape"(%259) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %261 = "tosa.add"(%95, %260) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %262 = "tosa.reshape"(%261) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %263 = "tosa.add"(%262, %210) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %264 = "tosa.reduce_sum"(%263) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %265 = "tosa.mul"(%264, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %266 = "tosa.sub"(%263, %265) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %267 = "tosa.mul"(%266, %266) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %268 = "tosa.reduce_sum"(%267) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %269 = "tosa.mul"(%268, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %270 = "tosa.add"(%269, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %271 = "tosa.rsqrt"(%270) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %272 = "tosa.mul"(%266, %271) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %273 = "tosa.mul"(%272, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %274 = "tosa.add"(%273, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %275 = "tosa.matmul"(%274, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %276 = "tosa.reshape"(%275) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %277 = "tosa.add"(%113, %276) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %278 = "tosa.reshape"(%277) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %279 = "tosa.mul"(%278, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %280 = "tosa.pow"(%278, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %281 = "tosa.mul"(%280, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %282 = "tosa.add"(%278, %281) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %283 = "tosa.mul"(%282, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %284 = "tosa.tanh"(%283) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %285 = "tosa.add"(%284, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %286 = "tosa.mul"(%279, %285) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %287 = "tosa.matmul"(%286, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %288 = "tosa.reshape"(%287) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %289 = "tosa.add"(%95, %288) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %290 = "tosa.reshape"(%289) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %291 = "tosa.add"(%263, %290) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %292 = "tosa.reduce_sum"(%291) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %293 = "tosa.mul"(%292, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %294 = "tosa.sub"(%291, %293) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %295 = "tosa.mul"(%294, %294) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %296 = "tosa.reduce_sum"(%295) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %297 = "tosa.mul"(%296, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %298 = "tosa.add"(%297, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %299 = "tosa.rsqrt"(%298) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %300 = "tosa.mul"(%294, %299) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %301 = "tosa.mul"(%300, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %302 = "tosa.add"(%301, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %303 = "tosa.matmul"(%302, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %304 = "tosa.reshape"(%303) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %305 = "tosa.add"(%50, %304) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %306 = "tosa.reshape"(%305) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %307 = "tosa.slice"(%306) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %308 = "tosa.slice"(%306) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %309 = "tosa.slice"(%306) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %310 = "tosa.reshape"(%307) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %311 = "tosa.transpose"(%310, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %312 = "tosa.reshape"(%308) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %313 = "tosa.transpose"(%312, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %314 = "tosa.reshape"(%309) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %315 = "tosa.transpose"(%314, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %316 = "tosa.transpose"(%313, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %317 = "tosa.reshape"(%311) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %318 = "tosa.reshape"(%316) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %319 = "tosa.matmul"(%317, %318) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %320 = "tosa.reshape"(%319) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %321 = "tosa.mul"(%320, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %322 = torch_c.from_builtin_tensor %321 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %323 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %324 = torch.aten.to.dtype %323, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %325 = torch.valsem.aten.copy %324, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %326 = torch.aten.where.self %325, %322, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %327 = torch_c.to_builtin_tensor %326 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %328 = "tosa.reduce_max"(%327) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %329 = "tosa.sub"(%327, %328) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %330 = "tosa.exp"(%329) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %331 = "tosa.reduce_sum"(%330) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %332 = "tosa.reciprocal"(%331) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %333 = "tosa.mul"(%330, %332) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %334 = "tosa.reshape"(%333) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %335 = "tosa.reshape"(%315) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %336 = "tosa.matmul"(%334, %335) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %337 = "tosa.reshape"(%336) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %338 = "tosa.transpose"(%337, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %339 = "tosa.reshape"(%338) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %340 = "tosa.matmul"(%339, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %341 = "tosa.reshape"(%340) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %342 = "tosa.add"(%95, %341) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %343 = "tosa.reshape"(%342) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %344 = "tosa.add"(%343, %291) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %345 = "tosa.reduce_sum"(%344) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %346 = "tosa.mul"(%345, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %347 = "tosa.sub"(%344, %346) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %348 = "tosa.mul"(%347, %347) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %349 = "tosa.reduce_sum"(%348) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %350 = "tosa.mul"(%349, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %351 = "tosa.add"(%350, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %352 = "tosa.rsqrt"(%351) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %353 = "tosa.mul"(%347, %352) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %354 = "tosa.mul"(%353, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %355 = "tosa.add"(%354, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %356 = "tosa.matmul"(%355, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %357 = "tosa.reshape"(%356) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %358 = "tosa.add"(%113, %357) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %359 = "tosa.reshape"(%358) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %360 = "tosa.mul"(%359, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %361 = "tosa.pow"(%359, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %362 = "tosa.mul"(%361, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %363 = "tosa.add"(%359, %362) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %364 = "tosa.mul"(%363, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %365 = "tosa.tanh"(%364) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %366 = "tosa.add"(%365, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %367 = "tosa.mul"(%360, %366) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %368 = "tosa.matmul"(%367, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %369 = "tosa.reshape"(%368) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %370 = "tosa.add"(%95, %369) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %371 = "tosa.reshape"(%370) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %372 = "tosa.add"(%344, %371) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %373 = "tosa.reduce_sum"(%372) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %374 = "tosa.mul"(%373, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %375 = "tosa.sub"(%372, %374) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %376 = "tosa.mul"(%375, %375) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %377 = "tosa.reduce_sum"(%376) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %378 = "tosa.mul"(%377, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %379 = "tosa.add"(%378, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %380 = "tosa.rsqrt"(%379) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %381 = "tosa.mul"(%375, %380) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %382 = "tosa.mul"(%381, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %383 = "tosa.add"(%382, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %384 = "tosa.matmul"(%383, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %385 = "tosa.reshape"(%384) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %386 = "tosa.add"(%50, %385) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %387 = "tosa.reshape"(%386) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %388 = "tosa.slice"(%387) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %389 = "tosa.slice"(%387) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %390 = "tosa.slice"(%387) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %391 = "tosa.reshape"(%388) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %392 = "tosa.transpose"(%391, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %393 = "tosa.reshape"(%389) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %394 = "tosa.transpose"(%393, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %395 = "tosa.reshape"(%390) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %396 = "tosa.transpose"(%395, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %397 = "tosa.transpose"(%394, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %398 = "tosa.reshape"(%392) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %399 = "tosa.reshape"(%397) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %400 = "tosa.matmul"(%398, %399) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %401 = "tosa.reshape"(%400) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %402 = "tosa.mul"(%401, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %403 = torch_c.from_builtin_tensor %402 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %404 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %405 = torch.aten.to.dtype %404, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %406 = torch.valsem.aten.copy %405, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %407 = torch.aten.where.self %406, %403, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %408 = torch_c.to_builtin_tensor %407 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %409 = "tosa.reduce_max"(%408) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %410 = "tosa.sub"(%408, %409) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %411 = "tosa.exp"(%410) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %412 = "tosa.reduce_sum"(%411) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %413 = "tosa.reciprocal"(%412) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %414 = "tosa.mul"(%411, %413) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %415 = "tosa.reshape"(%414) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %416 = "tosa.reshape"(%396) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %417 = "tosa.matmul"(%415, %416) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %418 = "tosa.reshape"(%417) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %419 = "tosa.transpose"(%418, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %420 = "tosa.reshape"(%419) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %421 = "tosa.matmul"(%420, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %422 = "tosa.reshape"(%421) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %423 = "tosa.add"(%95, %422) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %424 = "tosa.reshape"(%423) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %425 = "tosa.add"(%424, %372) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %426 = "tosa.reduce_sum"(%425) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %427 = "tosa.mul"(%426, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %428 = "tosa.sub"(%425, %427) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %429 = "tosa.mul"(%428, %428) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %430 = "tosa.reduce_sum"(%429) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %431 = "tosa.mul"(%430, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %432 = "tosa.add"(%431, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %433 = "tosa.rsqrt"(%432) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %434 = "tosa.mul"(%428, %433) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %435 = "tosa.mul"(%434, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %436 = "tosa.add"(%435, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %437 = "tosa.matmul"(%436, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %438 = "tosa.reshape"(%437) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %439 = "tosa.add"(%113, %438) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %440 = "tosa.reshape"(%439) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %441 = "tosa.mul"(%440, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %442 = "tosa.pow"(%440, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %443 = "tosa.mul"(%442, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %444 = "tosa.add"(%440, %443) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %445 = "tosa.mul"(%444, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %446 = "tosa.tanh"(%445) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %447 = "tosa.add"(%446, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %448 = "tosa.mul"(%441, %447) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %449 = "tosa.matmul"(%448, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %450 = "tosa.reshape"(%449) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %451 = "tosa.add"(%95, %450) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %452 = "tosa.reshape"(%451) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %453 = "tosa.add"(%425, %452) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %454 = "tosa.reduce_sum"(%453) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %455 = "tosa.mul"(%454, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %456 = "tosa.sub"(%453, %455) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %457 = "tosa.mul"(%456, %456) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %458 = "tosa.reduce_sum"(%457) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %459 = "tosa.mul"(%458, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %460 = "tosa.add"(%459, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %461 = "tosa.rsqrt"(%460) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %462 = "tosa.mul"(%456, %461) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %463 = "tosa.mul"(%462, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %464 = "tosa.add"(%463, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %465 = "tosa.matmul"(%464, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %466 = "tosa.reshape"(%465) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %467 = "tosa.add"(%50, %466) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %468 = "tosa.reshape"(%467) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %469 = "tosa.slice"(%468) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %470 = "tosa.slice"(%468) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %471 = "tosa.slice"(%468) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %472 = "tosa.reshape"(%469) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %473 = "tosa.transpose"(%472, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %474 = "tosa.reshape"(%470) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %475 = "tosa.transpose"(%474, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %476 = "tosa.reshape"(%471) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %477 = "tosa.transpose"(%476, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %478 = "tosa.transpose"(%475, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %479 = "tosa.reshape"(%473) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %480 = "tosa.reshape"(%478) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %481 = "tosa.matmul"(%479, %480) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %482 = "tosa.reshape"(%481) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %483 = "tosa.mul"(%482, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %484 = torch_c.from_builtin_tensor %483 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %485 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %486 = torch.aten.to.dtype %485, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %487 = torch.valsem.aten.copy %486, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %488 = torch.aten.where.self %487, %484, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %489 = torch_c.to_builtin_tensor %488 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %490 = "tosa.reduce_max"(%489) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %491 = "tosa.sub"(%489, %490) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %492 = "tosa.exp"(%491) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %493 = "tosa.reduce_sum"(%492) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %494 = "tosa.reciprocal"(%493) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %495 = "tosa.mul"(%492, %494) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %496 = "tosa.reshape"(%495) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %497 = "tosa.reshape"(%477) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %498 = "tosa.matmul"(%496, %497) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %499 = "tosa.reshape"(%498) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %500 = "tosa.transpose"(%499, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %501 = "tosa.reshape"(%500) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %502 = "tosa.matmul"(%501, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %503 = "tosa.reshape"(%502) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %504 = "tosa.add"(%95, %503) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %505 = "tosa.reshape"(%504) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %506 = "tosa.add"(%505, %453) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %507 = "tosa.reduce_sum"(%506) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %508 = "tosa.mul"(%507, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %509 = "tosa.sub"(%506, %508) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %510 = "tosa.mul"(%509, %509) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %511 = "tosa.reduce_sum"(%510) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %512 = "tosa.mul"(%511, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %513 = "tosa.add"(%512, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %514 = "tosa.rsqrt"(%513) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %515 = "tosa.mul"(%509, %514) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %516 = "tosa.mul"(%515, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %517 = "tosa.add"(%516, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %518 = "tosa.matmul"(%517, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %519 = "tosa.reshape"(%518) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %520 = "tosa.add"(%113, %519) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %521 = "tosa.reshape"(%520) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %522 = "tosa.mul"(%521, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %523 = "tosa.pow"(%521, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %524 = "tosa.mul"(%523, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %525 = "tosa.add"(%521, %524) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %526 = "tosa.mul"(%525, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %527 = "tosa.tanh"(%526) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %528 = "tosa.add"(%527, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %529 = "tosa.mul"(%522, %528) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %530 = "tosa.matmul"(%529, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %531 = "tosa.reshape"(%530) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %532 = "tosa.add"(%95, %531) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %533 = "tosa.reshape"(%532) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %534 = "tosa.add"(%506, %533) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %535 = "tosa.reduce_sum"(%534) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %536 = "tosa.mul"(%535, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %537 = "tosa.sub"(%534, %536) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %538 = "tosa.mul"(%537, %537) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %539 = "tosa.reduce_sum"(%538) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %540 = "tosa.mul"(%539, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %541 = "tosa.add"(%540, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %542 = "tosa.rsqrt"(%541) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %543 = "tosa.mul"(%537, %542) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %544 = "tosa.mul"(%543, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.add"(%544, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %546 = "tosa.matmul"(%545, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %547 = "tosa.reshape"(%546) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %548 = "tosa.add"(%50, %547) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %549 = "tosa.reshape"(%548) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %550 = "tosa.slice"(%549) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %551 = "tosa.slice"(%549) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %552 = "tosa.slice"(%549) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %553 = "tosa.reshape"(%550) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %554 = "tosa.transpose"(%553, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %555 = "tosa.reshape"(%551) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %556 = "tosa.transpose"(%555, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %557 = "tosa.reshape"(%552) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %558 = "tosa.transpose"(%557, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %559 = "tosa.transpose"(%556, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %560 = "tosa.reshape"(%554) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %561 = "tosa.reshape"(%559) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %562 = "tosa.matmul"(%560, %561) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %563 = "tosa.reshape"(%562) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %564 = "tosa.mul"(%563, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %565 = torch_c.from_builtin_tensor %564 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %566 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %567 = torch.aten.to.dtype %566, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %568 = torch.valsem.aten.copy %567, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %569 = torch.aten.where.self %568, %565, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %570 = torch_c.to_builtin_tensor %569 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %571 = "tosa.reduce_max"(%570) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %572 = "tosa.sub"(%570, %571) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %573 = "tosa.exp"(%572) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %574 = "tosa.reduce_sum"(%573) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %575 = "tosa.reciprocal"(%574) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %576 = "tosa.mul"(%573, %575) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %577 = "tosa.reshape"(%576) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %578 = "tosa.reshape"(%558) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %579 = "tosa.matmul"(%577, %578) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %580 = "tosa.reshape"(%579) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %581 = "tosa.transpose"(%580, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %582 = "tosa.reshape"(%581) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %583 = "tosa.matmul"(%582, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %584 = "tosa.reshape"(%583) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %585 = "tosa.add"(%95, %584) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %586 = "tosa.reshape"(%585) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %587 = "tosa.add"(%586, %534) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %588 = "tosa.reduce_sum"(%587) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %589 = "tosa.mul"(%588, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %590 = "tosa.sub"(%587, %589) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %591 = "tosa.mul"(%590, %590) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %592 = "tosa.reduce_sum"(%591) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %593 = "tosa.mul"(%592, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %594 = "tosa.add"(%593, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %595 = "tosa.rsqrt"(%594) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %596 = "tosa.mul"(%590, %595) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %597 = "tosa.mul"(%596, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %598 = "tosa.add"(%597, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %599 = "tosa.matmul"(%598, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %600 = "tosa.reshape"(%599) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %601 = "tosa.add"(%113, %600) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %602 = "tosa.reshape"(%601) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %603 = "tosa.mul"(%602, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %604 = "tosa.pow"(%602, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %605 = "tosa.mul"(%604, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %606 = "tosa.add"(%602, %605) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %607 = "tosa.mul"(%606, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %608 = "tosa.tanh"(%607) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %609 = "tosa.add"(%608, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %610 = "tosa.mul"(%603, %609) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %611 = "tosa.matmul"(%610, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %612 = "tosa.reshape"(%611) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %613 = "tosa.add"(%95, %612) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %614 = "tosa.reshape"(%613) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %615 = "tosa.add"(%587, %614) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %616 = "tosa.reduce_sum"(%615) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %617 = "tosa.mul"(%616, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %618 = "tosa.sub"(%615, %617) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %619 = "tosa.mul"(%618, %618) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %620 = "tosa.reduce_sum"(%619) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %621 = "tosa.mul"(%620, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %622 = "tosa.add"(%621, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %623 = "tosa.rsqrt"(%622) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %624 = "tosa.mul"(%618, %623) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %625 = "tosa.mul"(%624, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %626 = "tosa.add"(%625, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %627 = "tosa.matmul"(%626, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %628 = "tosa.reshape"(%627) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %629 = "tosa.add"(%50, %628) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %630 = "tosa.reshape"(%629) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %631 = "tosa.slice"(%630) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %632 = "tosa.slice"(%630) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %633 = "tosa.slice"(%630) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %634 = "tosa.reshape"(%631) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %635 = "tosa.transpose"(%634, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %636 = "tosa.reshape"(%632) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %637 = "tosa.transpose"(%636, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %638 = "tosa.reshape"(%633) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %639 = "tosa.transpose"(%638, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %640 = "tosa.transpose"(%637, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %641 = "tosa.reshape"(%635) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %642 = "tosa.reshape"(%640) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %643 = "tosa.matmul"(%641, %642) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %644 = "tosa.reshape"(%643) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %645 = "tosa.mul"(%644, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %646 = torch_c.from_builtin_tensor %645 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %647 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %648 = torch.aten.to.dtype %647, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %649 = torch.valsem.aten.copy %648, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %650 = torch.aten.where.self %649, %646, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %651 = torch_c.to_builtin_tensor %650 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %652 = "tosa.reduce_max"(%651) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %653 = "tosa.sub"(%651, %652) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %654 = "tosa.exp"(%653) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %655 = "tosa.reduce_sum"(%654) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %656 = "tosa.reciprocal"(%655) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %657 = "tosa.mul"(%654, %656) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %658 = "tosa.reshape"(%657) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %659 = "tosa.reshape"(%639) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %660 = "tosa.matmul"(%658, %659) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %661 = "tosa.reshape"(%660) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %662 = "tosa.transpose"(%661, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %663 = "tosa.reshape"(%662) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %664 = "tosa.matmul"(%663, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %665 = "tosa.reshape"(%664) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %666 = "tosa.add"(%95, %665) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %667 = "tosa.reshape"(%666) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %668 = "tosa.add"(%667, %615) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %669 = "tosa.reduce_sum"(%668) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %670 = "tosa.mul"(%669, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %671 = "tosa.sub"(%668, %670) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %672 = "tosa.mul"(%671, %671) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %673 = "tosa.reduce_sum"(%672) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %674 = "tosa.mul"(%673, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %675 = "tosa.add"(%674, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %676 = "tosa.rsqrt"(%675) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %677 = "tosa.mul"(%671, %676) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %678 = "tosa.mul"(%677, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %679 = "tosa.add"(%678, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %680 = "tosa.matmul"(%679, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %681 = "tosa.reshape"(%680) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %682 = "tosa.add"(%113, %681) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %683 = "tosa.reshape"(%682) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %684 = "tosa.mul"(%683, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %685 = "tosa.pow"(%683, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %686 = "tosa.mul"(%685, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %687 = "tosa.add"(%683, %686) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %688 = "tosa.mul"(%687, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %689 = "tosa.tanh"(%688) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %690 = "tosa.add"(%689, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %691 = "tosa.mul"(%684, %690) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %692 = "tosa.matmul"(%691, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %693 = "tosa.reshape"(%692) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %694 = "tosa.add"(%95, %693) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %695 = "tosa.reshape"(%694) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %696 = "tosa.add"(%668, %695) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %697 = "tosa.reduce_sum"(%696) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %698 = "tosa.mul"(%697, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %699 = "tosa.sub"(%696, %698) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %700 = "tosa.mul"(%699, %699) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %701 = "tosa.reduce_sum"(%700) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %702 = "tosa.mul"(%701, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %703 = "tosa.add"(%702, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %704 = "tosa.rsqrt"(%703) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %705 = "tosa.mul"(%699, %704) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %706 = "tosa.mul"(%705, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %707 = "tosa.add"(%706, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %708 = "tosa.matmul"(%707, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %709 = "tosa.reshape"(%708) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %710 = "tosa.add"(%50, %709) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %711 = "tosa.reshape"(%710) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %712 = "tosa.slice"(%711) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %713 = "tosa.slice"(%711) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %714 = "tosa.slice"(%711) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %715 = "tosa.reshape"(%712) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %716 = "tosa.transpose"(%715, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %717 = "tosa.reshape"(%713) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %718 = "tosa.transpose"(%717, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %719 = "tosa.reshape"(%714) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %720 = "tosa.transpose"(%719, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %721 = "tosa.transpose"(%718, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %722 = "tosa.reshape"(%716) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %723 = "tosa.reshape"(%721) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %724 = "tosa.matmul"(%722, %723) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %725 = "tosa.reshape"(%724) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %726 = "tosa.mul"(%725, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %727 = torch_c.from_builtin_tensor %726 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %728 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %729 = torch.aten.to.dtype %728, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %730 = torch.valsem.aten.copy %729, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %731 = torch.aten.where.self %730, %727, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %732 = torch_c.to_builtin_tensor %731 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %733 = "tosa.reduce_max"(%732) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %734 = "tosa.sub"(%732, %733) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %735 = "tosa.exp"(%734) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %736 = "tosa.reduce_sum"(%735) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %737 = "tosa.reciprocal"(%736) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %738 = "tosa.mul"(%735, %737) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %739 = "tosa.reshape"(%738) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %740 = "tosa.reshape"(%720) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %741 = "tosa.matmul"(%739, %740) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %742 = "tosa.reshape"(%741) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %743 = "tosa.transpose"(%742, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %744 = "tosa.reshape"(%743) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %745 = "tosa.matmul"(%744, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %746 = "tosa.reshape"(%745) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %747 = "tosa.add"(%95, %746) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %748 = "tosa.reshape"(%747) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %749 = "tosa.add"(%748, %696) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %750 = "tosa.reduce_sum"(%749) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %751 = "tosa.mul"(%750, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %752 = "tosa.sub"(%749, %751) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %753 = "tosa.mul"(%752, %752) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %754 = "tosa.reduce_sum"(%753) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %755 = "tosa.mul"(%754, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %756 = "tosa.add"(%755, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %757 = "tosa.rsqrt"(%756) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %758 = "tosa.mul"(%752, %757) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %759 = "tosa.mul"(%758, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %760 = "tosa.add"(%759, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %761 = "tosa.matmul"(%760, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %762 = "tosa.reshape"(%761) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %763 = "tosa.add"(%113, %762) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %764 = "tosa.reshape"(%763) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %765 = "tosa.mul"(%764, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %766 = "tosa.pow"(%764, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %767 = "tosa.mul"(%766, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %768 = "tosa.add"(%764, %767) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %769 = "tosa.mul"(%768, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %770 = "tosa.tanh"(%769) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %771 = "tosa.add"(%770, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %772 = "tosa.mul"(%765, %771) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %773 = "tosa.matmul"(%772, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %774 = "tosa.reshape"(%773) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %775 = "tosa.add"(%95, %774) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %776 = "tosa.reshape"(%775) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %777 = "tosa.add"(%749, %776) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %778 = "tosa.reduce_sum"(%777) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %779 = "tosa.mul"(%778, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %780 = "tosa.sub"(%777, %779) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %781 = "tosa.mul"(%780, %780) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %782 = "tosa.reduce_sum"(%781) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %783 = "tosa.mul"(%782, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %784 = "tosa.add"(%783, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %785 = "tosa.rsqrt"(%784) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %786 = "tosa.mul"(%780, %785) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %787 = "tosa.mul"(%786, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %788 = "tosa.add"(%787, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %789 = "tosa.matmul"(%788, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %790 = "tosa.reshape"(%789) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %791 = "tosa.add"(%50, %790) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %792 = "tosa.reshape"(%791) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %793 = "tosa.slice"(%792) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %794 = "tosa.slice"(%792) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %795 = "tosa.slice"(%792) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %796 = "tosa.reshape"(%793) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %797 = "tosa.transpose"(%796, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %798 = "tosa.reshape"(%794) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %799 = "tosa.transpose"(%798, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %800 = "tosa.reshape"(%795) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %801 = "tosa.transpose"(%800, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %802 = "tosa.transpose"(%799, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %803 = "tosa.reshape"(%797) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %804 = "tosa.reshape"(%802) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %805 = "tosa.matmul"(%803, %804) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %806 = "tosa.reshape"(%805) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %807 = "tosa.mul"(%806, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %808 = torch_c.from_builtin_tensor %807 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %809 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %810 = torch.aten.to.dtype %809, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %811 = torch.valsem.aten.copy %810, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %812 = torch.aten.where.self %811, %808, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %813 = torch_c.to_builtin_tensor %812 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %814 = "tosa.reduce_max"(%813) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %815 = "tosa.sub"(%813, %814) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %816 = "tosa.exp"(%815) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %817 = "tosa.reduce_sum"(%816) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %818 = "tosa.reciprocal"(%817) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %819 = "tosa.mul"(%816, %818) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %820 = "tosa.reshape"(%819) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %821 = "tosa.reshape"(%801) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %822 = "tosa.matmul"(%820, %821) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %823 = "tosa.reshape"(%822) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %824 = "tosa.transpose"(%823, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %825 = "tosa.reshape"(%824) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %826 = "tosa.matmul"(%825, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %827 = "tosa.reshape"(%826) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %828 = "tosa.add"(%95, %827) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %829 = "tosa.reshape"(%828) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %830 = "tosa.add"(%829, %777) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %831 = "tosa.reduce_sum"(%830) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %832 = "tosa.mul"(%831, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %833 = "tosa.sub"(%830, %832) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %834 = "tosa.mul"(%833, %833) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %835 = "tosa.reduce_sum"(%834) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %836 = "tosa.mul"(%835, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %837 = "tosa.add"(%836, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %838 = "tosa.rsqrt"(%837) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %839 = "tosa.mul"(%833, %838) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %840 = "tosa.mul"(%839, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %841 = "tosa.add"(%840, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %842 = "tosa.matmul"(%841, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %843 = "tosa.reshape"(%842) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %844 = "tosa.add"(%113, %843) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %845 = "tosa.reshape"(%844) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %846 = "tosa.mul"(%845, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %847 = "tosa.pow"(%845, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %848 = "tosa.mul"(%847, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %849 = "tosa.add"(%845, %848) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %850 = "tosa.mul"(%849, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %851 = "tosa.tanh"(%850) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %852 = "tosa.add"(%851, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %853 = "tosa.mul"(%846, %852) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %854 = "tosa.matmul"(%853, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %855 = "tosa.reshape"(%854) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %856 = "tosa.add"(%95, %855) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %857 = "tosa.reshape"(%856) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %858 = "tosa.add"(%830, %857) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %859 = "tosa.reduce_sum"(%858) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %860 = "tosa.mul"(%859, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %861 = "tosa.sub"(%858, %860) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %862 = "tosa.mul"(%861, %861) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %863 = "tosa.reduce_sum"(%862) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %864 = "tosa.mul"(%863, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %865 = "tosa.add"(%864, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %866 = "tosa.rsqrt"(%865) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %867 = "tosa.mul"(%861, %866) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %868 = "tosa.mul"(%867, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %869 = "tosa.add"(%868, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %870 = "tosa.matmul"(%869, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %871 = "tosa.reshape"(%870) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %872 = "tosa.add"(%50, %871) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %873 = "tosa.reshape"(%872) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %874 = "tosa.slice"(%873) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %875 = "tosa.slice"(%873) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %876 = "tosa.slice"(%873) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %877 = "tosa.reshape"(%874) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %878 = "tosa.transpose"(%877, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %879 = "tosa.reshape"(%875) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %880 = "tosa.transpose"(%879, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %881 = "tosa.reshape"(%876) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %882 = "tosa.transpose"(%881, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %883 = "tosa.transpose"(%880, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %884 = "tosa.reshape"(%878) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %885 = "tosa.reshape"(%883) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %886 = "tosa.matmul"(%884, %885) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %887 = "tosa.reshape"(%886) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %888 = "tosa.mul"(%887, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %889 = torch_c.from_builtin_tensor %888 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %890 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %891 = torch.aten.to.dtype %890, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %892 = torch.valsem.aten.copy %891, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %893 = torch.aten.where.self %892, %889, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %894 = torch_c.to_builtin_tensor %893 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %895 = "tosa.reduce_max"(%894) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %896 = "tosa.sub"(%894, %895) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %897 = "tosa.exp"(%896) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %898 = "tosa.reduce_sum"(%897) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %899 = "tosa.reciprocal"(%898) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %900 = "tosa.mul"(%897, %899) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %901 = "tosa.reshape"(%900) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %902 = "tosa.reshape"(%882) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %903 = "tosa.matmul"(%901, %902) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %904 = "tosa.reshape"(%903) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %905 = "tosa.transpose"(%904, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %906 = "tosa.reshape"(%905) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %907 = "tosa.matmul"(%906, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %908 = "tosa.reshape"(%907) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %909 = "tosa.add"(%95, %908) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %910 = "tosa.reshape"(%909) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %911 = "tosa.add"(%910, %858) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %912 = "tosa.reduce_sum"(%911) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %913 = "tosa.mul"(%912, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %914 = "tosa.sub"(%911, %913) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %915 = "tosa.mul"(%914, %914) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %916 = "tosa.reduce_sum"(%915) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %917 = "tosa.mul"(%916, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %918 = "tosa.add"(%917, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %919 = "tosa.rsqrt"(%918) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %920 = "tosa.mul"(%914, %919) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %921 = "tosa.mul"(%920, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %922 = "tosa.add"(%921, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %923 = "tosa.matmul"(%922, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %924 = "tosa.reshape"(%923) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %925 = "tosa.add"(%113, %924) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %926 = "tosa.reshape"(%925) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %927 = "tosa.mul"(%926, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %928 = "tosa.pow"(%926, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %929 = "tosa.mul"(%928, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %930 = "tosa.add"(%926, %929) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %931 = "tosa.mul"(%930, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %932 = "tosa.tanh"(%931) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %933 = "tosa.add"(%932, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %934 = "tosa.mul"(%927, %933) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %935 = "tosa.matmul"(%934, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %936 = "tosa.reshape"(%935) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %937 = "tosa.add"(%95, %936) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %938 = "tosa.reshape"(%937) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %939 = "tosa.add"(%911, %938) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %940 = "tosa.reduce_sum"(%939) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %941 = "tosa.mul"(%940, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %942 = "tosa.sub"(%939, %941) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %943 = "tosa.mul"(%942, %942) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %944 = "tosa.reduce_sum"(%943) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %945 = "tosa.mul"(%944, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %946 = "tosa.add"(%945, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %947 = "tosa.rsqrt"(%946) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %948 = "tosa.mul"(%942, %947) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %949 = "tosa.mul"(%948, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %950 = "tosa.add"(%949, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %951 = "tosa.matmul"(%950, %47) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %952 = "tosa.reshape"(%951) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %953 = "tosa.add"(%50, %952) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %954 = "tosa.reshape"(%953) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %955 = "tosa.slice"(%954) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %956 = "tosa.slice"(%954) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %957 = "tosa.slice"(%954) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %958 = "tosa.reshape"(%955) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %959 = "tosa.transpose"(%958, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %960 = "tosa.reshape"(%956) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %961 = "tosa.transpose"(%960, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %962 = "tosa.reshape"(%957) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %963 = "tosa.transpose"(%962, %8) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %964 = "tosa.transpose"(%961, %7) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %965 = "tosa.reshape"(%959) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %966 = "tosa.reshape"(%964) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %967 = "tosa.matmul"(%965, %966) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %968 = "tosa.reshape"(%967) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %969 = "tosa.mul"(%968, %68) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %970 = torch_c.from_builtin_tensor %969 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %971 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %972 = torch.aten.to.dtype %971, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %973 = torch.valsem.aten.copy %972, %73, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %974 = torch.aten.where.self %973, %970, %77 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %975 = torch_c.to_builtin_tensor %974 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %976 = "tosa.reduce_max"(%975) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %977 = "tosa.sub"(%975, %976) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %978 = "tosa.exp"(%977) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %979 = "tosa.reduce_sum"(%978) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %980 = "tosa.reciprocal"(%979) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %981 = "tosa.mul"(%978, %980) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %982 = "tosa.reshape"(%981) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %983 = "tosa.reshape"(%963) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %984 = "tosa.matmul"(%982, %983) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %985 = "tosa.reshape"(%984) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %986 = "tosa.transpose"(%985, %8) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %987 = "tosa.reshape"(%986) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %988 = "tosa.matmul"(%987, %92) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %989 = "tosa.reshape"(%988) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %990 = "tosa.add"(%95, %989) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %991 = "tosa.reshape"(%990) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %992 = "tosa.add"(%991, %939) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %993 = "tosa.reduce_sum"(%992) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %994 = "tosa.mul"(%993, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %995 = "tosa.sub"(%992, %994) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %996 = "tosa.mul"(%995, %995) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %997 = "tosa.reduce_sum"(%996) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %998 = "tosa.mul"(%997, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %999 = "tosa.add"(%998, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1000 = "tosa.rsqrt"(%999) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1001 = "tosa.mul"(%995, %1000) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1002 = "tosa.mul"(%1001, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1003 = "tosa.add"(%1002, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1004 = "tosa.matmul"(%1003, %110) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1005 = "tosa.reshape"(%1004) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1006 = "tosa.add"(%113, %1005) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1007 = "tosa.reshape"(%1006) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1008 = "tosa.mul"(%1007, %2) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1009 = "tosa.pow"(%1007, %3) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1010 = "tosa.mul"(%1009, %4) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1011 = "tosa.add"(%1007, %1010) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1012 = "tosa.mul"(%1011, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1013 = "tosa.tanh"(%1012) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1014 = "tosa.add"(%1013, %0) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1015 = "tosa.mul"(%1008, %1014) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1016 = "tosa.matmul"(%1015, %124) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1017 = "tosa.reshape"(%1016) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1018 = "tosa.add"(%95, %1017) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1019 = "tosa.reshape"(%1018) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1020 = "tosa.add"(%992, %1019) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1021 = "tosa.reduce_sum"(%1020) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1022 = "tosa.mul"(%1021, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1023 = "tosa.sub"(%1020, %1022) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1024 = "tosa.mul"(%1023, %1023) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1025 = "tosa.reduce_sum"(%1024) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1026 = "tosa.mul"(%1025, %35) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1027 = "tosa.add"(%1026, %1) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1028 = "tosa.rsqrt"(%1027) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1029 = "tosa.mul"(%1023, %1028) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1030 = "tosa.mul"(%1029, %41) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1031 = "tosa.add"(%1030, %41) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1032 = "tosa.transpose"(%10, %6) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
  %1033 = "tosa.reshape"(%1032) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
  %1034 = "tosa.matmul"(%1031, %1033) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
  %1035 = torch_c.from_builtin_tensor %1034 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
  return %1035 : !torch.vtensor<[1,5,50257],f32>
 }

 // -----// IR Dump After FuncBackendTypeConversion (torch-func-backend-type-conversion) //----- //
 module attributes {torch.debug_module_name = "_lambda"} {
  func.func @forward(%arg0: tensor<1x5xi64>) -> tensor<1x5x50257xf32> {
    %0 = torch_c.from_builtin_tensor %arg0 : tensor<1x5xi64> -> !torch.vtensor<[1,5],si64>
    %1 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %2 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %3 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %4 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %5 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %6 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
    %7 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
    %8 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
    %9 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
    %10 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
    %int0 = torch.constant.int 0
    %int4 = torch.constant.int 4
    %int11 = torch.constant.int 11
    %none = torch.constant.none
    %false = torch.constant.bool false
    %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
    %12 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
    %13 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
    %14 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
    %15 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
    %16 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
    %17 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
    %18 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
    %19 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
    %20 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
    %21 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
    %22 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
    %float0.000000e00 = torch.constant.float 0.000000e+00
    %int1 = torch.constant.int 1
    %int5 = torch.constant.int 5
    %23 = torch_c.to_builtin_tensor %0 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
    %cpu = torch.constant.device "cpu"
    %24 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
    %25 = torch_c.to_builtin_tensor %24 : !torch.vtensor<[5],si64> -> tensor<5xi64>
    %26 = "tosa.reshape"(%25) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
    %27 = "tosa.reshape"(%11) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
    %28 = "tosa.cast"(%23) : (tensor<1x5xi64>) -> tensor<1x5xi32>
    %29 = "tosa.gather"(%27, %28) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
    %30 = "tosa.reshape"(%12) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
    %31 = "tosa.cast"(%26) : (tensor<1x5xi64>) -> tensor<1x5xi32>
    %32 = "tosa.gather"(%30, %31) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
    %33 = "tosa.add"(%29, %32) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %34 = "tosa.reciprocal"(%10) : (tensor<1xf32>) -> tensor<1xf32>
    %35 = "tosa.reduce_sum"(%33) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %36 = "tosa.reshape"(%34) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
    %37 = "tosa.mul"(%35, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %38 = "tosa.sub"(%33, %37) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %39 = "tosa.mul"(%38, %38) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %40 = "tosa.reduce_sum"(%39) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %41 = "tosa.mul"(%40, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %42 = "tosa.reshape"(%22) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
    %43 = "tosa.add"(%41, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %44 = "tosa.rsqrt"(%43) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %45 = "tosa.mul"(%38, %44) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %46 = "tosa.mul"(%45, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %47 = "tosa.add"(%46, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %48 = "tosa.reshape"(%17) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
    %49 = "tosa.matmul"(%47, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %50 = "tosa.reshape"(%49) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %51 = "tosa.reshape"(%16) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
    %52 = "tosa.add"(%51, %50) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %53 = "tosa.reshape"(%52) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %54 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %55 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %56 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %57 = "tosa.reshape"(%54) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %58 = "tosa.transpose"(%57, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %59 = "tosa.reshape"(%55) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %60 = "tosa.transpose"(%59, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %61 = "tosa.reshape"(%56) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %62 = "tosa.transpose"(%61, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %63 = "tosa.transpose"(%60, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %64 = "tosa.reshape"(%58) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %65 = "tosa.reshape"(%63) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %66 = "tosa.matmul"(%64, %65) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %67 = "tosa.reshape"(%66) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %68 = "tosa.reciprocal"(%13) : (tensor<f32>) -> tensor<f32>
    %69 = "tosa.reshape"(%68) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
    %70 = "tosa.mul"(%67, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %71 = torch_c.from_builtin_tensor %70 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %72 = "tosa.slice"(%14) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
    %73 = "tosa.slice"(%72) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
    %74 = torch_c.from_builtin_tensor %73 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
    %75 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %76 = torch.aten.to.dtype %75, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %77 = torch.valsem.aten.copy %76, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %78 = torch_c.from_builtin_tensor %15 : tensor<f32> -> !torch.vtensor<[],f32>
    %79 = torch.aten.where.self %77, %71, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %80 = torch_c.to_builtin_tensor %79 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %81 = "tosa.reduce_max"(%80) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %82 = "tosa.sub"(%80, %81) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %83 = "tosa.exp"(%82) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %84 = "tosa.reduce_sum"(%83) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %85 = "tosa.reciprocal"(%84) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %86 = "tosa.mul"(%83, %85) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %87 = "tosa.reshape"(%86) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %88 = "tosa.reshape"(%62) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %89 = "tosa.matmul"(%87, %88) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %90 = "tosa.reshape"(%89) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %91 = "tosa.transpose"(%90, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %92 = "tosa.reshape"(%91) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %93 = "tosa.reshape"(%18) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
    %94 = "tosa.matmul"(%92, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %95 = "tosa.reshape"(%94) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %96 = "tosa.reshape"(%22) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
    %97 = "tosa.add"(%96, %95) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %98 = "tosa.reshape"(%97) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %99 = "tosa.add"(%98, %33) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %100 = "tosa.reduce_sum"(%99) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %101 = "tosa.mul"(%100, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %102 = "tosa.sub"(%99, %101) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %103 = "tosa.mul"(%102, %102) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %104 = "tosa.reduce_sum"(%103) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %105 = "tosa.mul"(%104, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %106 = "tosa.add"(%105, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %107 = "tosa.rsqrt"(%106) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %108 = "tosa.mul"(%102, %107) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %109 = "tosa.mul"(%108, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %110 = "tosa.add"(%109, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %111 = "tosa.reshape"(%20) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
    %112 = "tosa.matmul"(%110, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %113 = "tosa.reshape"(%112) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %114 = "tosa.reshape"(%19) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
    %115 = "tosa.add"(%114, %113) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %116 = "tosa.reshape"(%115) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %117 = "tosa.mul"(%116, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %118 = "tosa.pow"(%116, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %119 = "tosa.mul"(%118, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %120 = "tosa.add"(%116, %119) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %121 = "tosa.mul"(%120, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %122 = "tosa.tanh"(%121) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %123 = "tosa.add"(%122, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %124 = "tosa.mul"(%117, %123) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %125 = "tosa.reshape"(%21) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
    %126 = "tosa.matmul"(%124, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %127 = "tosa.reshape"(%126) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %128 = "tosa.add"(%96, %127) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %129 = "tosa.reshape"(%128) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %130 = "tosa.add"(%99, %129) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %131 = "tosa.reduce_sum"(%130) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %132 = "tosa.mul"(%131, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %133 = "tosa.sub"(%130, %132) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %134 = "tosa.mul"(%133, %133) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %135 = "tosa.reduce_sum"(%134) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %136 = "tosa.mul"(%135, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %137 = "tosa.add"(%136, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %138 = "tosa.rsqrt"(%137) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %139 = "tosa.mul"(%133, %138) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %140 = "tosa.mul"(%139, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %141 = "tosa.add"(%140, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %142 = "tosa.matmul"(%141, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %143 = "tosa.reshape"(%142) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %144 = "tosa.add"(%51, %143) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %145 = "tosa.reshape"(%144) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %146 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %147 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %148 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %149 = "tosa.reshape"(%146) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %150 = "tosa.transpose"(%149, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %151 = "tosa.reshape"(%147) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %152 = "tosa.transpose"(%151, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %153 = "tosa.reshape"(%148) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %154 = "tosa.transpose"(%153, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %155 = "tosa.transpose"(%152, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %156 = "tosa.reshape"(%150) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %157 = "tosa.reshape"(%155) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %158 = "tosa.matmul"(%156, %157) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %159 = "tosa.reshape"(%158) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %160 = "tosa.mul"(%159, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %161 = torch_c.from_builtin_tensor %160 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %162 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %163 = torch.aten.to.dtype %162, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %164 = torch.valsem.aten.copy %163, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %165 = torch.aten.where.self %164, %161, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %166 = torch_c.to_builtin_tensor %165 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %167 = "tosa.reduce_max"(%166) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %168 = "tosa.sub"(%166, %167) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %169 = "tosa.exp"(%168) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %170 = "tosa.reduce_sum"(%169) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %171 = "tosa.reciprocal"(%170) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %172 = "tosa.mul"(%169, %171) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %173 = "tosa.reshape"(%172) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %174 = "tosa.reshape"(%154) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %175 = "tosa.matmul"(%173, %174) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %176 = "tosa.reshape"(%175) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %177 = "tosa.transpose"(%176, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %178 = "tosa.reshape"(%177) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %179 = "tosa.matmul"(%178, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %180 = "tosa.reshape"(%179) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %181 = "tosa.add"(%96, %180) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %182 = "tosa.reshape"(%181) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %183 = "tosa.add"(%182, %130) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %184 = "tosa.reduce_sum"(%183) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %185 = "tosa.mul"(%184, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %186 = "tosa.sub"(%183, %185) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %187 = "tosa.mul"(%186, %186) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %188 = "tosa.reduce_sum"(%187) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %189 = "tosa.mul"(%188, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %190 = "tosa.add"(%189, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %191 = "tosa.rsqrt"(%190) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %192 = "tosa.mul"(%186, %191) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %193 = "tosa.mul"(%192, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %194 = "tosa.add"(%193, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %195 = "tosa.matmul"(%194, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %196 = "tosa.reshape"(%195) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %197 = "tosa.add"(%114, %196) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %198 = "tosa.reshape"(%197) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %199 = "tosa.mul"(%198, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %200 = "tosa.pow"(%198, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %201 = "tosa.mul"(%200, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %202 = "tosa.add"(%198, %201) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %203 = "tosa.mul"(%202, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %204 = "tosa.tanh"(%203) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %205 = "tosa.add"(%204, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %206 = "tosa.mul"(%199, %205) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %207 = "tosa.matmul"(%206, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %208 = "tosa.reshape"(%207) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %209 = "tosa.add"(%96, %208) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %210 = "tosa.reshape"(%209) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %211 = "tosa.add"(%183, %210) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %212 = "tosa.reduce_sum"(%211) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %213 = "tosa.mul"(%212, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %214 = "tosa.sub"(%211, %213) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %215 = "tosa.mul"(%214, %214) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %216 = "tosa.reduce_sum"(%215) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %217 = "tosa.mul"(%216, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %218 = "tosa.add"(%217, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %219 = "tosa.rsqrt"(%218) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %220 = "tosa.mul"(%214, %219) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %221 = "tosa.mul"(%220, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %222 = "tosa.add"(%221, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %223 = "tosa.matmul"(%222, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %224 = "tosa.reshape"(%223) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %225 = "tosa.add"(%51, %224) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %226 = "tosa.reshape"(%225) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %227 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %228 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %229 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %230 = "tosa.reshape"(%227) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %231 = "tosa.transpose"(%230, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %232 = "tosa.reshape"(%228) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %233 = "tosa.transpose"(%232, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %234 = "tosa.reshape"(%229) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %235 = "tosa.transpose"(%234, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %236 = "tosa.transpose"(%233, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %237 = "tosa.reshape"(%231) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %238 = "tosa.reshape"(%236) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %239 = "tosa.matmul"(%237, %238) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %240 = "tosa.reshape"(%239) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %241 = "tosa.mul"(%240, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %242 = torch_c.from_builtin_tensor %241 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %243 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %244 = torch.aten.to.dtype %243, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %245 = torch.valsem.aten.copy %244, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %246 = torch.aten.where.self %245, %242, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %247 = torch_c.to_builtin_tensor %246 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %248 = "tosa.reduce_max"(%247) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %249 = "tosa.sub"(%247, %248) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %250 = "tosa.exp"(%249) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %251 = "tosa.reduce_sum"(%250) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %252 = "tosa.reciprocal"(%251) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %253 = "tosa.mul"(%250, %252) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %254 = "tosa.reshape"(%253) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %255 = "tosa.reshape"(%235) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %256 = "tosa.matmul"(%254, %255) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %257 = "tosa.reshape"(%256) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %258 = "tosa.transpose"(%257, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %259 = "tosa.reshape"(%258) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %260 = "tosa.matmul"(%259, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %261 = "tosa.reshape"(%260) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %262 = "tosa.add"(%96, %261) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %263 = "tosa.reshape"(%262) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %264 = "tosa.add"(%263, %211) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %265 = "tosa.reduce_sum"(%264) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %266 = "tosa.mul"(%265, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %267 = "tosa.sub"(%264, %266) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %268 = "tosa.mul"(%267, %267) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %269 = "tosa.reduce_sum"(%268) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %270 = "tosa.mul"(%269, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %271 = "tosa.add"(%270, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %272 = "tosa.rsqrt"(%271) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %273 = "tosa.mul"(%267, %272) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %274 = "tosa.mul"(%273, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %275 = "tosa.add"(%274, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %276 = "tosa.matmul"(%275, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %277 = "tosa.reshape"(%276) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %278 = "tosa.add"(%114, %277) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %279 = "tosa.reshape"(%278) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %280 = "tosa.mul"(%279, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %281 = "tosa.pow"(%279, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %282 = "tosa.mul"(%281, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %283 = "tosa.add"(%279, %282) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %284 = "tosa.mul"(%283, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %285 = "tosa.tanh"(%284) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %286 = "tosa.add"(%285, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %287 = "tosa.mul"(%280, %286) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %288 = "tosa.matmul"(%287, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %289 = "tosa.reshape"(%288) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %290 = "tosa.add"(%96, %289) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %291 = "tosa.reshape"(%290) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %292 = "tosa.add"(%264, %291) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %293 = "tosa.reduce_sum"(%292) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %294 = "tosa.mul"(%293, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %295 = "tosa.sub"(%292, %294) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %296 = "tosa.mul"(%295, %295) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %297 = "tosa.reduce_sum"(%296) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %298 = "tosa.mul"(%297, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %299 = "tosa.add"(%298, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %300 = "tosa.rsqrt"(%299) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %301 = "tosa.mul"(%295, %300) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %302 = "tosa.mul"(%301, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %303 = "tosa.add"(%302, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %304 = "tosa.matmul"(%303, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %305 = "tosa.reshape"(%304) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %306 = "tosa.add"(%51, %305) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %307 = "tosa.reshape"(%306) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %308 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %309 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %310 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %311 = "tosa.reshape"(%308) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %312 = "tosa.transpose"(%311, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %313 = "tosa.reshape"(%309) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %314 = "tosa.transpose"(%313, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %315 = "tosa.reshape"(%310) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %316 = "tosa.transpose"(%315, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %317 = "tosa.transpose"(%314, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %318 = "tosa.reshape"(%312) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %319 = "tosa.reshape"(%317) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %320 = "tosa.matmul"(%318, %319) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %321 = "tosa.reshape"(%320) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %322 = "tosa.mul"(%321, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %323 = torch_c.from_builtin_tensor %322 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %324 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %325 = torch.aten.to.dtype %324, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %326 = torch.valsem.aten.copy %325, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %327 = torch.aten.where.self %326, %323, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %328 = torch_c.to_builtin_tensor %327 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %329 = "tosa.reduce_max"(%328) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %330 = "tosa.sub"(%328, %329) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %331 = "tosa.exp"(%330) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %332 = "tosa.reduce_sum"(%331) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %333 = "tosa.reciprocal"(%332) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %334 = "tosa.mul"(%331, %333) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %335 = "tosa.reshape"(%334) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %336 = "tosa.reshape"(%316) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %337 = "tosa.matmul"(%335, %336) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %338 = "tosa.reshape"(%337) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %339 = "tosa.transpose"(%338, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %340 = "tosa.reshape"(%339) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %341 = "tosa.matmul"(%340, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %342 = "tosa.reshape"(%341) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %343 = "tosa.add"(%96, %342) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %344 = "tosa.reshape"(%343) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %345 = "tosa.add"(%344, %292) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %346 = "tosa.reduce_sum"(%345) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %347 = "tosa.mul"(%346, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %348 = "tosa.sub"(%345, %347) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %349 = "tosa.mul"(%348, %348) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %350 = "tosa.reduce_sum"(%349) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %351 = "tosa.mul"(%350, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %352 = "tosa.add"(%351, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %353 = "tosa.rsqrt"(%352) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %354 = "tosa.mul"(%348, %353) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %355 = "tosa.mul"(%354, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %356 = "tosa.add"(%355, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %357 = "tosa.matmul"(%356, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %358 = "tosa.reshape"(%357) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %359 = "tosa.add"(%114, %358) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %360 = "tosa.reshape"(%359) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %361 = "tosa.mul"(%360, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %362 = "tosa.pow"(%360, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %363 = "tosa.mul"(%362, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %364 = "tosa.add"(%360, %363) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %365 = "tosa.mul"(%364, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %366 = "tosa.tanh"(%365) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %367 = "tosa.add"(%366, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %368 = "tosa.mul"(%361, %367) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %369 = "tosa.matmul"(%368, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %370 = "tosa.reshape"(%369) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %371 = "tosa.add"(%96, %370) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %372 = "tosa.reshape"(%371) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %373 = "tosa.add"(%345, %372) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %374 = "tosa.reduce_sum"(%373) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %375 = "tosa.mul"(%374, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %376 = "tosa.sub"(%373, %375) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %377 = "tosa.mul"(%376, %376) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %378 = "tosa.reduce_sum"(%377) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %379 = "tosa.mul"(%378, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %380 = "tosa.add"(%379, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %381 = "tosa.rsqrt"(%380) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %382 = "tosa.mul"(%376, %381) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %383 = "tosa.mul"(%382, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %384 = "tosa.add"(%383, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %385 = "tosa.matmul"(%384, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %386 = "tosa.reshape"(%385) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %387 = "tosa.add"(%51, %386) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %388 = "tosa.reshape"(%387) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %389 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %390 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %391 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %392 = "tosa.reshape"(%389) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %393 = "tosa.transpose"(%392, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %394 = "tosa.reshape"(%390) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %395 = "tosa.transpose"(%394, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %396 = "tosa.reshape"(%391) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %397 = "tosa.transpose"(%396, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %398 = "tosa.transpose"(%395, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %399 = "tosa.reshape"(%393) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %400 = "tosa.reshape"(%398) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %401 = "tosa.matmul"(%399, %400) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %402 = "tosa.reshape"(%401) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %403 = "tosa.mul"(%402, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %404 = torch_c.from_builtin_tensor %403 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %405 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %406 = torch.aten.to.dtype %405, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %407 = torch.valsem.aten.copy %406, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %408 = torch.aten.where.self %407, %404, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %409 = torch_c.to_builtin_tensor %408 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %410 = "tosa.reduce_max"(%409) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %411 = "tosa.sub"(%409, %410) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %412 = "tosa.exp"(%411) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %413 = "tosa.reduce_sum"(%412) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %414 = "tosa.reciprocal"(%413) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %415 = "tosa.mul"(%412, %414) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %416 = "tosa.reshape"(%415) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %417 = "tosa.reshape"(%397) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %418 = "tosa.matmul"(%416, %417) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %419 = "tosa.reshape"(%418) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %420 = "tosa.transpose"(%419, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %421 = "tosa.reshape"(%420) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %422 = "tosa.matmul"(%421, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %423 = "tosa.reshape"(%422) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %424 = "tosa.add"(%96, %423) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %425 = "tosa.reshape"(%424) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %426 = "tosa.add"(%425, %373) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %427 = "tosa.reduce_sum"(%426) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %428 = "tosa.mul"(%427, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %429 = "tosa.sub"(%426, %428) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %430 = "tosa.mul"(%429, %429) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %431 = "tosa.reduce_sum"(%430) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %432 = "tosa.mul"(%431, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %433 = "tosa.add"(%432, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %434 = "tosa.rsqrt"(%433) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %435 = "tosa.mul"(%429, %434) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %436 = "tosa.mul"(%435, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %437 = "tosa.add"(%436, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %438 = "tosa.matmul"(%437, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %439 = "tosa.reshape"(%438) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %440 = "tosa.add"(%114, %439) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %441 = "tosa.reshape"(%440) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %442 = "tosa.mul"(%441, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %443 = "tosa.pow"(%441, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %444 = "tosa.mul"(%443, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %445 = "tosa.add"(%441, %444) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %446 = "tosa.mul"(%445, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %447 = "tosa.tanh"(%446) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %448 = "tosa.add"(%447, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %449 = "tosa.mul"(%442, %448) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %450 = "tosa.matmul"(%449, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %451 = "tosa.reshape"(%450) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %452 = "tosa.add"(%96, %451) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %453 = "tosa.reshape"(%452) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %454 = "tosa.add"(%426, %453) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %455 = "tosa.reduce_sum"(%454) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %456 = "tosa.mul"(%455, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %457 = "tosa.sub"(%454, %456) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %458 = "tosa.mul"(%457, %457) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %459 = "tosa.reduce_sum"(%458) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %460 = "tosa.mul"(%459, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %461 = "tosa.add"(%460, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %462 = "tosa.rsqrt"(%461) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %463 = "tosa.mul"(%457, %462) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %464 = "tosa.mul"(%463, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %465 = "tosa.add"(%464, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %466 = "tosa.matmul"(%465, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %467 = "tosa.reshape"(%466) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %468 = "tosa.add"(%51, %467) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %469 = "tosa.reshape"(%468) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %470 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %471 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %472 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %473 = "tosa.reshape"(%470) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %474 = "tosa.transpose"(%473, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %475 = "tosa.reshape"(%471) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %476 = "tosa.transpose"(%475, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %477 = "tosa.reshape"(%472) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %478 = "tosa.transpose"(%477, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %479 = "tosa.transpose"(%476, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %480 = "tosa.reshape"(%474) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %481 = "tosa.reshape"(%479) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %482 = "tosa.matmul"(%480, %481) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %483 = "tosa.reshape"(%482) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %484 = "tosa.mul"(%483, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %485 = torch_c.from_builtin_tensor %484 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %486 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %487 = torch.aten.to.dtype %486, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %488 = torch.valsem.aten.copy %487, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %489 = torch.aten.where.self %488, %485, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %490 = torch_c.to_builtin_tensor %489 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %491 = "tosa.reduce_max"(%490) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %492 = "tosa.sub"(%490, %491) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %493 = "tosa.exp"(%492) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %494 = "tosa.reduce_sum"(%493) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %495 = "tosa.reciprocal"(%494) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %496 = "tosa.mul"(%493, %495) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %497 = "tosa.reshape"(%496) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %498 = "tosa.reshape"(%478) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %499 = "tosa.matmul"(%497, %498) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %500 = "tosa.reshape"(%499) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %501 = "tosa.transpose"(%500, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %502 = "tosa.reshape"(%501) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %503 = "tosa.matmul"(%502, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %504 = "tosa.reshape"(%503) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %505 = "tosa.add"(%96, %504) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %506 = "tosa.reshape"(%505) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %507 = "tosa.add"(%506, %454) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %508 = "tosa.reduce_sum"(%507) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %509 = "tosa.mul"(%508, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %510 = "tosa.sub"(%507, %509) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %511 = "tosa.mul"(%510, %510) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %512 = "tosa.reduce_sum"(%511) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %513 = "tosa.mul"(%512, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %514 = "tosa.add"(%513, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %515 = "tosa.rsqrt"(%514) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %516 = "tosa.mul"(%510, %515) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %517 = "tosa.mul"(%516, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %518 = "tosa.add"(%517, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %519 = "tosa.matmul"(%518, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %520 = "tosa.reshape"(%519) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %521 = "tosa.add"(%114, %520) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %522 = "tosa.reshape"(%521) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %523 = "tosa.mul"(%522, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %524 = "tosa.pow"(%522, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %525 = "tosa.mul"(%524, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %526 = "tosa.add"(%522, %525) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %527 = "tosa.mul"(%526, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %528 = "tosa.tanh"(%527) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %529 = "tosa.add"(%528, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %530 = "tosa.mul"(%523, %529) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %531 = "tosa.matmul"(%530, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %532 = "tosa.reshape"(%531) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %533 = "tosa.add"(%96, %532) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %534 = "tosa.reshape"(%533) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %535 = "tosa.add"(%507, %534) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %536 = "tosa.reduce_sum"(%535) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %537 = "tosa.mul"(%536, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %538 = "tosa.sub"(%535, %537) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %539 = "tosa.mul"(%538, %538) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %540 = "tosa.reduce_sum"(%539) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %541 = "tosa.mul"(%540, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %542 = "tosa.add"(%541, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %543 = "tosa.rsqrt"(%542) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %544 = "tosa.mul"(%538, %543) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %545 = "tosa.mul"(%544, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %546 = "tosa.add"(%545, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %547 = "tosa.matmul"(%546, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %548 = "tosa.reshape"(%547) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %549 = "tosa.add"(%51, %548) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %550 = "tosa.reshape"(%549) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %551 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %552 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %553 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %554 = "tosa.reshape"(%551) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %555 = "tosa.transpose"(%554, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %556 = "tosa.reshape"(%552) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %557 = "tosa.transpose"(%556, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %558 = "tosa.reshape"(%553) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %559 = "tosa.transpose"(%558, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %560 = "tosa.transpose"(%557, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %561 = "tosa.reshape"(%555) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %562 = "tosa.reshape"(%560) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %563 = "tosa.matmul"(%561, %562) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %564 = "tosa.reshape"(%563) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %565 = "tosa.mul"(%564, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %566 = torch_c.from_builtin_tensor %565 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %567 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %568 = torch.aten.to.dtype %567, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %569 = torch.valsem.aten.copy %568, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %570 = torch.aten.where.self %569, %566, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %571 = torch_c.to_builtin_tensor %570 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %572 = "tosa.reduce_max"(%571) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %573 = "tosa.sub"(%571, %572) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %574 = "tosa.exp"(%573) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %575 = "tosa.reduce_sum"(%574) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %576 = "tosa.reciprocal"(%575) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %577 = "tosa.mul"(%574, %576) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %578 = "tosa.reshape"(%577) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %579 = "tosa.reshape"(%559) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %580 = "tosa.matmul"(%578, %579) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %581 = "tosa.reshape"(%580) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %582 = "tosa.transpose"(%581, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %583 = "tosa.reshape"(%582) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %584 = "tosa.matmul"(%583, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %585 = "tosa.reshape"(%584) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %586 = "tosa.add"(%96, %585) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %587 = "tosa.reshape"(%586) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %588 = "tosa.add"(%587, %535) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %589 = "tosa.reduce_sum"(%588) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %590 = "tosa.mul"(%589, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %591 = "tosa.sub"(%588, %590) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %592 = "tosa.mul"(%591, %591) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %593 = "tosa.reduce_sum"(%592) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %594 = "tosa.mul"(%593, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %595 = "tosa.add"(%594, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %596 = "tosa.rsqrt"(%595) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %597 = "tosa.mul"(%591, %596) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %598 = "tosa.mul"(%597, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %599 = "tosa.add"(%598, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %600 = "tosa.matmul"(%599, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %601 = "tosa.reshape"(%600) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %602 = "tosa.add"(%114, %601) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %603 = "tosa.reshape"(%602) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %604 = "tosa.mul"(%603, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %605 = "tosa.pow"(%603, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %606 = "tosa.mul"(%605, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %607 = "tosa.add"(%603, %606) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %608 = "tosa.mul"(%607, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %609 = "tosa.tanh"(%608) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %610 = "tosa.add"(%609, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %611 = "tosa.mul"(%604, %610) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %612 = "tosa.matmul"(%611, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %613 = "tosa.reshape"(%612) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %614 = "tosa.add"(%96, %613) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %615 = "tosa.reshape"(%614) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %616 = "tosa.add"(%588, %615) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %617 = "tosa.reduce_sum"(%616) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %618 = "tosa.mul"(%617, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %619 = "tosa.sub"(%616, %618) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %620 = "tosa.mul"(%619, %619) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %621 = "tosa.reduce_sum"(%620) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %622 = "tosa.mul"(%621, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %623 = "tosa.add"(%622, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %624 = "tosa.rsqrt"(%623) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %625 = "tosa.mul"(%619, %624) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %626 = "tosa.mul"(%625, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %627 = "tosa.add"(%626, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %628 = "tosa.matmul"(%627, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %629 = "tosa.reshape"(%628) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %630 = "tosa.add"(%51, %629) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %631 = "tosa.reshape"(%630) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %632 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %633 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %634 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %635 = "tosa.reshape"(%632) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %636 = "tosa.transpose"(%635, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %637 = "tosa.reshape"(%633) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %638 = "tosa.transpose"(%637, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %639 = "tosa.reshape"(%634) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %640 = "tosa.transpose"(%639, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %641 = "tosa.transpose"(%638, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %642 = "tosa.reshape"(%636) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %643 = "tosa.reshape"(%641) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %644 = "tosa.matmul"(%642, %643) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %645 = "tosa.reshape"(%644) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %646 = "tosa.mul"(%645, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %647 = torch_c.from_builtin_tensor %646 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %648 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %649 = torch.aten.to.dtype %648, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %650 = torch.valsem.aten.copy %649, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %651 = torch.aten.where.self %650, %647, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %652 = torch_c.to_builtin_tensor %651 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %653 = "tosa.reduce_max"(%652) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %654 = "tosa.sub"(%652, %653) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %655 = "tosa.exp"(%654) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %656 = "tosa.reduce_sum"(%655) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %657 = "tosa.reciprocal"(%656) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %658 = "tosa.mul"(%655, %657) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %659 = "tosa.reshape"(%658) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %660 = "tosa.reshape"(%640) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %661 = "tosa.matmul"(%659, %660) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %662 = "tosa.reshape"(%661) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %663 = "tosa.transpose"(%662, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %664 = "tosa.reshape"(%663) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %665 = "tosa.matmul"(%664, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %666 = "tosa.reshape"(%665) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %667 = "tosa.add"(%96, %666) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %668 = "tosa.reshape"(%667) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %669 = "tosa.add"(%668, %616) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %670 = "tosa.reduce_sum"(%669) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %671 = "tosa.mul"(%670, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %672 = "tosa.sub"(%669, %671) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %673 = "tosa.mul"(%672, %672) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %674 = "tosa.reduce_sum"(%673) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %675 = "tosa.mul"(%674, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %676 = "tosa.add"(%675, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %677 = "tosa.rsqrt"(%676) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %678 = "tosa.mul"(%672, %677) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %679 = "tosa.mul"(%678, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %680 = "tosa.add"(%679, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %681 = "tosa.matmul"(%680, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %682 = "tosa.reshape"(%681) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %683 = "tosa.add"(%114, %682) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %684 = "tosa.reshape"(%683) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %685 = "tosa.mul"(%684, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %686 = "tosa.pow"(%684, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %687 = "tosa.mul"(%686, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %688 = "tosa.add"(%684, %687) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %689 = "tosa.mul"(%688, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %690 = "tosa.tanh"(%689) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %691 = "tosa.add"(%690, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %692 = "tosa.mul"(%685, %691) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %693 = "tosa.matmul"(%692, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %694 = "tosa.reshape"(%693) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %695 = "tosa.add"(%96, %694) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %696 = "tosa.reshape"(%695) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %697 = "tosa.add"(%669, %696) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %698 = "tosa.reduce_sum"(%697) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %699 = "tosa.mul"(%698, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %700 = "tosa.sub"(%697, %699) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %701 = "tosa.mul"(%700, %700) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %702 = "tosa.reduce_sum"(%701) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %703 = "tosa.mul"(%702, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %704 = "tosa.add"(%703, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %705 = "tosa.rsqrt"(%704) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %706 = "tosa.mul"(%700, %705) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %707 = "tosa.mul"(%706, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %708 = "tosa.add"(%707, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %709 = "tosa.matmul"(%708, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %710 = "tosa.reshape"(%709) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %711 = "tosa.add"(%51, %710) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %712 = "tosa.reshape"(%711) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %713 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %714 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %715 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %716 = "tosa.reshape"(%713) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %717 = "tosa.transpose"(%716, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %718 = "tosa.reshape"(%714) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %719 = "tosa.transpose"(%718, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %720 = "tosa.reshape"(%715) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %721 = "tosa.transpose"(%720, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %722 = "tosa.transpose"(%719, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %723 = "tosa.reshape"(%717) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %724 = "tosa.reshape"(%722) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %725 = "tosa.matmul"(%723, %724) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %726 = "tosa.reshape"(%725) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %727 = "tosa.mul"(%726, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %728 = torch_c.from_builtin_tensor %727 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %729 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %730 = torch.aten.to.dtype %729, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %731 = torch.valsem.aten.copy %730, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %732 = torch.aten.where.self %731, %728, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %733 = torch_c.to_builtin_tensor %732 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %734 = "tosa.reduce_max"(%733) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %735 = "tosa.sub"(%733, %734) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %736 = "tosa.exp"(%735) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %737 = "tosa.reduce_sum"(%736) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %738 = "tosa.reciprocal"(%737) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %739 = "tosa.mul"(%736, %738) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %740 = "tosa.reshape"(%739) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %741 = "tosa.reshape"(%721) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %742 = "tosa.matmul"(%740, %741) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %743 = "tosa.reshape"(%742) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %744 = "tosa.transpose"(%743, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %745 = "tosa.reshape"(%744) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %746 = "tosa.matmul"(%745, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %747 = "tosa.reshape"(%746) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %748 = "tosa.add"(%96, %747) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %749 = "tosa.reshape"(%748) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %750 = "tosa.add"(%749, %697) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %751 = "tosa.reduce_sum"(%750) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %752 = "tosa.mul"(%751, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %753 = "tosa.sub"(%750, %752) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %754 = "tosa.mul"(%753, %753) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %755 = "tosa.reduce_sum"(%754) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %756 = "tosa.mul"(%755, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %757 = "tosa.add"(%756, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %758 = "tosa.rsqrt"(%757) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %759 = "tosa.mul"(%753, %758) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %760 = "tosa.mul"(%759, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %761 = "tosa.add"(%760, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %762 = "tosa.matmul"(%761, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %763 = "tosa.reshape"(%762) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %764 = "tosa.add"(%114, %763) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %765 = "tosa.reshape"(%764) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %766 = "tosa.mul"(%765, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %767 = "tosa.pow"(%765, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %768 = "tosa.mul"(%767, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %769 = "tosa.add"(%765, %768) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %770 = "tosa.mul"(%769, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %771 = "tosa.tanh"(%770) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %772 = "tosa.add"(%771, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %773 = "tosa.mul"(%766, %772) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %774 = "tosa.matmul"(%773, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %775 = "tosa.reshape"(%774) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %776 = "tosa.add"(%96, %775) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %777 = "tosa.reshape"(%776) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %778 = "tosa.add"(%750, %777) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %779 = "tosa.reduce_sum"(%778) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %780 = "tosa.mul"(%779, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %781 = "tosa.sub"(%778, %780) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %782 = "tosa.mul"(%781, %781) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %783 = "tosa.reduce_sum"(%782) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %784 = "tosa.mul"(%783, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %785 = "tosa.add"(%784, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %786 = "tosa.rsqrt"(%785) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %787 = "tosa.mul"(%781, %786) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %788 = "tosa.mul"(%787, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %789 = "tosa.add"(%788, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %790 = "tosa.matmul"(%789, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %791 = "tosa.reshape"(%790) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %792 = "tosa.add"(%51, %791) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %793 = "tosa.reshape"(%792) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %794 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %795 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %796 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %797 = "tosa.reshape"(%794) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %798 = "tosa.transpose"(%797, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %799 = "tosa.reshape"(%795) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %800 = "tosa.transpose"(%799, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %801 = "tosa.reshape"(%796) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %802 = "tosa.transpose"(%801, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %803 = "tosa.transpose"(%800, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %804 = "tosa.reshape"(%798) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %805 = "tosa.reshape"(%803) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %806 = "tosa.matmul"(%804, %805) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %807 = "tosa.reshape"(%806) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %808 = "tosa.mul"(%807, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %809 = torch_c.from_builtin_tensor %808 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %810 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %811 = torch.aten.to.dtype %810, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %812 = torch.valsem.aten.copy %811, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %813 = torch.aten.where.self %812, %809, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %814 = torch_c.to_builtin_tensor %813 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %815 = "tosa.reduce_max"(%814) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %816 = "tosa.sub"(%814, %815) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %817 = "tosa.exp"(%816) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %818 = "tosa.reduce_sum"(%817) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %819 = "tosa.reciprocal"(%818) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %820 = "tosa.mul"(%817, %819) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %821 = "tosa.reshape"(%820) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %822 = "tosa.reshape"(%802) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %823 = "tosa.matmul"(%821, %822) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %824 = "tosa.reshape"(%823) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %825 = "tosa.transpose"(%824, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %826 = "tosa.reshape"(%825) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %827 = "tosa.matmul"(%826, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %828 = "tosa.reshape"(%827) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %829 = "tosa.add"(%96, %828) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %830 = "tosa.reshape"(%829) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %831 = "tosa.add"(%830, %778) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %832 = "tosa.reduce_sum"(%831) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %833 = "tosa.mul"(%832, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %834 = "tosa.sub"(%831, %833) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %835 = "tosa.mul"(%834, %834) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %836 = "tosa.reduce_sum"(%835) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %837 = "tosa.mul"(%836, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %838 = "tosa.add"(%837, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %839 = "tosa.rsqrt"(%838) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %840 = "tosa.mul"(%834, %839) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %841 = "tosa.mul"(%840, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %842 = "tosa.add"(%841, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %843 = "tosa.matmul"(%842, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %844 = "tosa.reshape"(%843) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %845 = "tosa.add"(%114, %844) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %846 = "tosa.reshape"(%845) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %847 = "tosa.mul"(%846, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %848 = "tosa.pow"(%846, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %849 = "tosa.mul"(%848, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %850 = "tosa.add"(%846, %849) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %851 = "tosa.mul"(%850, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %852 = "tosa.tanh"(%851) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %853 = "tosa.add"(%852, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %854 = "tosa.mul"(%847, %853) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %855 = "tosa.matmul"(%854, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %856 = "tosa.reshape"(%855) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %857 = "tosa.add"(%96, %856) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %858 = "tosa.reshape"(%857) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %859 = "tosa.add"(%831, %858) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %860 = "tosa.reduce_sum"(%859) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %861 = "tosa.mul"(%860, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %862 = "tosa.sub"(%859, %861) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %863 = "tosa.mul"(%862, %862) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %864 = "tosa.reduce_sum"(%863) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %865 = "tosa.mul"(%864, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %866 = "tosa.add"(%865, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %867 = "tosa.rsqrt"(%866) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %868 = "tosa.mul"(%862, %867) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %869 = "tosa.mul"(%868, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %870 = "tosa.add"(%869, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %871 = "tosa.matmul"(%870, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %872 = "tosa.reshape"(%871) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %873 = "tosa.add"(%51, %872) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %874 = "tosa.reshape"(%873) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %875 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %876 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %877 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %878 = "tosa.reshape"(%875) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %879 = "tosa.transpose"(%878, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %880 = "tosa.reshape"(%876) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %881 = "tosa.transpose"(%880, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %882 = "tosa.reshape"(%877) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %883 = "tosa.transpose"(%882, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %884 = "tosa.transpose"(%881, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %885 = "tosa.reshape"(%879) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %886 = "tosa.reshape"(%884) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %887 = "tosa.matmul"(%885, %886) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %888 = "tosa.reshape"(%887) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %889 = "tosa.mul"(%888, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %890 = torch_c.from_builtin_tensor %889 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %891 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %892 = torch.aten.to.dtype %891, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %893 = torch.valsem.aten.copy %892, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %894 = torch.aten.where.self %893, %890, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %895 = torch_c.to_builtin_tensor %894 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %896 = "tosa.reduce_max"(%895) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %897 = "tosa.sub"(%895, %896) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %898 = "tosa.exp"(%897) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %899 = "tosa.reduce_sum"(%898) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %900 = "tosa.reciprocal"(%899) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %901 = "tosa.mul"(%898, %900) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %902 = "tosa.reshape"(%901) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %903 = "tosa.reshape"(%883) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %904 = "tosa.matmul"(%902, %903) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %905 = "tosa.reshape"(%904) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %906 = "tosa.transpose"(%905, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %907 = "tosa.reshape"(%906) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %908 = "tosa.matmul"(%907, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %909 = "tosa.reshape"(%908) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %910 = "tosa.add"(%96, %909) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %911 = "tosa.reshape"(%910) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %912 = "tosa.add"(%911, %859) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %913 = "tosa.reduce_sum"(%912) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %914 = "tosa.mul"(%913, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %915 = "tosa.sub"(%912, %914) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %916 = "tosa.mul"(%915, %915) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %917 = "tosa.reduce_sum"(%916) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %918 = "tosa.mul"(%917, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %919 = "tosa.add"(%918, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %920 = "tosa.rsqrt"(%919) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %921 = "tosa.mul"(%915, %920) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %922 = "tosa.mul"(%921, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %923 = "tosa.add"(%922, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %924 = "tosa.matmul"(%923, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %925 = "tosa.reshape"(%924) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %926 = "tosa.add"(%114, %925) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %927 = "tosa.reshape"(%926) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %928 = "tosa.mul"(%927, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %929 = "tosa.pow"(%927, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %930 = "tosa.mul"(%929, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %931 = "tosa.add"(%927, %930) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %932 = "tosa.mul"(%931, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %933 = "tosa.tanh"(%932) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %934 = "tosa.add"(%933, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %935 = "tosa.mul"(%928, %934) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %936 = "tosa.matmul"(%935, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %937 = "tosa.reshape"(%936) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %938 = "tosa.add"(%96, %937) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %939 = "tosa.reshape"(%938) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %940 = "tosa.add"(%912, %939) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %941 = "tosa.reduce_sum"(%940) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %942 = "tosa.mul"(%941, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %943 = "tosa.sub"(%940, %942) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %944 = "tosa.mul"(%943, %943) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %945 = "tosa.reduce_sum"(%944) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %946 = "tosa.mul"(%945, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %947 = "tosa.add"(%946, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %948 = "tosa.rsqrt"(%947) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %949 = "tosa.mul"(%943, %948) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %950 = "tosa.mul"(%949, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %951 = "tosa.add"(%950, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %952 = "tosa.matmul"(%951, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
    %953 = "tosa.reshape"(%952) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
    %954 = "tosa.add"(%51, %953) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
    %955 = "tosa.reshape"(%954) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
    %956 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %957 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %958 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
    %959 = "tosa.reshape"(%956) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %960 = "tosa.transpose"(%959, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %961 = "tosa.reshape"(%957) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %962 = "tosa.transpose"(%961, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %963 = "tosa.reshape"(%958) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
    %964 = "tosa.transpose"(%963, %9) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
    %965 = "tosa.transpose"(%962, %8) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
    %966 = "tosa.reshape"(%960) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %967 = "tosa.reshape"(%965) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
    %968 = "tosa.matmul"(%966, %967) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
    %969 = "tosa.reshape"(%968) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %970 = "tosa.mul"(%969, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
    %971 = torch_c.from_builtin_tensor %970 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
    %972 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
    %973 = torch.aten.to.dtype %972, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
    %974 = torch.valsem.aten.copy %973, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
    %975 = torch.aten.where.self %974, %971, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
    %976 = torch_c.to_builtin_tensor %975 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
    %977 = "tosa.reduce_max"(%976) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %978 = "tosa.sub"(%976, %977) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %979 = "tosa.exp"(%978) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
    %980 = "tosa.reduce_sum"(%979) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
    %981 = "tosa.reciprocal"(%980) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
    %982 = "tosa.mul"(%979, %981) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
    %983 = "tosa.reshape"(%982) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
    %984 = "tosa.reshape"(%964) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
    %985 = "tosa.matmul"(%983, %984) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
    %986 = "tosa.reshape"(%985) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
    %987 = "tosa.transpose"(%986, %9) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
    %988 = "tosa.reshape"(%987) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
    %989 = "tosa.matmul"(%988, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
    %990 = "tosa.reshape"(%989) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %991 = "tosa.add"(%96, %990) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %992 = "tosa.reshape"(%991) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %993 = "tosa.add"(%992, %940) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %994 = "tosa.reduce_sum"(%993) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %995 = "tosa.mul"(%994, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %996 = "tosa.sub"(%993, %995) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %997 = "tosa.mul"(%996, %996) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %998 = "tosa.reduce_sum"(%997) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %999 = "tosa.mul"(%998, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %1000 = "tosa.add"(%999, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %1001 = "tosa.rsqrt"(%1000) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %1002 = "tosa.mul"(%996, %1001) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %1003 = "tosa.mul"(%1002, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %1004 = "tosa.add"(%1003, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %1005 = "tosa.matmul"(%1004, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
    %1006 = "tosa.reshape"(%1005) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
    %1007 = "tosa.add"(%114, %1006) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
    %1008 = "tosa.reshape"(%1007) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
    %1009 = "tosa.mul"(%1008, %3) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %1010 = "tosa.pow"(%1008, %4) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %1011 = "tosa.mul"(%1010, %5) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %1012 = "tosa.add"(%1008, %1011) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %1013 = "tosa.mul"(%1012, %6) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %1014 = "tosa.tanh"(%1013) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %1015 = "tosa.add"(%1014, %1) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
    %1016 = "tosa.mul"(%1009, %1015) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
    %1017 = "tosa.matmul"(%1016, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
    %1018 = "tosa.reshape"(%1017) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
    %1019 = "tosa.add"(%96, %1018) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
    %1020 = "tosa.reshape"(%1019) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
    %1021 = "tosa.add"(%993, %1020) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %1022 = "tosa.reduce_sum"(%1021) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %1023 = "tosa.mul"(%1022, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %1024 = "tosa.sub"(%1021, %1023) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %1025 = "tosa.mul"(%1024, %1024) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
    %1026 = "tosa.reduce_sum"(%1025) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
    %1027 = "tosa.mul"(%1026, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %1028 = "tosa.add"(%1027, %2) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
    %1029 = "tosa.rsqrt"(%1028) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
    %1030 = "tosa.mul"(%1024, %1029) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
    %1031 = "tosa.mul"(%1030, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %1032 = "tosa.add"(%1031, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
    %1033 = "tosa.transpose"(%11, %7) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
    %1034 = "tosa.reshape"(%1033) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
    %1035 = "tosa.matmul"(%1032, %1034) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
    %1036 = torch_c.from_builtin_tensor %1035 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
    %1037 = torch_c.to_builtin_tensor %1036 : !torch.vtensor<[1,5,50257],f32> -> tensor<1x5x50257xf32>
    return %1037 : tensor<1x5x50257xf32>
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @forward(%arg0: tensor<1x5xi64>) -> tensor<1x5x50257xf32> {
  %int5 = torch.constant.int 5
  %int1 = torch.constant.int 1
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %0 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %1 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %3 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %5 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %7 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %9 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int11 = torch.constant.int 11
  %int4 = torch.constant.int 4
  %int0 = torch.constant.int 0
  %12 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %13 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %14 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %15 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %16 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %17 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %18 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %19 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %20 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %21 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %22 = torch_c.from_builtin_tensor %arg0 : tensor<1x5xi64> -> !torch.vtensor<[1,5],si64>
  %23 = torch_c.to_builtin_tensor %22 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %cpu = torch.constant.device "cpu"
  %24 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %25 = torch_c.to_builtin_tensor %24 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %26 = "tosa.reshape"(%25) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %27 = "tosa.reshape"(%11) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %28 = "tosa.cast"(%23) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %29 = "tosa.gather"(%27, %28) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %30 = "tosa.reshape"(%10) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %31 = "tosa.cast"(%26) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %32 = "tosa.gather"(%30, %31) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %33 = "tosa.add"(%29, %32) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %34 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %35 = "tosa.reduce_sum"(%33) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %36 = "tosa.reshape"(%34) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %37 = "tosa.mul"(%35, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %38 = "tosa.sub"(%33, %37) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %39 = "tosa.mul"(%38, %38) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %40 = "tosa.reduce_sum"(%39) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %41 = "tosa.mul"(%40, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %42 = "tosa.reshape"(%0) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %43 = "tosa.add"(%41, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %44 = "tosa.rsqrt"(%43) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %45 = "tosa.mul"(%38, %44) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %46 = "tosa.mul"(%45, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %47 = "tosa.add"(%46, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %48 = "tosa.reshape"(%5) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %49 = "tosa.matmul"(%47, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %50 = "tosa.reshape"(%49) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %51 = "tosa.reshape"(%6) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %52 = "tosa.add"(%51, %50) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %53 = "tosa.reshape"(%52) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %54 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %55 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %56 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %57 = "tosa.reshape"(%54) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %58 = "tosa.transpose"(%57, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %59 = "tosa.reshape"(%55) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %60 = "tosa.transpose"(%59, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %61 = "tosa.reshape"(%56) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %62 = "tosa.transpose"(%61, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %63 = "tosa.transpose"(%60, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %64 = "tosa.reshape"(%58) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %65 = "tosa.reshape"(%63) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %66 = "tosa.matmul"(%64, %65) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %67 = "tosa.reshape"(%66) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %68 = "tosa.reciprocal"(%9) : (tensor<f32>) -> tensor<f32>
  %69 = "tosa.reshape"(%68) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %70 = "tosa.mul"(%67, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %71 = torch_c.from_builtin_tensor %70 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %72 = "tosa.slice"(%8) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %73 = "tosa.slice"(%72) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %74 = torch_c.from_builtin_tensor %73 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %75 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %76 = torch.aten.to.dtype %75, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %77 = torch.valsem.aten.copy %76, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %78 = torch_c.from_builtin_tensor %7 : tensor<f32> -> !torch.vtensor<[],f32>
  %79 = torch.aten.where.self %77, %71, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %80 = torch_c.to_builtin_tensor %79 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %81 = "tosa.reduce_max"(%80) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %82 = "tosa.sub"(%80, %81) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %83 = "tosa.exp"(%82) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %84 = "tosa.reduce_sum"(%83) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %85 = "tosa.reciprocal"(%84) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %86 = "tosa.mul"(%83, %85) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %87 = "tosa.reshape"(%86) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %88 = "tosa.reshape"(%62) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %89 = "tosa.matmul"(%87, %88) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %90 = "tosa.reshape"(%89) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %91 = "tosa.transpose"(%90, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %92 = "tosa.reshape"(%91) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %93 = "tosa.reshape"(%4) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %94 = "tosa.matmul"(%92, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %95 = "tosa.reshape"(%94) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %96 = "tosa.reshape"(%0) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %97 = "tosa.add"(%96, %95) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %98 = "tosa.reshape"(%97) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %99 = "tosa.add"(%98, %33) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %100 = "tosa.reduce_sum"(%99) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %101 = "tosa.mul"(%100, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %102 = "tosa.sub"(%99, %101) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %103 = "tosa.mul"(%102, %102) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %104 = "tosa.reduce_sum"(%103) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %105 = "tosa.mul"(%104, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %106 = "tosa.add"(%105, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %107 = "tosa.rsqrt"(%106) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %108 = "tosa.mul"(%102, %107) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %109 = "tosa.mul"(%108, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %110 = "tosa.add"(%109, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %111 = "tosa.reshape"(%2) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %112 = "tosa.matmul"(%110, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %113 = "tosa.reshape"(%112) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %114 = "tosa.reshape"(%3) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %115 = "tosa.add"(%114, %113) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %116 = "tosa.reshape"(%115) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %117 = "tosa.mul"(%116, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %118 = "tosa.pow"(%116, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %119 = "tosa.mul"(%118, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %120 = "tosa.add"(%116, %119) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %121 = "tosa.mul"(%120, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %122 = "tosa.tanh"(%121) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %123 = "tosa.add"(%122, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %124 = "tosa.mul"(%117, %123) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %125 = "tosa.reshape"(%1) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %126 = "tosa.matmul"(%124, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %127 = "tosa.reshape"(%126) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %128 = "tosa.add"(%96, %127) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %129 = "tosa.reshape"(%128) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %130 = "tosa.add"(%99, %129) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %131 = "tosa.reduce_sum"(%130) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %132 = "tosa.mul"(%131, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %133 = "tosa.sub"(%130, %132) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %134 = "tosa.mul"(%133, %133) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %135 = "tosa.reduce_sum"(%134) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %136 = "tosa.mul"(%135, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %137 = "tosa.add"(%136, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %138 = "tosa.rsqrt"(%137) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %139 = "tosa.mul"(%133, %138) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %140 = "tosa.mul"(%139, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %141 = "tosa.add"(%140, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %142 = "tosa.matmul"(%141, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %143 = "tosa.reshape"(%142) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %144 = "tosa.add"(%51, %143) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %145 = "tosa.reshape"(%144) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %146 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %147 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %148 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %149 = "tosa.reshape"(%146) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %150 = "tosa.transpose"(%149, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %151 = "tosa.reshape"(%147) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %152 = "tosa.transpose"(%151, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %153 = "tosa.reshape"(%148) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %154 = "tosa.transpose"(%153, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %155 = "tosa.transpose"(%152, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %156 = "tosa.reshape"(%150) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %157 = "tosa.reshape"(%155) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %158 = "tosa.matmul"(%156, %157) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %159 = "tosa.reshape"(%158) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %160 = "tosa.mul"(%159, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %161 = torch_c.from_builtin_tensor %160 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %162 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %163 = torch.aten.to.dtype %162, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %164 = torch.valsem.aten.copy %163, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %165 = torch.aten.where.self %164, %161, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %166 = torch_c.to_builtin_tensor %165 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %167 = "tosa.reduce_max"(%166) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %168 = "tosa.sub"(%166, %167) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %169 = "tosa.exp"(%168) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %170 = "tosa.reduce_sum"(%169) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %171 = "tosa.reciprocal"(%170) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %172 = "tosa.mul"(%169, %171) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %173 = "tosa.reshape"(%172) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %174 = "tosa.reshape"(%154) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %175 = "tosa.matmul"(%173, %174) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %176 = "tosa.reshape"(%175) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %177 = "tosa.transpose"(%176, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %178 = "tosa.reshape"(%177) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %179 = "tosa.matmul"(%178, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %180 = "tosa.reshape"(%179) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %181 = "tosa.add"(%96, %180) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %182 = "tosa.reshape"(%181) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %183 = "tosa.add"(%182, %130) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %184 = "tosa.reduce_sum"(%183) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %185 = "tosa.mul"(%184, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %186 = "tosa.sub"(%183, %185) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %187 = "tosa.mul"(%186, %186) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %188 = "tosa.reduce_sum"(%187) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %189 = "tosa.mul"(%188, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %190 = "tosa.add"(%189, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %191 = "tosa.rsqrt"(%190) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %192 = "tosa.mul"(%186, %191) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %193 = "tosa.mul"(%192, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %194 = "tosa.add"(%193, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %195 = "tosa.matmul"(%194, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %196 = "tosa.reshape"(%195) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %197 = "tosa.add"(%114, %196) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %198 = "tosa.reshape"(%197) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %199 = "tosa.mul"(%198, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %200 = "tosa.pow"(%198, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %201 = "tosa.mul"(%200, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %202 = "tosa.add"(%198, %201) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %203 = "tosa.mul"(%202, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %204 = "tosa.tanh"(%203) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %205 = "tosa.add"(%204, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %206 = "tosa.mul"(%199, %205) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %207 = "tosa.matmul"(%206, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %208 = "tosa.reshape"(%207) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %209 = "tosa.add"(%96, %208) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %210 = "tosa.reshape"(%209) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %211 = "tosa.add"(%183, %210) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %212 = "tosa.reduce_sum"(%211) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %213 = "tosa.mul"(%212, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %214 = "tosa.sub"(%211, %213) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %215 = "tosa.mul"(%214, %214) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %216 = "tosa.reduce_sum"(%215) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %217 = "tosa.mul"(%216, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %218 = "tosa.add"(%217, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %219 = "tosa.rsqrt"(%218) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %220 = "tosa.mul"(%214, %219) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %221 = "tosa.mul"(%220, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %222 = "tosa.add"(%221, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %223 = "tosa.matmul"(%222, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %224 = "tosa.reshape"(%223) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %225 = "tosa.add"(%51, %224) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %226 = "tosa.reshape"(%225) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %227 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %228 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %229 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %230 = "tosa.reshape"(%227) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %231 = "tosa.transpose"(%230, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %232 = "tosa.reshape"(%228) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %233 = "tosa.transpose"(%232, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %234 = "tosa.reshape"(%229) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %235 = "tosa.transpose"(%234, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %236 = "tosa.transpose"(%233, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %237 = "tosa.reshape"(%231) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %238 = "tosa.reshape"(%236) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %239 = "tosa.matmul"(%237, %238) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %240 = "tosa.reshape"(%239) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %241 = "tosa.mul"(%240, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %242 = torch_c.from_builtin_tensor %241 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %243 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %244 = torch.aten.to.dtype %243, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %245 = torch.valsem.aten.copy %244, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %246 = torch.aten.where.self %245, %242, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %247 = torch_c.to_builtin_tensor %246 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %248 = "tosa.reduce_max"(%247) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %249 = "tosa.sub"(%247, %248) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %250 = "tosa.exp"(%249) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %251 = "tosa.reduce_sum"(%250) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %252 = "tosa.reciprocal"(%251) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %253 = "tosa.mul"(%250, %252) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %254 = "tosa.reshape"(%253) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %255 = "tosa.reshape"(%235) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %256 = "tosa.matmul"(%254, %255) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %257 = "tosa.reshape"(%256) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %258 = "tosa.transpose"(%257, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %259 = "tosa.reshape"(%258) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %260 = "tosa.matmul"(%259, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %261 = "tosa.reshape"(%260) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %262 = "tosa.add"(%96, %261) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %263 = "tosa.reshape"(%262) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %264 = "tosa.add"(%263, %211) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %265 = "tosa.reduce_sum"(%264) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %266 = "tosa.mul"(%265, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %267 = "tosa.sub"(%264, %266) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %268 = "tosa.mul"(%267, %267) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %269 = "tosa.reduce_sum"(%268) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %270 = "tosa.mul"(%269, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %271 = "tosa.add"(%270, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %272 = "tosa.rsqrt"(%271) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %273 = "tosa.mul"(%267, %272) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %274 = "tosa.mul"(%273, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %275 = "tosa.add"(%274, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %276 = "tosa.matmul"(%275, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %277 = "tosa.reshape"(%276) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %278 = "tosa.add"(%114, %277) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %279 = "tosa.reshape"(%278) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %280 = "tosa.mul"(%279, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %281 = "tosa.pow"(%279, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %282 = "tosa.mul"(%281, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %283 = "tosa.add"(%279, %282) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %284 = "tosa.mul"(%283, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %285 = "tosa.tanh"(%284) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %286 = "tosa.add"(%285, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %287 = "tosa.mul"(%280, %286) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %288 = "tosa.matmul"(%287, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %289 = "tosa.reshape"(%288) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %290 = "tosa.add"(%96, %289) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %291 = "tosa.reshape"(%290) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %292 = "tosa.add"(%264, %291) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %293 = "tosa.reduce_sum"(%292) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %294 = "tosa.mul"(%293, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %295 = "tosa.sub"(%292, %294) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %296 = "tosa.mul"(%295, %295) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %297 = "tosa.reduce_sum"(%296) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %298 = "tosa.mul"(%297, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %299 = "tosa.add"(%298, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %300 = "tosa.rsqrt"(%299) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %301 = "tosa.mul"(%295, %300) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %302 = "tosa.mul"(%301, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %303 = "tosa.add"(%302, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %304 = "tosa.matmul"(%303, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %305 = "tosa.reshape"(%304) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %306 = "tosa.add"(%51, %305) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %307 = "tosa.reshape"(%306) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %308 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %309 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %310 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %311 = "tosa.reshape"(%308) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %312 = "tosa.transpose"(%311, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %313 = "tosa.reshape"(%309) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %314 = "tosa.transpose"(%313, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %315 = "tosa.reshape"(%310) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %316 = "tosa.transpose"(%315, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %317 = "tosa.transpose"(%314, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %318 = "tosa.reshape"(%312) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %319 = "tosa.reshape"(%317) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %320 = "tosa.matmul"(%318, %319) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %321 = "tosa.reshape"(%320) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %322 = "tosa.mul"(%321, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %323 = torch_c.from_builtin_tensor %322 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %324 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %325 = torch.aten.to.dtype %324, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %326 = torch.valsem.aten.copy %325, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %327 = torch.aten.where.self %326, %323, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %328 = torch_c.to_builtin_tensor %327 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %329 = "tosa.reduce_max"(%328) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %330 = "tosa.sub"(%328, %329) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %331 = "tosa.exp"(%330) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %332 = "tosa.reduce_sum"(%331) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %333 = "tosa.reciprocal"(%332) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %334 = "tosa.mul"(%331, %333) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %335 = "tosa.reshape"(%334) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %336 = "tosa.reshape"(%316) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %337 = "tosa.matmul"(%335, %336) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %338 = "tosa.reshape"(%337) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %339 = "tosa.transpose"(%338, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %340 = "tosa.reshape"(%339) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %341 = "tosa.matmul"(%340, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %342 = "tosa.reshape"(%341) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %343 = "tosa.add"(%96, %342) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %344 = "tosa.reshape"(%343) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %345 = "tosa.add"(%344, %292) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %346 = "tosa.reduce_sum"(%345) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %347 = "tosa.mul"(%346, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %348 = "tosa.sub"(%345, %347) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %349 = "tosa.mul"(%348, %348) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %350 = "tosa.reduce_sum"(%349) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %351 = "tosa.mul"(%350, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %352 = "tosa.add"(%351, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %353 = "tosa.rsqrt"(%352) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %354 = "tosa.mul"(%348, %353) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %355 = "tosa.mul"(%354, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %356 = "tosa.add"(%355, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %357 = "tosa.matmul"(%356, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %358 = "tosa.reshape"(%357) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %359 = "tosa.add"(%114, %358) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %360 = "tosa.reshape"(%359) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %361 = "tosa.mul"(%360, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %362 = "tosa.pow"(%360, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %363 = "tosa.mul"(%362, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %364 = "tosa.add"(%360, %363) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %365 = "tosa.mul"(%364, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %366 = "tosa.tanh"(%365) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %367 = "tosa.add"(%366, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %368 = "tosa.mul"(%361, %367) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %369 = "tosa.matmul"(%368, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %370 = "tosa.reshape"(%369) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %371 = "tosa.add"(%96, %370) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %372 = "tosa.reshape"(%371) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %373 = "tosa.add"(%345, %372) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %374 = "tosa.reduce_sum"(%373) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %375 = "tosa.mul"(%374, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %376 = "tosa.sub"(%373, %375) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %377 = "tosa.mul"(%376, %376) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %378 = "tosa.reduce_sum"(%377) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %379 = "tosa.mul"(%378, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %380 = "tosa.add"(%379, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %381 = "tosa.rsqrt"(%380) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %382 = "tosa.mul"(%376, %381) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %383 = "tosa.mul"(%382, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %384 = "tosa.add"(%383, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %385 = "tosa.matmul"(%384, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %386 = "tosa.reshape"(%385) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %387 = "tosa.add"(%51, %386) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %388 = "tosa.reshape"(%387) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %389 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %390 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %391 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %392 = "tosa.reshape"(%389) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %393 = "tosa.transpose"(%392, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %394 = "tosa.reshape"(%390) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %395 = "tosa.transpose"(%394, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %396 = "tosa.reshape"(%391) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %397 = "tosa.transpose"(%396, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %398 = "tosa.transpose"(%395, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %399 = "tosa.reshape"(%393) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %400 = "tosa.reshape"(%398) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %401 = "tosa.matmul"(%399, %400) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %402 = "tosa.reshape"(%401) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %403 = "tosa.mul"(%402, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %404 = torch_c.from_builtin_tensor %403 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %405 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %406 = torch.aten.to.dtype %405, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %407 = torch.valsem.aten.copy %406, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %408 = torch.aten.where.self %407, %404, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %409 = torch_c.to_builtin_tensor %408 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %410 = "tosa.reduce_max"(%409) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %411 = "tosa.sub"(%409, %410) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %412 = "tosa.exp"(%411) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %413 = "tosa.reduce_sum"(%412) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %414 = "tosa.reciprocal"(%413) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %415 = "tosa.mul"(%412, %414) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %416 = "tosa.reshape"(%415) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %417 = "tosa.reshape"(%397) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %418 = "tosa.matmul"(%416, %417) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %419 = "tosa.reshape"(%418) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %420 = "tosa.transpose"(%419, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %421 = "tosa.reshape"(%420) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %422 = "tosa.matmul"(%421, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %423 = "tosa.reshape"(%422) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %424 = "tosa.add"(%96, %423) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %425 = "tosa.reshape"(%424) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %426 = "tosa.add"(%425, %373) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %427 = "tosa.reduce_sum"(%426) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %428 = "tosa.mul"(%427, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %429 = "tosa.sub"(%426, %428) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %430 = "tosa.mul"(%429, %429) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %431 = "tosa.reduce_sum"(%430) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %432 = "tosa.mul"(%431, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %433 = "tosa.add"(%432, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %434 = "tosa.rsqrt"(%433) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %435 = "tosa.mul"(%429, %434) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %436 = "tosa.mul"(%435, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %437 = "tosa.add"(%436, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %438 = "tosa.matmul"(%437, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %439 = "tosa.reshape"(%438) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %440 = "tosa.add"(%114, %439) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %441 = "tosa.reshape"(%440) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %442 = "tosa.mul"(%441, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %443 = "tosa.pow"(%441, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %444 = "tosa.mul"(%443, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %445 = "tosa.add"(%441, %444) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %446 = "tosa.mul"(%445, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %447 = "tosa.tanh"(%446) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %448 = "tosa.add"(%447, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %449 = "tosa.mul"(%442, %448) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %450 = "tosa.matmul"(%449, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %451 = "tosa.reshape"(%450) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %452 = "tosa.add"(%96, %451) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %453 = "tosa.reshape"(%452) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %454 = "tosa.add"(%426, %453) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %455 = "tosa.reduce_sum"(%454) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %456 = "tosa.mul"(%455, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %457 = "tosa.sub"(%454, %456) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %458 = "tosa.mul"(%457, %457) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %459 = "tosa.reduce_sum"(%458) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %460 = "tosa.mul"(%459, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %461 = "tosa.add"(%460, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %462 = "tosa.rsqrt"(%461) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %463 = "tosa.mul"(%457, %462) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %464 = "tosa.mul"(%463, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %465 = "tosa.add"(%464, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %466 = "tosa.matmul"(%465, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %467 = "tosa.reshape"(%466) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %468 = "tosa.add"(%51, %467) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %469 = "tosa.reshape"(%468) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %470 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %471 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %472 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %473 = "tosa.reshape"(%470) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %474 = "tosa.transpose"(%473, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %475 = "tosa.reshape"(%471) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %476 = "tosa.transpose"(%475, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %477 = "tosa.reshape"(%472) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %478 = "tosa.transpose"(%477, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %479 = "tosa.transpose"(%476, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %480 = "tosa.reshape"(%474) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %481 = "tosa.reshape"(%479) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %482 = "tosa.matmul"(%480, %481) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %483 = "tosa.reshape"(%482) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %484 = "tosa.mul"(%483, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %485 = torch_c.from_builtin_tensor %484 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %486 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %487 = torch.aten.to.dtype %486, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %488 = torch.valsem.aten.copy %487, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %489 = torch.aten.where.self %488, %485, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %490 = torch_c.to_builtin_tensor %489 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %491 = "tosa.reduce_max"(%490) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %492 = "tosa.sub"(%490, %491) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %493 = "tosa.exp"(%492) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %494 = "tosa.reduce_sum"(%493) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %495 = "tosa.reciprocal"(%494) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %496 = "tosa.mul"(%493, %495) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %497 = "tosa.reshape"(%496) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %498 = "tosa.reshape"(%478) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %499 = "tosa.matmul"(%497, %498) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %500 = "tosa.reshape"(%499) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %501 = "tosa.transpose"(%500, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %502 = "tosa.reshape"(%501) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %503 = "tosa.matmul"(%502, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %504 = "tosa.reshape"(%503) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %505 = "tosa.add"(%96, %504) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %506 = "tosa.reshape"(%505) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %507 = "tosa.add"(%506, %454) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %508 = "tosa.reduce_sum"(%507) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %509 = "tosa.mul"(%508, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %510 = "tosa.sub"(%507, %509) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %511 = "tosa.mul"(%510, %510) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %512 = "tosa.reduce_sum"(%511) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %513 = "tosa.mul"(%512, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %514 = "tosa.add"(%513, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %515 = "tosa.rsqrt"(%514) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %516 = "tosa.mul"(%510, %515) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %517 = "tosa.mul"(%516, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %518 = "tosa.add"(%517, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %519 = "tosa.matmul"(%518, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %520 = "tosa.reshape"(%519) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %521 = "tosa.add"(%114, %520) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %522 = "tosa.reshape"(%521) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %523 = "tosa.mul"(%522, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %524 = "tosa.pow"(%522, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %525 = "tosa.mul"(%524, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %526 = "tosa.add"(%522, %525) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %527 = "tosa.mul"(%526, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %528 = "tosa.tanh"(%527) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %529 = "tosa.add"(%528, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %530 = "tosa.mul"(%523, %529) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %531 = "tosa.matmul"(%530, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %532 = "tosa.reshape"(%531) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %533 = "tosa.add"(%96, %532) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %534 = "tosa.reshape"(%533) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %535 = "tosa.add"(%507, %534) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %536 = "tosa.reduce_sum"(%535) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %537 = "tosa.mul"(%536, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %538 = "tosa.sub"(%535, %537) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %539 = "tosa.mul"(%538, %538) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %540 = "tosa.reduce_sum"(%539) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %541 = "tosa.mul"(%540, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %542 = "tosa.add"(%541, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %543 = "tosa.rsqrt"(%542) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %544 = "tosa.mul"(%538, %543) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.mul"(%544, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %546 = "tosa.add"(%545, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %547 = "tosa.matmul"(%546, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %548 = "tosa.reshape"(%547) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %549 = "tosa.add"(%51, %548) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %550 = "tosa.reshape"(%549) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %551 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %552 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %553 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %554 = "tosa.reshape"(%551) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %555 = "tosa.transpose"(%554, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %556 = "tosa.reshape"(%552) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %557 = "tosa.transpose"(%556, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %558 = "tosa.reshape"(%553) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %559 = "tosa.transpose"(%558, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %560 = "tosa.transpose"(%557, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %561 = "tosa.reshape"(%555) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %562 = "tosa.reshape"(%560) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %563 = "tosa.matmul"(%561, %562) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %564 = "tosa.reshape"(%563) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %565 = "tosa.mul"(%564, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %566 = torch_c.from_builtin_tensor %565 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %567 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %568 = torch.aten.to.dtype %567, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %569 = torch.valsem.aten.copy %568, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %570 = torch.aten.where.self %569, %566, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %571 = torch_c.to_builtin_tensor %570 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %572 = "tosa.reduce_max"(%571) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %573 = "tosa.sub"(%571, %572) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %574 = "tosa.exp"(%573) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %575 = "tosa.reduce_sum"(%574) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %576 = "tosa.reciprocal"(%575) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %577 = "tosa.mul"(%574, %576) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %578 = "tosa.reshape"(%577) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %579 = "tosa.reshape"(%559) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %580 = "tosa.matmul"(%578, %579) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %581 = "tosa.reshape"(%580) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %582 = "tosa.transpose"(%581, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %583 = "tosa.reshape"(%582) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %584 = "tosa.matmul"(%583, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %585 = "tosa.reshape"(%584) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %586 = "tosa.add"(%96, %585) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %587 = "tosa.reshape"(%586) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %588 = "tosa.add"(%587, %535) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %589 = "tosa.reduce_sum"(%588) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %590 = "tosa.mul"(%589, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %591 = "tosa.sub"(%588, %590) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %592 = "tosa.mul"(%591, %591) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %593 = "tosa.reduce_sum"(%592) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %594 = "tosa.mul"(%593, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %595 = "tosa.add"(%594, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %596 = "tosa.rsqrt"(%595) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %597 = "tosa.mul"(%591, %596) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %598 = "tosa.mul"(%597, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %599 = "tosa.add"(%598, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %600 = "tosa.matmul"(%599, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %601 = "tosa.reshape"(%600) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %602 = "tosa.add"(%114, %601) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %603 = "tosa.reshape"(%602) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %604 = "tosa.mul"(%603, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %605 = "tosa.pow"(%603, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %606 = "tosa.mul"(%605, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %607 = "tosa.add"(%603, %606) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %608 = "tosa.mul"(%607, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %609 = "tosa.tanh"(%608) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %610 = "tosa.add"(%609, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %611 = "tosa.mul"(%604, %610) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %612 = "tosa.matmul"(%611, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %613 = "tosa.reshape"(%612) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %614 = "tosa.add"(%96, %613) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %615 = "tosa.reshape"(%614) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %616 = "tosa.add"(%588, %615) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %617 = "tosa.reduce_sum"(%616) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %618 = "tosa.mul"(%617, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %619 = "tosa.sub"(%616, %618) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %620 = "tosa.mul"(%619, %619) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %621 = "tosa.reduce_sum"(%620) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %622 = "tosa.mul"(%621, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %623 = "tosa.add"(%622, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %624 = "tosa.rsqrt"(%623) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %625 = "tosa.mul"(%619, %624) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %626 = "tosa.mul"(%625, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %627 = "tosa.add"(%626, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %628 = "tosa.matmul"(%627, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %629 = "tosa.reshape"(%628) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %630 = "tosa.add"(%51, %629) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %631 = "tosa.reshape"(%630) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %632 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %633 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %634 = "tosa.slice"(%631) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %635 = "tosa.reshape"(%632) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %636 = "tosa.transpose"(%635, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %637 = "tosa.reshape"(%633) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %638 = "tosa.transpose"(%637, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %639 = "tosa.reshape"(%634) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %640 = "tosa.transpose"(%639, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %641 = "tosa.transpose"(%638, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %642 = "tosa.reshape"(%636) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %643 = "tosa.reshape"(%641) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %644 = "tosa.matmul"(%642, %643) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %645 = "tosa.reshape"(%644) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %646 = "tosa.mul"(%645, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %647 = torch_c.from_builtin_tensor %646 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %648 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %649 = torch.aten.to.dtype %648, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %650 = torch.valsem.aten.copy %649, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %651 = torch.aten.where.self %650, %647, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %652 = torch_c.to_builtin_tensor %651 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %653 = "tosa.reduce_max"(%652) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %654 = "tosa.sub"(%652, %653) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %655 = "tosa.exp"(%654) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %656 = "tosa.reduce_sum"(%655) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %657 = "tosa.reciprocal"(%656) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %658 = "tosa.mul"(%655, %657) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %659 = "tosa.reshape"(%658) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %660 = "tosa.reshape"(%640) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %661 = "tosa.matmul"(%659, %660) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %662 = "tosa.reshape"(%661) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %663 = "tosa.transpose"(%662, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %664 = "tosa.reshape"(%663) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %665 = "tosa.matmul"(%664, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %666 = "tosa.reshape"(%665) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %667 = "tosa.add"(%96, %666) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %668 = "tosa.reshape"(%667) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %669 = "tosa.add"(%668, %616) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %670 = "tosa.reduce_sum"(%669) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %671 = "tosa.mul"(%670, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %672 = "tosa.sub"(%669, %671) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %673 = "tosa.mul"(%672, %672) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %674 = "tosa.reduce_sum"(%673) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %675 = "tosa.mul"(%674, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %676 = "tosa.add"(%675, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %677 = "tosa.rsqrt"(%676) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %678 = "tosa.mul"(%672, %677) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %679 = "tosa.mul"(%678, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %680 = "tosa.add"(%679, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %681 = "tosa.matmul"(%680, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %682 = "tosa.reshape"(%681) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %683 = "tosa.add"(%114, %682) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %684 = "tosa.reshape"(%683) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %685 = "tosa.mul"(%684, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %686 = "tosa.pow"(%684, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %687 = "tosa.mul"(%686, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %688 = "tosa.add"(%684, %687) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %689 = "tosa.mul"(%688, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %690 = "tosa.tanh"(%689) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %691 = "tosa.add"(%690, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %692 = "tosa.mul"(%685, %691) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %693 = "tosa.matmul"(%692, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %694 = "tosa.reshape"(%693) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %695 = "tosa.add"(%96, %694) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %696 = "tosa.reshape"(%695) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %697 = "tosa.add"(%669, %696) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %698 = "tosa.reduce_sum"(%697) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %699 = "tosa.mul"(%698, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %700 = "tosa.sub"(%697, %699) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %701 = "tosa.mul"(%700, %700) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %702 = "tosa.reduce_sum"(%701) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %703 = "tosa.mul"(%702, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %704 = "tosa.add"(%703, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %705 = "tosa.rsqrt"(%704) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %706 = "tosa.mul"(%700, %705) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %707 = "tosa.mul"(%706, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %708 = "tosa.add"(%707, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %709 = "tosa.matmul"(%708, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %710 = "tosa.reshape"(%709) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %711 = "tosa.add"(%51, %710) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %712 = "tosa.reshape"(%711) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %713 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %714 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %715 = "tosa.slice"(%712) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %716 = "tosa.reshape"(%713) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %717 = "tosa.transpose"(%716, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %718 = "tosa.reshape"(%714) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %719 = "tosa.transpose"(%718, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %720 = "tosa.reshape"(%715) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %721 = "tosa.transpose"(%720, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %722 = "tosa.transpose"(%719, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %723 = "tosa.reshape"(%717) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %724 = "tosa.reshape"(%722) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %725 = "tosa.matmul"(%723, %724) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %726 = "tosa.reshape"(%725) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %727 = "tosa.mul"(%726, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %728 = torch_c.from_builtin_tensor %727 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %729 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %730 = torch.aten.to.dtype %729, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %731 = torch.valsem.aten.copy %730, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %732 = torch.aten.where.self %731, %728, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %733 = torch_c.to_builtin_tensor %732 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %734 = "tosa.reduce_max"(%733) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %735 = "tosa.sub"(%733, %734) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %736 = "tosa.exp"(%735) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %737 = "tosa.reduce_sum"(%736) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %738 = "tosa.reciprocal"(%737) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %739 = "tosa.mul"(%736, %738) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %740 = "tosa.reshape"(%739) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %741 = "tosa.reshape"(%721) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %742 = "tosa.matmul"(%740, %741) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %743 = "tosa.reshape"(%742) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %744 = "tosa.transpose"(%743, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %745 = "tosa.reshape"(%744) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %746 = "tosa.matmul"(%745, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %747 = "tosa.reshape"(%746) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %748 = "tosa.add"(%96, %747) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %749 = "tosa.reshape"(%748) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %750 = "tosa.add"(%749, %697) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %751 = "tosa.reduce_sum"(%750) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %752 = "tosa.mul"(%751, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %753 = "tosa.sub"(%750, %752) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %754 = "tosa.mul"(%753, %753) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %755 = "tosa.reduce_sum"(%754) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %756 = "tosa.mul"(%755, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %757 = "tosa.add"(%756, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %758 = "tosa.rsqrt"(%757) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %759 = "tosa.mul"(%753, %758) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %760 = "tosa.mul"(%759, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %761 = "tosa.add"(%760, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %762 = "tosa.matmul"(%761, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %763 = "tosa.reshape"(%762) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %764 = "tosa.add"(%114, %763) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %765 = "tosa.reshape"(%764) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %766 = "tosa.mul"(%765, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %767 = "tosa.pow"(%765, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %768 = "tosa.mul"(%767, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %769 = "tosa.add"(%765, %768) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %770 = "tosa.mul"(%769, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %771 = "tosa.tanh"(%770) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %772 = "tosa.add"(%771, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %773 = "tosa.mul"(%766, %772) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %774 = "tosa.matmul"(%773, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %775 = "tosa.reshape"(%774) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %776 = "tosa.add"(%96, %775) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %777 = "tosa.reshape"(%776) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %778 = "tosa.add"(%750, %777) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %779 = "tosa.reduce_sum"(%778) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %780 = "tosa.mul"(%779, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %781 = "tosa.sub"(%778, %780) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %782 = "tosa.mul"(%781, %781) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %783 = "tosa.reduce_sum"(%782) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %784 = "tosa.mul"(%783, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %785 = "tosa.add"(%784, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %786 = "tosa.rsqrt"(%785) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %787 = "tosa.mul"(%781, %786) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %788 = "tosa.mul"(%787, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %789 = "tosa.add"(%788, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %790 = "tosa.matmul"(%789, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %791 = "tosa.reshape"(%790) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %792 = "tosa.add"(%51, %791) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %793 = "tosa.reshape"(%792) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %794 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %795 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %796 = "tosa.slice"(%793) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %797 = "tosa.reshape"(%794) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %798 = "tosa.transpose"(%797, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %799 = "tosa.reshape"(%795) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %800 = "tosa.transpose"(%799, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %801 = "tosa.reshape"(%796) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %802 = "tosa.transpose"(%801, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %803 = "tosa.transpose"(%800, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %804 = "tosa.reshape"(%798) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %805 = "tosa.reshape"(%803) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %806 = "tosa.matmul"(%804, %805) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %807 = "tosa.reshape"(%806) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %808 = "tosa.mul"(%807, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %809 = torch_c.from_builtin_tensor %808 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %810 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %811 = torch.aten.to.dtype %810, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %812 = torch.valsem.aten.copy %811, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %813 = torch.aten.where.self %812, %809, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %814 = torch_c.to_builtin_tensor %813 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %815 = "tosa.reduce_max"(%814) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %816 = "tosa.sub"(%814, %815) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %817 = "tosa.exp"(%816) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %818 = "tosa.reduce_sum"(%817) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %819 = "tosa.reciprocal"(%818) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %820 = "tosa.mul"(%817, %819) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %821 = "tosa.reshape"(%820) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %822 = "tosa.reshape"(%802) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %823 = "tosa.matmul"(%821, %822) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %824 = "tosa.reshape"(%823) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %825 = "tosa.transpose"(%824, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %826 = "tosa.reshape"(%825) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %827 = "tosa.matmul"(%826, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %828 = "tosa.reshape"(%827) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %829 = "tosa.add"(%96, %828) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %830 = "tosa.reshape"(%829) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %831 = "tosa.add"(%830, %778) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %832 = "tosa.reduce_sum"(%831) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %833 = "tosa.mul"(%832, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %834 = "tosa.sub"(%831, %833) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %835 = "tosa.mul"(%834, %834) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %836 = "tosa.reduce_sum"(%835) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %837 = "tosa.mul"(%836, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %838 = "tosa.add"(%837, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %839 = "tosa.rsqrt"(%838) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %840 = "tosa.mul"(%834, %839) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %841 = "tosa.mul"(%840, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %842 = "tosa.add"(%841, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %843 = "tosa.matmul"(%842, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %844 = "tosa.reshape"(%843) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %845 = "tosa.add"(%114, %844) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %846 = "tosa.reshape"(%845) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %847 = "tosa.mul"(%846, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %848 = "tosa.pow"(%846, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %849 = "tosa.mul"(%848, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %850 = "tosa.add"(%846, %849) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %851 = "tosa.mul"(%850, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %852 = "tosa.tanh"(%851) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %853 = "tosa.add"(%852, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %854 = "tosa.mul"(%847, %853) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %855 = "tosa.matmul"(%854, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %856 = "tosa.reshape"(%855) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %857 = "tosa.add"(%96, %856) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %858 = "tosa.reshape"(%857) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %859 = "tosa.add"(%831, %858) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %860 = "tosa.reduce_sum"(%859) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %861 = "tosa.mul"(%860, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %862 = "tosa.sub"(%859, %861) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %863 = "tosa.mul"(%862, %862) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %864 = "tosa.reduce_sum"(%863) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %865 = "tosa.mul"(%864, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %866 = "tosa.add"(%865, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %867 = "tosa.rsqrt"(%866) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %868 = "tosa.mul"(%862, %867) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %869 = "tosa.mul"(%868, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %870 = "tosa.add"(%869, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %871 = "tosa.matmul"(%870, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %872 = "tosa.reshape"(%871) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %873 = "tosa.add"(%51, %872) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %874 = "tosa.reshape"(%873) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %875 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %876 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %877 = "tosa.slice"(%874) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %878 = "tosa.reshape"(%875) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %879 = "tosa.transpose"(%878, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %880 = "tosa.reshape"(%876) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %881 = "tosa.transpose"(%880, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %882 = "tosa.reshape"(%877) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %883 = "tosa.transpose"(%882, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %884 = "tosa.transpose"(%881, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %885 = "tosa.reshape"(%879) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %886 = "tosa.reshape"(%884) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %887 = "tosa.matmul"(%885, %886) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %888 = "tosa.reshape"(%887) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %889 = "tosa.mul"(%888, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %890 = torch_c.from_builtin_tensor %889 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %891 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %892 = torch.aten.to.dtype %891, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %893 = torch.valsem.aten.copy %892, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %894 = torch.aten.where.self %893, %890, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %895 = torch_c.to_builtin_tensor %894 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %896 = "tosa.reduce_max"(%895) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %897 = "tosa.sub"(%895, %896) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %898 = "tosa.exp"(%897) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %899 = "tosa.reduce_sum"(%898) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %900 = "tosa.reciprocal"(%899) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %901 = "tosa.mul"(%898, %900) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %902 = "tosa.reshape"(%901) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %903 = "tosa.reshape"(%883) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %904 = "tosa.matmul"(%902, %903) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %905 = "tosa.reshape"(%904) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %906 = "tosa.transpose"(%905, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %907 = "tosa.reshape"(%906) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %908 = "tosa.matmul"(%907, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %909 = "tosa.reshape"(%908) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %910 = "tosa.add"(%96, %909) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %911 = "tosa.reshape"(%910) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %912 = "tosa.add"(%911, %859) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %913 = "tosa.reduce_sum"(%912) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %914 = "tosa.mul"(%913, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %915 = "tosa.sub"(%912, %914) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %916 = "tosa.mul"(%915, %915) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %917 = "tosa.reduce_sum"(%916) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %918 = "tosa.mul"(%917, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %919 = "tosa.add"(%918, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %920 = "tosa.rsqrt"(%919) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %921 = "tosa.mul"(%915, %920) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %922 = "tosa.mul"(%921, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %923 = "tosa.add"(%922, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %924 = "tosa.matmul"(%923, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %925 = "tosa.reshape"(%924) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %926 = "tosa.add"(%114, %925) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %927 = "tosa.reshape"(%926) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %928 = "tosa.mul"(%927, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %929 = "tosa.pow"(%927, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %930 = "tosa.mul"(%929, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %931 = "tosa.add"(%927, %930) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %932 = "tosa.mul"(%931, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %933 = "tosa.tanh"(%932) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %934 = "tosa.add"(%933, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %935 = "tosa.mul"(%928, %934) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %936 = "tosa.matmul"(%935, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %937 = "tosa.reshape"(%936) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %938 = "tosa.add"(%96, %937) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %939 = "tosa.reshape"(%938) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %940 = "tosa.add"(%912, %939) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %941 = "tosa.reduce_sum"(%940) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %942 = "tosa.mul"(%941, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %943 = "tosa.sub"(%940, %942) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %944 = "tosa.mul"(%943, %943) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %945 = "tosa.reduce_sum"(%944) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %946 = "tosa.mul"(%945, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %947 = "tosa.add"(%946, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %948 = "tosa.rsqrt"(%947) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %949 = "tosa.mul"(%943, %948) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %950 = "tosa.mul"(%949, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %951 = "tosa.add"(%950, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %952 = "tosa.matmul"(%951, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %953 = "tosa.reshape"(%952) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %954 = "tosa.add"(%51, %953) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %955 = "tosa.reshape"(%954) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %956 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %957 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %958 = "tosa.slice"(%955) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %959 = "tosa.reshape"(%956) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %960 = "tosa.transpose"(%959, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %961 = "tosa.reshape"(%957) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %962 = "tosa.transpose"(%961, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %963 = "tosa.reshape"(%958) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %964 = "tosa.transpose"(%963, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %965 = "tosa.transpose"(%962, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %966 = "tosa.reshape"(%960) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %967 = "tosa.reshape"(%965) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %968 = "tosa.matmul"(%966, %967) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %969 = "tosa.reshape"(%968) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %970 = "tosa.mul"(%969, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %971 = torch_c.from_builtin_tensor %970 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %972 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %973 = torch.aten.to.dtype %972, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %974 = torch.valsem.aten.copy %973, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %975 = torch.aten.where.self %974, %971, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %976 = torch_c.to_builtin_tensor %975 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %977 = "tosa.reduce_max"(%976) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %978 = "tosa.sub"(%976, %977) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %979 = "tosa.exp"(%978) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %980 = "tosa.reduce_sum"(%979) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %981 = "tosa.reciprocal"(%980) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %982 = "tosa.mul"(%979, %981) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %983 = "tosa.reshape"(%982) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %984 = "tosa.reshape"(%964) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %985 = "tosa.matmul"(%983, %984) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %986 = "tosa.reshape"(%985) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %987 = "tosa.transpose"(%986, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %988 = "tosa.reshape"(%987) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %989 = "tosa.matmul"(%988, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %990 = "tosa.reshape"(%989) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %991 = "tosa.add"(%96, %990) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %992 = "tosa.reshape"(%991) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %993 = "tosa.add"(%992, %940) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %994 = "tosa.reduce_sum"(%993) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %995 = "tosa.mul"(%994, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %996 = "tosa.sub"(%993, %995) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %997 = "tosa.mul"(%996, %996) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %998 = "tosa.reduce_sum"(%997) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %999 = "tosa.mul"(%998, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1000 = "tosa.add"(%999, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1001 = "tosa.rsqrt"(%1000) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1002 = "tosa.mul"(%996, %1001) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1003 = "tosa.mul"(%1002, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1004 = "tosa.add"(%1003, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1005 = "tosa.matmul"(%1004, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %1006 = "tosa.reshape"(%1005) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %1007 = "tosa.add"(%114, %1006) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %1008 = "tosa.reshape"(%1007) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %1009 = "tosa.mul"(%1008, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1010 = "tosa.pow"(%1008, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1011 = "tosa.mul"(%1010, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1012 = "tosa.add"(%1008, %1011) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1013 = "tosa.mul"(%1012, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1014 = "tosa.tanh"(%1013) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1015 = "tosa.add"(%1014, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %1016 = "tosa.mul"(%1009, %1015) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %1017 = "tosa.matmul"(%1016, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %1018 = "tosa.reshape"(%1017) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %1019 = "tosa.add"(%96, %1018) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %1020 = "tosa.reshape"(%1019) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %1021 = "tosa.add"(%993, %1020) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1022 = "tosa.reduce_sum"(%1021) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1023 = "tosa.mul"(%1022, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1024 = "tosa.sub"(%1021, %1023) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1025 = "tosa.mul"(%1024, %1024) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %1026 = "tosa.reduce_sum"(%1025) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %1027 = "tosa.mul"(%1026, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1028 = "tosa.add"(%1027, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %1029 = "tosa.rsqrt"(%1028) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %1030 = "tosa.mul"(%1024, %1029) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %1031 = "tosa.mul"(%1030, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1032 = "tosa.add"(%1031, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %1033 = "tosa.transpose"(%11, %15) : (tensor<50257x768xf32>, tensor<2xi32>) -> tensor<768x50257xf32>
  %1034 = "tosa.reshape"(%1033) {new_shape = [1, 768, 50257]} : (tensor<768x50257xf32>) -> tensor<1x768x50257xf32>
  %1035 = "tosa.matmul"(%1032, %1034) : (tensor<1x5x768xf32>, tensor<1x768x50257xf32>) -> tensor<1x5x50257xf32>
  %1036 = torch_c.from_builtin_tensor %1035 : tensor<1x5x50257xf32> -> !torch.vtensor<[1,5,50257],f32>
  %1037 = torch_c.to_builtin_tensor %1036 : !torch.vtensor<[1,5,50257],f32> -> tensor<1x5x50257xf32>
  return %1037 : tensor<1x5x50257xf32>
 }

 <eval_with_key>.2:5:44: error: failed to legalize operation 'torch.constant.int'
 <eval_with_key>.2:5:44: note: see current operation: %0 = "torch.constant.int"() {value = 5 : i64} : () -> !torch.int
 // -----// IR Dump After FinalizingBackendTypeConversion Failed (torch-finalizing-backend-type-conversion) //----- //
 func.func @forward(%arg0: tensor<1x5xi64>) -> tensor<1x5x50257xf32> {
  %int5 = torch.constant.int 5
  %int1 = torch.constant.int 1
  %float0.000000e00 = torch.constant.float 0.000000e+00
  %0 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768xf32>} : () -> tensor<768xf32>
  %1 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072x768xf32>} : () -> tensor<3072x768xf32>
  %2 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x3072xf32>} : () -> tensor<768x3072xf32>
  %3 = "tosa.const"() {value = dense_resource<__elided__> : tensor<3072xf32>} : () -> tensor<3072xf32>
  %4 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x768xf32>} : () -> tensor<768x768xf32>
  %5 = "tosa.const"() {value = dense_resource<__elided__> : tensor<768x2304xf32>} : () -> tensor<768x2304xf32>
  %6 = "tosa.const"() {value = dense_resource<__elided__> : tensor<2304xf32>} : () -> tensor<2304xf32>
  %7 = "tosa.const"() {value = dense<-3.40282347E+38> : tensor<f32>} : () -> tensor<f32>
  %8 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1x1x1024x1024xui8>} : () -> tensor<1x1x1024x1024xi8>
  %9 = "tosa.const"() {value = dense<8.000000e+00> : tensor<f32>} : () -> tensor<f32>
  %10 = "tosa.const"() {value = dense_resource<__elided__> : tensor<1024x768xf32>} : () -> tensor<1024x768xf32>
  %11 = "tosa.const"() {value = dense_resource<__elided__> : tensor<50257x768xf32>} : () -> tensor<50257x768xf32>
  %false = torch.constant.bool false
  %none = torch.constant.none
  %int11 = torch.constant.int 11
  %int4 = torch.constant.int 4
  %int0 = torch.constant.int 0
  %12 = "tosa.const"() {value = dense<7.680000e+02> : tensor<1xf32>} : () -> tensor<1xf32>
  %13 = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
  %14 = "tosa.const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
  %15 = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
  %16 = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %17 = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %18 = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %19 = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %20 = "tosa.const"() {value = dense<9.99999974E-6> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %21 = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32>
  %22 = torch_c.from_builtin_tensor %arg0 : tensor<1x5xi64> -> !torch.vtensor<[1,5],si64>
  %23 = torch_c.to_builtin_tensor %22 : !torch.vtensor<[1,5],si64> -> tensor<1x5xi64>
  %cpu = torch.constant.device "cpu"
  %24 = torch.aten.arange.start_step %int0, %int5, %int1, %int4, %none, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[5],si64>
  %25 = torch_c.to_builtin_tensor %24 : !torch.vtensor<[5],si64> -> tensor<5xi64>
  %26 = "tosa.reshape"(%25) {new_shape = [1, 5]} : (tensor<5xi64>) -> tensor<1x5xi64>
  %27 = "tosa.reshape"(%11) {new_shape = [1, 50257, 768]} : (tensor<50257x768xf32>) -> tensor<1x50257x768xf32>
  %28 = "tosa.cast"(%23) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %29 = "tosa.gather"(%27, %28) : (tensor<1x50257x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %30 = "tosa.reshape"(%10) {new_shape = [1, 1024, 768]} : (tensor<1024x768xf32>) -> tensor<1x1024x768xf32>
  %31 = "tosa.cast"(%26) : (tensor<1x5xi64>) -> tensor<1x5xi32>
  %32 = "tosa.gather"(%30, %31) : (tensor<1x1024x768xf32>, tensor<1x5xi32>) -> tensor<1x5x768xf32>
  %33 = "tosa.add"(%29, %32) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %34 = "tosa.reciprocal"(%12) : (tensor<1xf32>) -> tensor<1xf32>
  %35 = "tosa.reduce_sum"(%33) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %36 = "tosa.reshape"(%34) {new_shape = [1, 1, 1]} : (tensor<1xf32>) -> tensor<1x1x1xf32>
  %37 = "tosa.mul"(%35, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %38 = "tosa.sub"(%33, %37) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %39 = "tosa.mul"(%38, %38) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %40 = "tosa.reduce_sum"(%39) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %41 = "tosa.mul"(%40, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %42 = "tosa.reshape"(%0) {new_shape = [1, 1, 768]} : (tensor<768xf32>) -> tensor<1x1x768xf32>
  %43 = "tosa.add"(%41, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %44 = "tosa.rsqrt"(%43) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %45 = "tosa.mul"(%38, %44) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %46 = "tosa.mul"(%45, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %47 = "tosa.add"(%46, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %48 = "tosa.reshape"(%5) {new_shape = [1, 768, 2304]} : (tensor<768x2304xf32>) -> tensor<1x768x2304xf32>
  %49 = "tosa.matmul"(%47, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %50 = "tosa.reshape"(%49) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %51 = "tosa.reshape"(%6) {new_shape = [1, 2304]} : (tensor<2304xf32>) -> tensor<1x2304xf32>
  %52 = "tosa.add"(%51, %50) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %53 = "tosa.reshape"(%52) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %54 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %55 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %56 = "tosa.slice"(%53) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %57 = "tosa.reshape"(%54) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %58 = "tosa.transpose"(%57, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %59 = "tosa.reshape"(%55) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %60 = "tosa.transpose"(%59, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %61 = "tosa.reshape"(%56) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %62 = "tosa.transpose"(%61, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %63 = "tosa.transpose"(%60, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %64 = "tosa.reshape"(%58) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %65 = "tosa.reshape"(%63) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %66 = "tosa.matmul"(%64, %65) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %67 = "tosa.reshape"(%66) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %68 = "tosa.reciprocal"(%9) : (tensor<f32>) -> tensor<f32>
  %69 = "tosa.reshape"(%68) {new_shape = [1, 1, 1, 1]} : (tensor<f32>) -> tensor<1x1x1x1xf32>
  %70 = "tosa.mul"(%67, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %71 = torch_c.from_builtin_tensor %70 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %72 = "tosa.slice"(%8) {size = [1, 1, 5, 1024], start = [0, 0, 0, 0]} : (tensor<1x1x1024x1024xi8>) -> tensor<1x1x5x1024xi8>
  %73 = "tosa.slice"(%72) {size = [1, 1, 5, 5], start = [0, 0, 0, 0]} : (tensor<1x1x5x1024xi8>) -> tensor<1x1x5x5xi8>
  %74 = torch_c.from_builtin_tensor %73 : tensor<1x1x5x5xi8> -> !torch.vtensor<[1,1,5,5],ui8>
  %75 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %76 = torch.aten.to.dtype %75, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %77 = torch.valsem.aten.copy %76, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %78 = torch_c.from_builtin_tensor %7 : tensor<f32> -> !torch.vtensor<[],f32>
  %79 = torch.aten.where.self %77, %71, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %80 = torch_c.to_builtin_tensor %79 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %81 = "tosa.reduce_max"(%80) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %82 = "tosa.sub"(%80, %81) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %83 = "tosa.exp"(%82) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %84 = "tosa.reduce_sum"(%83) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %85 = "tosa.reciprocal"(%84) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %86 = "tosa.mul"(%83, %85) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %87 = "tosa.reshape"(%86) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %88 = "tosa.reshape"(%62) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %89 = "tosa.matmul"(%87, %88) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %90 = "tosa.reshape"(%89) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %91 = "tosa.transpose"(%90, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %92 = "tosa.reshape"(%91) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %93 = "tosa.reshape"(%4) {new_shape = [1, 768, 768]} : (tensor<768x768xf32>) -> tensor<1x768x768xf32>
  %94 = "tosa.matmul"(%92, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %95 = "tosa.reshape"(%94) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %96 = "tosa.reshape"(%0) {new_shape = [1, 768]} : (tensor<768xf32>) -> tensor<1x768xf32>
  %97 = "tosa.add"(%96, %95) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %98 = "tosa.reshape"(%97) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %99 = "tosa.add"(%98, %33) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %100 = "tosa.reduce_sum"(%99) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %101 = "tosa.mul"(%100, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %102 = "tosa.sub"(%99, %101) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %103 = "tosa.mul"(%102, %102) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %104 = "tosa.reduce_sum"(%103) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %105 = "tosa.mul"(%104, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %106 = "tosa.add"(%105, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %107 = "tosa.rsqrt"(%106) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %108 = "tosa.mul"(%102, %107) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %109 = "tosa.mul"(%108, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %110 = "tosa.add"(%109, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %111 = "tosa.reshape"(%2) {new_shape = [1, 768, 3072]} : (tensor<768x3072xf32>) -> tensor<1x768x3072xf32>
  %112 = "tosa.matmul"(%110, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %113 = "tosa.reshape"(%112) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %114 = "tosa.reshape"(%3) {new_shape = [1, 3072]} : (tensor<3072xf32>) -> tensor<1x3072xf32>
  %115 = "tosa.add"(%114, %113) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %116 = "tosa.reshape"(%115) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %117 = "tosa.mul"(%116, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %118 = "tosa.pow"(%116, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %119 = "tosa.mul"(%118, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %120 = "tosa.add"(%116, %119) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %121 = "tosa.mul"(%120, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %122 = "tosa.tanh"(%121) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %123 = "tosa.add"(%122, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %124 = "tosa.mul"(%117, %123) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %125 = "tosa.reshape"(%1) {new_shape = [1, 3072, 768]} : (tensor<3072x768xf32>) -> tensor<1x3072x768xf32>
  %126 = "tosa.matmul"(%124, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %127 = "tosa.reshape"(%126) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %128 = "tosa.add"(%96, %127) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %129 = "tosa.reshape"(%128) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %130 = "tosa.add"(%99, %129) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %131 = "tosa.reduce_sum"(%130) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %132 = "tosa.mul"(%131, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %133 = "tosa.sub"(%130, %132) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %134 = "tosa.mul"(%133, %133) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %135 = "tosa.reduce_sum"(%134) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %136 = "tosa.mul"(%135, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %137 = "tosa.add"(%136, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %138 = "tosa.rsqrt"(%137) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %139 = "tosa.mul"(%133, %138) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %140 = "tosa.mul"(%139, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %141 = "tosa.add"(%140, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %142 = "tosa.matmul"(%141, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %143 = "tosa.reshape"(%142) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %144 = "tosa.add"(%51, %143) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %145 = "tosa.reshape"(%144) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %146 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %147 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %148 = "tosa.slice"(%145) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %149 = "tosa.reshape"(%146) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %150 = "tosa.transpose"(%149, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %151 = "tosa.reshape"(%147) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %152 = "tosa.transpose"(%151, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %153 = "tosa.reshape"(%148) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %154 = "tosa.transpose"(%153, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %155 = "tosa.transpose"(%152, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %156 = "tosa.reshape"(%150) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %157 = "tosa.reshape"(%155) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %158 = "tosa.matmul"(%156, %157) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %159 = "tosa.reshape"(%158) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %160 = "tosa.mul"(%159, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %161 = torch_c.from_builtin_tensor %160 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %162 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %163 = torch.aten.to.dtype %162, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %164 = torch.valsem.aten.copy %163, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %165 = torch.aten.where.self %164, %161, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %166 = torch_c.to_builtin_tensor %165 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %167 = "tosa.reduce_max"(%166) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %168 = "tosa.sub"(%166, %167) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %169 = "tosa.exp"(%168) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %170 = "tosa.reduce_sum"(%169) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %171 = "tosa.reciprocal"(%170) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %172 = "tosa.mul"(%169, %171) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %173 = "tosa.reshape"(%172) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %174 = "tosa.reshape"(%154) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %175 = "tosa.matmul"(%173, %174) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %176 = "tosa.reshape"(%175) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %177 = "tosa.transpose"(%176, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %178 = "tosa.reshape"(%177) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %179 = "tosa.matmul"(%178, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %180 = "tosa.reshape"(%179) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %181 = "tosa.add"(%96, %180) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %182 = "tosa.reshape"(%181) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %183 = "tosa.add"(%182, %130) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %184 = "tosa.reduce_sum"(%183) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %185 = "tosa.mul"(%184, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %186 = "tosa.sub"(%183, %185) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %187 = "tosa.mul"(%186, %186) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %188 = "tosa.reduce_sum"(%187) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %189 = "tosa.mul"(%188, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %190 = "tosa.add"(%189, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %191 = "tosa.rsqrt"(%190) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %192 = "tosa.mul"(%186, %191) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %193 = "tosa.mul"(%192, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %194 = "tosa.add"(%193, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %195 = "tosa.matmul"(%194, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %196 = "tosa.reshape"(%195) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %197 = "tosa.add"(%114, %196) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %198 = "tosa.reshape"(%197) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %199 = "tosa.mul"(%198, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %200 = "tosa.pow"(%198, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %201 = "tosa.mul"(%200, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %202 = "tosa.add"(%198, %201) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %203 = "tosa.mul"(%202, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %204 = "tosa.tanh"(%203) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %205 = "tosa.add"(%204, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %206 = "tosa.mul"(%199, %205) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %207 = "tosa.matmul"(%206, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %208 = "tosa.reshape"(%207) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %209 = "tosa.add"(%96, %208) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %210 = "tosa.reshape"(%209) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %211 = "tosa.add"(%183, %210) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %212 = "tosa.reduce_sum"(%211) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %213 = "tosa.mul"(%212, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %214 = "tosa.sub"(%211, %213) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %215 = "tosa.mul"(%214, %214) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %216 = "tosa.reduce_sum"(%215) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %217 = "tosa.mul"(%216, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %218 = "tosa.add"(%217, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %219 = "tosa.rsqrt"(%218) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %220 = "tosa.mul"(%214, %219) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %221 = "tosa.mul"(%220, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %222 = "tosa.add"(%221, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %223 = "tosa.matmul"(%222, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %224 = "tosa.reshape"(%223) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %225 = "tosa.add"(%51, %224) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %226 = "tosa.reshape"(%225) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %227 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %228 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %229 = "tosa.slice"(%226) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %230 = "tosa.reshape"(%227) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %231 = "tosa.transpose"(%230, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %232 = "tosa.reshape"(%228) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %233 = "tosa.transpose"(%232, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %234 = "tosa.reshape"(%229) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %235 = "tosa.transpose"(%234, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %236 = "tosa.transpose"(%233, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %237 = "tosa.reshape"(%231) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %238 = "tosa.reshape"(%236) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %239 = "tosa.matmul"(%237, %238) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %240 = "tosa.reshape"(%239) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %241 = "tosa.mul"(%240, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %242 = torch_c.from_builtin_tensor %241 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %243 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %244 = torch.aten.to.dtype %243, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %245 = torch.valsem.aten.copy %244, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %246 = torch.aten.where.self %245, %242, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %247 = torch_c.to_builtin_tensor %246 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %248 = "tosa.reduce_max"(%247) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %249 = "tosa.sub"(%247, %248) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %250 = "tosa.exp"(%249) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %251 = "tosa.reduce_sum"(%250) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %252 = "tosa.reciprocal"(%251) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %253 = "tosa.mul"(%250, %252) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %254 = "tosa.reshape"(%253) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %255 = "tosa.reshape"(%235) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %256 = "tosa.matmul"(%254, %255) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %257 = "tosa.reshape"(%256) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %258 = "tosa.transpose"(%257, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %259 = "tosa.reshape"(%258) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %260 = "tosa.matmul"(%259, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %261 = "tosa.reshape"(%260) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %262 = "tosa.add"(%96, %261) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %263 = "tosa.reshape"(%262) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %264 = "tosa.add"(%263, %211) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %265 = "tosa.reduce_sum"(%264) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %266 = "tosa.mul"(%265, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %267 = "tosa.sub"(%264, %266) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %268 = "tosa.mul"(%267, %267) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %269 = "tosa.reduce_sum"(%268) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %270 = "tosa.mul"(%269, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %271 = "tosa.add"(%270, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %272 = "tosa.rsqrt"(%271) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %273 = "tosa.mul"(%267, %272) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %274 = "tosa.mul"(%273, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %275 = "tosa.add"(%274, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %276 = "tosa.matmul"(%275, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %277 = "tosa.reshape"(%276) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %278 = "tosa.add"(%114, %277) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %279 = "tosa.reshape"(%278) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %280 = "tosa.mul"(%279, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %281 = "tosa.pow"(%279, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %282 = "tosa.mul"(%281, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %283 = "tosa.add"(%279, %282) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %284 = "tosa.mul"(%283, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %285 = "tosa.tanh"(%284) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %286 = "tosa.add"(%285, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %287 = "tosa.mul"(%280, %286) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %288 = "tosa.matmul"(%287, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %289 = "tosa.reshape"(%288) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %290 = "tosa.add"(%96, %289) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %291 = "tosa.reshape"(%290) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %292 = "tosa.add"(%264, %291) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %293 = "tosa.reduce_sum"(%292) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %294 = "tosa.mul"(%293, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %295 = "tosa.sub"(%292, %294) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %296 = "tosa.mul"(%295, %295) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %297 = "tosa.reduce_sum"(%296) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %298 = "tosa.mul"(%297, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %299 = "tosa.add"(%298, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %300 = "tosa.rsqrt"(%299) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %301 = "tosa.mul"(%295, %300) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %302 = "tosa.mul"(%301, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %303 = "tosa.add"(%302, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %304 = "tosa.matmul"(%303, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %305 = "tosa.reshape"(%304) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %306 = "tosa.add"(%51, %305) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %307 = "tosa.reshape"(%306) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %308 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %309 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %310 = "tosa.slice"(%307) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %311 = "tosa.reshape"(%308) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %312 = "tosa.transpose"(%311, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %313 = "tosa.reshape"(%309) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %314 = "tosa.transpose"(%313, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %315 = "tosa.reshape"(%310) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %316 = "tosa.transpose"(%315, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %317 = "tosa.transpose"(%314, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %318 = "tosa.reshape"(%312) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %319 = "tosa.reshape"(%317) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %320 = "tosa.matmul"(%318, %319) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %321 = "tosa.reshape"(%320) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %322 = "tosa.mul"(%321, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %323 = torch_c.from_builtin_tensor %322 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %324 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %325 = torch.aten.to.dtype %324, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %326 = torch.valsem.aten.copy %325, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %327 = torch.aten.where.self %326, %323, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %328 = torch_c.to_builtin_tensor %327 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %329 = "tosa.reduce_max"(%328) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %330 = "tosa.sub"(%328, %329) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %331 = "tosa.exp"(%330) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %332 = "tosa.reduce_sum"(%331) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %333 = "tosa.reciprocal"(%332) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %334 = "tosa.mul"(%331, %333) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %335 = "tosa.reshape"(%334) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %336 = "tosa.reshape"(%316) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %337 = "tosa.matmul"(%335, %336) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %338 = "tosa.reshape"(%337) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %339 = "tosa.transpose"(%338, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %340 = "tosa.reshape"(%339) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %341 = "tosa.matmul"(%340, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %342 = "tosa.reshape"(%341) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %343 = "tosa.add"(%96, %342) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %344 = "tosa.reshape"(%343) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %345 = "tosa.add"(%344, %292) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %346 = "tosa.reduce_sum"(%345) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %347 = "tosa.mul"(%346, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %348 = "tosa.sub"(%345, %347) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %349 = "tosa.mul"(%348, %348) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %350 = "tosa.reduce_sum"(%349) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %351 = "tosa.mul"(%350, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %352 = "tosa.add"(%351, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %353 = "tosa.rsqrt"(%352) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %354 = "tosa.mul"(%348, %353) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %355 = "tosa.mul"(%354, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %356 = "tosa.add"(%355, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %357 = "tosa.matmul"(%356, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %358 = "tosa.reshape"(%357) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %359 = "tosa.add"(%114, %358) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %360 = "tosa.reshape"(%359) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %361 = "tosa.mul"(%360, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %362 = "tosa.pow"(%360, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %363 = "tosa.mul"(%362, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %364 = "tosa.add"(%360, %363) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %365 = "tosa.mul"(%364, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %366 = "tosa.tanh"(%365) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %367 = "tosa.add"(%366, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %368 = "tosa.mul"(%361, %367) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %369 = "tosa.matmul"(%368, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %370 = "tosa.reshape"(%369) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %371 = "tosa.add"(%96, %370) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %372 = "tosa.reshape"(%371) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %373 = "tosa.add"(%345, %372) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %374 = "tosa.reduce_sum"(%373) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %375 = "tosa.mul"(%374, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %376 = "tosa.sub"(%373, %375) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %377 = "tosa.mul"(%376, %376) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %378 = "tosa.reduce_sum"(%377) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %379 = "tosa.mul"(%378, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %380 = "tosa.add"(%379, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %381 = "tosa.rsqrt"(%380) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %382 = "tosa.mul"(%376, %381) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %383 = "tosa.mul"(%382, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %384 = "tosa.add"(%383, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %385 = "tosa.matmul"(%384, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %386 = "tosa.reshape"(%385) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %387 = "tosa.add"(%51, %386) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %388 = "tosa.reshape"(%387) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %389 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %390 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %391 = "tosa.slice"(%388) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %392 = "tosa.reshape"(%389) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %393 = "tosa.transpose"(%392, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %394 = "tosa.reshape"(%390) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %395 = "tosa.transpose"(%394, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %396 = "tosa.reshape"(%391) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %397 = "tosa.transpose"(%396, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %398 = "tosa.transpose"(%395, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %399 = "tosa.reshape"(%393) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %400 = "tosa.reshape"(%398) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %401 = "tosa.matmul"(%399, %400) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %402 = "tosa.reshape"(%401) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %403 = "tosa.mul"(%402, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %404 = torch_c.from_builtin_tensor %403 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %405 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %406 = torch.aten.to.dtype %405, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %407 = torch.valsem.aten.copy %406, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %408 = torch.aten.where.self %407, %404, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %409 = torch_c.to_builtin_tensor %408 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %410 = "tosa.reduce_max"(%409) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %411 = "tosa.sub"(%409, %410) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %412 = "tosa.exp"(%411) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %413 = "tosa.reduce_sum"(%412) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %414 = "tosa.reciprocal"(%413) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %415 = "tosa.mul"(%412, %414) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %416 = "tosa.reshape"(%415) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %417 = "tosa.reshape"(%397) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %418 = "tosa.matmul"(%416, %417) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %419 = "tosa.reshape"(%418) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %420 = "tosa.transpose"(%419, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %421 = "tosa.reshape"(%420) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %422 = "tosa.matmul"(%421, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %423 = "tosa.reshape"(%422) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %424 = "tosa.add"(%96, %423) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %425 = "tosa.reshape"(%424) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %426 = "tosa.add"(%425, %373) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %427 = "tosa.reduce_sum"(%426) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %428 = "tosa.mul"(%427, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %429 = "tosa.sub"(%426, %428) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %430 = "tosa.mul"(%429, %429) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %431 = "tosa.reduce_sum"(%430) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %432 = "tosa.mul"(%431, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %433 = "tosa.add"(%432, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %434 = "tosa.rsqrt"(%433) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %435 = "tosa.mul"(%429, %434) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %436 = "tosa.mul"(%435, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %437 = "tosa.add"(%436, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %438 = "tosa.matmul"(%437, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %439 = "tosa.reshape"(%438) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %440 = "tosa.add"(%114, %439) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %441 = "tosa.reshape"(%440) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %442 = "tosa.mul"(%441, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %443 = "tosa.pow"(%441, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %444 = "tosa.mul"(%443, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %445 = "tosa.add"(%441, %444) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %446 = "tosa.mul"(%445, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %447 = "tosa.tanh"(%446) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %448 = "tosa.add"(%447, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %449 = "tosa.mul"(%442, %448) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %450 = "tosa.matmul"(%449, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %451 = "tosa.reshape"(%450) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %452 = "tosa.add"(%96, %451) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %453 = "tosa.reshape"(%452) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %454 = "tosa.add"(%426, %453) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %455 = "tosa.reduce_sum"(%454) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %456 = "tosa.mul"(%455, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %457 = "tosa.sub"(%454, %456) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %458 = "tosa.mul"(%457, %457) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %459 = "tosa.reduce_sum"(%458) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %460 = "tosa.mul"(%459, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %461 = "tosa.add"(%460, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %462 = "tosa.rsqrt"(%461) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %463 = "tosa.mul"(%457, %462) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %464 = "tosa.mul"(%463, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %465 = "tosa.add"(%464, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %466 = "tosa.matmul"(%465, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %467 = "tosa.reshape"(%466) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %468 = "tosa.add"(%51, %467) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %469 = "tosa.reshape"(%468) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %470 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %471 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %472 = "tosa.slice"(%469) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %473 = "tosa.reshape"(%470) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %474 = "tosa.transpose"(%473, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %475 = "tosa.reshape"(%471) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %476 = "tosa.transpose"(%475, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %477 = "tosa.reshape"(%472) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %478 = "tosa.transpose"(%477, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %479 = "tosa.transpose"(%476, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %480 = "tosa.reshape"(%474) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %481 = "tosa.reshape"(%479) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %482 = "tosa.matmul"(%480, %481) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %483 = "tosa.reshape"(%482) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %484 = "tosa.mul"(%483, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %485 = torch_c.from_builtin_tensor %484 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %486 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %487 = torch.aten.to.dtype %486, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %488 = torch.valsem.aten.copy %487, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %489 = torch.aten.where.self %488, %485, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %490 = torch_c.to_builtin_tensor %489 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %491 = "tosa.reduce_max"(%490) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %492 = "tosa.sub"(%490, %491) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %493 = "tosa.exp"(%492) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %494 = "tosa.reduce_sum"(%493) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %495 = "tosa.reciprocal"(%494) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %496 = "tosa.mul"(%493, %495) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %497 = "tosa.reshape"(%496) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %498 = "tosa.reshape"(%478) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %499 = "tosa.matmul"(%497, %498) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %500 = "tosa.reshape"(%499) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %501 = "tosa.transpose"(%500, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %502 = "tosa.reshape"(%501) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %503 = "tosa.matmul"(%502, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %504 = "tosa.reshape"(%503) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %505 = "tosa.add"(%96, %504) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %506 = "tosa.reshape"(%505) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %507 = "tosa.add"(%506, %454) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %508 = "tosa.reduce_sum"(%507) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %509 = "tosa.mul"(%508, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %510 = "tosa.sub"(%507, %509) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %511 = "tosa.mul"(%510, %510) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %512 = "tosa.reduce_sum"(%511) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %513 = "tosa.mul"(%512, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %514 = "tosa.add"(%513, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %515 = "tosa.rsqrt"(%514) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %516 = "tosa.mul"(%510, %515) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %517 = "tosa.mul"(%516, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %518 = "tosa.add"(%517, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %519 = "tosa.matmul"(%518, %111) : (tensor<1x5x768xf32>, tensor<1x768x3072xf32>) -> tensor<1x5x3072xf32>
  %520 = "tosa.reshape"(%519) {new_shape = [5, 3072]} : (tensor<1x5x3072xf32>) -> tensor<5x3072xf32>
  %521 = "tosa.add"(%114, %520) : (tensor<1x3072xf32>, tensor<5x3072xf32>) -> tensor<5x3072xf32>
  %522 = "tosa.reshape"(%521) {new_shape = [1, 5, 3072]} : (tensor<5x3072xf32>) -> tensor<1x5x3072xf32>
  %523 = "tosa.mul"(%522, %19) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %524 = "tosa.pow"(%522, %18) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %525 = "tosa.mul"(%524, %17) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %526 = "tosa.add"(%522, %525) : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %527 = "tosa.mul"(%526, %16) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %528 = "tosa.tanh"(%527) : (tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %529 = "tosa.add"(%528, %21) : (tensor<1x5x3072xf32>, tensor<1x1x1xf32>) -> tensor<1x5x3072xf32>
  %530 = "tosa.mul"(%523, %529) {shift = 0 : i32} : (tensor<1x5x3072xf32>, tensor<1x5x3072xf32>) -> tensor<1x5x3072xf32>
  %531 = "tosa.matmul"(%530, %125) : (tensor<1x5x3072xf32>, tensor<1x3072x768xf32>) -> tensor<1x5x768xf32>
  %532 = "tosa.reshape"(%531) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %533 = "tosa.add"(%96, %532) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %534 = "tosa.reshape"(%533) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %535 = "tosa.add"(%507, %534) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %536 = "tosa.reduce_sum"(%535) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %537 = "tosa.mul"(%536, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %538 = "tosa.sub"(%535, %537) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %539 = "tosa.mul"(%538, %538) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %540 = "tosa.reduce_sum"(%539) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %541 = "tosa.mul"(%540, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %542 = "tosa.add"(%541, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %543 = "tosa.rsqrt"(%542) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %544 = "tosa.mul"(%538, %543) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %545 = "tosa.mul"(%544, %42) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %546 = "tosa.add"(%545, %42) : (tensor<1x5x768xf32>, tensor<1x1x768xf32>) -> tensor<1x5x768xf32>
  %547 = "tosa.matmul"(%546, %48) : (tensor<1x5x768xf32>, tensor<1x768x2304xf32>) -> tensor<1x5x2304xf32>
  %548 = "tosa.reshape"(%547) {new_shape = [5, 2304]} : (tensor<1x5x2304xf32>) -> tensor<5x2304xf32>
  %549 = "tosa.add"(%51, %548) : (tensor<1x2304xf32>, tensor<5x2304xf32>) -> tensor<5x2304xf32>
  %550 = "tosa.reshape"(%549) {new_shape = [1, 5, 2304]} : (tensor<5x2304xf32>) -> tensor<1x5x2304xf32>
  %551 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 0]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %552 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 768]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %553 = "tosa.slice"(%550) {size = [1, 5, 768], start = [0, 0, 1536]} : (tensor<1x5x2304xf32>) -> tensor<1x5x768xf32>
  %554 = "tosa.reshape"(%551) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %555 = "tosa.transpose"(%554, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %556 = "tosa.reshape"(%552) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %557 = "tosa.transpose"(%556, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %558 = "tosa.reshape"(%553) {new_shape = [1, 5, 12, 64]} : (tensor<1x5x768xf32>) -> tensor<1x5x12x64xf32>
  %559 = "tosa.transpose"(%558, %13) : (tensor<1x5x12x64xf32>, tensor<4xi64>) -> tensor<1x12x5x64xf32>
  %560 = "tosa.transpose"(%557, %14) : (tensor<1x12x5x64xf32>, tensor<4xi32>) -> tensor<1x12x64x5xf32>
  %561 = "tosa.reshape"(%555) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %562 = "tosa.reshape"(%560) {new_shape = [12, 64, 5]} : (tensor<1x12x64x5xf32>) -> tensor<12x64x5xf32>
  %563 = "tosa.matmul"(%561, %562) : (tensor<12x5x64xf32>, tensor<12x64x5xf32>) -> tensor<12x5x5xf32>
  %564 = "tosa.reshape"(%563) {new_shape = [1, 12, 5, 5]} : (tensor<12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %565 = "tosa.mul"(%564, %69) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x1x1x1xf32>) -> tensor<1x12x5x5xf32>
  %566 = torch_c.from_builtin_tensor %565 : tensor<1x12x5x5xf32> -> !torch.vtensor<[1,12,5,5],f32>
  %567 = torch.prim.NumToTensor.Scalar %float0.000000e00 : !torch.float -> !torch.vtensor<[],f64>
  %568 = torch.aten.to.dtype %567, %int11, %false, %false, %none : !torch.vtensor<[],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[],i1>
  %569 = torch.valsem.aten.copy %568, %74, %false : !torch.vtensor<[],i1>, !torch.vtensor<[1,1,5,5],ui8>, !torch.bool -> !torch.vtensor<[1,1,5,5],i1>
  %570 = torch.aten.where.self %569, %566, %78 : !torch.vtensor<[1,1,5,5],i1>, !torch.vtensor<[1,12,5,5],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,12,5,5],f32>
  %571 = torch_c.to_builtin_tensor %570 : !torch.vtensor<[1,12,5,5],f32> -> tensor<1x12x5x5xf32>
  %572 = "tosa.reduce_max"(%571) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %573 = "tosa.sub"(%571, %572) : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %574 = "tosa.exp"(%573) : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x5xf32>
  %575 = "tosa.reduce_sum"(%574) {axis = 3 : i64} : (tensor<1x12x5x5xf32>) -> tensor<1x12x5x1xf32>
  %576 = "tosa.reciprocal"(%575) : (tensor<1x12x5x1xf32>) -> tensor<1x12x5x1xf32>
  %577 = "tosa.mul"(%574, %576) {shift = 0 : i32} : (tensor<1x12x5x5xf32>, tensor<1x12x5x1xf32>) -> tensor<1x12x5x5xf32>
  %578 = "tosa.reshape"(%577) {new_shape = [12, 5, 5]} : (tensor<1x12x5x5xf32>) -> tensor<12x5x5xf32>
  %579 = "tosa.reshape"(%559) {new_shape = [12, 5, 64]} : (tensor<1x12x5x64xf32>) -> tensor<12x5x64xf32>
  %580 = "tosa.matmul"(%578, %579) : (tensor<12x5x5xf32>, tensor<12x5x64xf32>) -> tensor<12x5x64xf32>
  %581 = "tosa.reshape"(%580) {new_shape = [1, 12, 5, 64]} : (tensor<12x5x64xf32>) -> tensor<1x12x5x64xf32>
  %582 = "tosa.transpose"(%581, %13) : (tensor<1x12x5x64xf32>, tensor<4xi64>) -> tensor<1x5x12x64xf32>
  %583 = "tosa.reshape"(%582) {new_shape = [1, 5, 768]} : (tensor<1x5x12x64xf32>) -> tensor<1x5x768xf32>
  %584 = "tosa.matmul"(%583, %93) : (tensor<1x5x768xf32>, tensor<1x768x768xf32>) -> tensor<1x5x768xf32>
  %585 = "tosa.reshape"(%584) {new_shape = [5, 768]} : (tensor<1x5x768xf32>) -> tensor<5x768xf32>
  %586 = "tosa.add"(%96, %585) : (tensor<1x768xf32>, tensor<5x768xf32>) -> tensor<5x768xf32>
  %587 = "tosa.reshape"(%586) {new_shape = [1, 5, 768]} : (tensor<5x768xf32>) -> tensor<1x5x768xf32>
  %588 = "tosa.add"(%587, %535) : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %589 = "tosa.reduce_sum"(%588) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %590 = "tosa.mul"(%589, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %591 = "tosa.sub"(%588, %590) : (tensor<1x5x768xf32>, tensor<1x5x1xf32>) -> tensor<1x5x768xf32>
  %592 = "tosa.mul"(%591, %591) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x768xf32>) -> tensor<1x5x768xf32>
  %593 = "tosa.reduce_sum"(%592) {axis = 2 : i64} : (tensor<1x5x768xf32>) -> tensor<1x5x1xf32>
  %594 = "tosa.mul"(%593, %36) {shift = 0 : i32} : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %595 = "tosa.add"(%594, %20) : (tensor<1x5x1xf32>, tensor<1x1x1xf32>) -> tensor<1x5x1xf32>
  %596 = "tosa.rsqrt"(%595) : (tensor<1x5x1xf32>) -> tensor<1x5x1xf32>
  %597 = "tosa.mul"(%591, %596) {shift = 0 : i32} : (tensor<1x5x768xf32>, tensor<1x5x1xf