Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created January 2, 2023 05:37
Show Gist options
  • Save AmosLewis/d5ca2b2d4fe3aee29add60db32a8b987 to your computer and use it in GitHub Desktop.
module attributes {torch.debug_module_name = "_lambda"} {
// Accessor generated by torch.fx: returns the "_code" attribute (the FX-emitted
// Python source string) stored on the graph-module object.
func.func private @__torch__.torch.fx.graph_module._lambda.__code_getter(%arg0: !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda">) -> !torch.str {
%code = torch.prim.GetAttr %arg0["_code"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.str
return %code : !torch.str
}
func.func private @__torch__.torch.fx.graph_module._lambda.forward(%arg0: !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda">, %arg1: !torch.tensor {torch.type_bound = !torch.vtensor<[1,5],si64>}) -> !torch.tensor {
%int11 = torch.constant.int 11
%int-2 = torch.constant.int -2
%none_0 = torch.constant.none
%false = torch.constant.bool false
%cpu = torch.constant.device "cpu"
%int4 = torch.constant.int 4
%int-1 = torch.constant.int -1
%int1 = torch.constant.int 1
%int5 = torch.constant.int 5
%int0 = torch.constant.int 0
%int768 = torch.constant.int 768
%float1.000000e-05 = torch.constant.float 1.000000e-05
%int2 = torch.constant.int 2
%int2304 = torch.constant.int 2304
%int1536 = torch.constant.int 1536
%int12 = torch.constant.int 12
%int64 = torch.constant.int 64
%int3 = torch.constant.int 3
%int9223372036854775807 = torch.constant.int 9223372036854775807
%int3072 = torch.constant.int 3072
%float5.000000e-01 = torch.constant.float 5.000000e-01
%float3.000000e00 = torch.constant.float 3.000000e+00
%float4.471500e-02 = torch.constant.float 4.471500e-02
%float7.978850e-01 = torch.constant.float 0.79788456080286541
%float1.000000e00 = torch.constant.float 1.000000e+00
%int50257 = torch.constant.int 50257
%186 = torch.prim.ListConstruct %int-1, %int5 : (!torch.int, !torch.int) -> !torch.list<int>
%187 = torch.aten.view %arg1, %186 : !torch.tensor, !torch.list<int> -> !torch.tensor
%188 = torch.aten.arange.start %int0, %int5, %int4, %none_0, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.tensor
%189 = torch.aten.unsqueeze %188, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%190 = torch.prim.ListConstruct %int-1, %int5 : (!torch.int, !torch.int) -> !torch.list<int>
%191 = torch.aten.view %189, %190 : !torch.tensor, !torch.list<int> -> !torch.tensor
%192 = torch.prim.GetAttr %arg0["_param_constant0"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%193 = torch.aten.embedding %192, %187, %int-1, %false, %false : !torch.tensor, !torch.tensor, !torch.int, !torch.bool, !torch.bool -> !torch.tensor
%194 = torch.prim.GetAttr %arg0["_param_constant1"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%195 = torch.aten.embedding %194, %191, %int-1, %false, %false : !torch.tensor, !torch.tensor, !torch.int, !torch.bool, !torch.bool -> !torch.tensor
%196 = torch.aten.add.Tensor %193, %195, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%197 = torch.prim.GetAttr %arg0["_param_constant2"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%198 = torch.prim.GetAttr %arg0["_param_constant3"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%199 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0, %result1, %result2 = torch.aten.native_layer_norm %196, %199, %197, %198, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%200 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%201 = torch.aten.view %result0, %200 : !torch.tensor, !torch.list<int> -> !torch.tensor
%202 = torch.prim.GetAttr %arg0["_param_constant4"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%203 = torch.prim.GetAttr %arg0["_param_constant5"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%204 = torch.aten.addmm %202, %201, %203, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%205 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%206 = torch.aten.view %204, %205 : !torch.tensor, !torch.list<int> -> !torch.tensor
%207 = torch.aten.slice.Tensor %206, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%208 = torch.aten.slice.Tensor %206, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%209 = torch.aten.slice.Tensor %206, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%210 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%211 = torch.aten.view %207, %210 : !torch.tensor, !torch.list<int> -> !torch.tensor
%212 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%213 = torch.aten.permute %211, %212 : !torch.tensor, !torch.list<int> -> !torch.tensor
%214 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%215 = torch.aten.view %208, %214 : !torch.tensor, !torch.list<int> -> !torch.tensor
%216 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%217 = torch.aten.permute %215, %216 : !torch.tensor, !torch.list<int> -> !torch.tensor
%218 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%219 = torch.aten.view %209, %218 : !torch.tensor, !torch.list<int> -> !torch.tensor
%220 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%221 = torch.aten.permute %219, %220 : !torch.tensor, !torch.list<int> -> !torch.tensor
%222 = torch.aten.transpose.int %217, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%223 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%224 = torch.aten.expand %213, %223, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%225 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%226 = torch.aten.view %224, %225 : !torch.tensor, !torch.list<int> -> !torch.tensor
%227 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%228 = torch.aten.expand %222, %227, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%229 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%230 = torch.aten.view %228, %229 : !torch.tensor, !torch.list<int> -> !torch.tensor
%231 = torch.aten.bmm %226, %230 : !torch.tensor, !torch.tensor -> !torch.tensor
%232 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%233 = torch.aten._unsafe_view %231, %232 : !torch.tensor, !torch.list<int> -> !torch.tensor
%234 = torch.prim.GetAttr %arg0["_tensor_constant0"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%235 = torch.aten.lift_fresh_copy %234 : !torch.tensor -> !torch.tensor
%236 = torch.aten.div.Tensor %233, %235 : !torch.tensor, !torch.tensor -> !torch.tensor
%237 = torch.prim.GetAttr %arg0["_tensor_constant1"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%238 = torch.aten.slice.Tensor %237, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%239 = torch.aten.slice.Tensor %238, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%240 = torch.aten.slice.Tensor %239, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%241 = torch.aten.slice.Tensor %240, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%242 = torch.aten._to_copy %241, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%243 = torch.prim.GetAttr %arg0["_tensor_constant2"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%244 = torch.aten.lift_fresh_copy %243 : !torch.tensor -> !torch.tensor
%245 = torch.aten.where.self %242, %236, %244 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%246 = torch.aten._softmax %245, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%247 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%248 = torch.aten.expand %246, %247, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%249 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%250 = torch.aten.view %248, %249 : !torch.tensor, !torch.list<int> -> !torch.tensor
%251 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%252 = torch.aten.expand %221, %251, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%253 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%254 = torch.aten.view %252, %253 : !torch.tensor, !torch.list<int> -> !torch.tensor
%255 = torch.aten.bmm %250, %254 : !torch.tensor, !torch.tensor -> !torch.tensor
%256 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%257 = torch.aten._unsafe_view %255, %256 : !torch.tensor, !torch.list<int> -> !torch.tensor
%258 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%259 = torch.aten.permute %257, %258 : !torch.tensor, !torch.list<int> -> !torch.tensor
%260 = torch.aten.clone %259, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%261 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%262 = torch.aten.view %260, %261 : !torch.tensor, !torch.list<int> -> !torch.tensor
%263 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%264 = torch.aten.view %262, %263 : !torch.tensor, !torch.list<int> -> !torch.tensor
%265 = torch.prim.GetAttr %arg0["_param_constant6"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%266 = torch.prim.GetAttr %arg0["_param_constant7"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%267 = torch.aten.addmm %265, %264, %266, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%268 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%269 = torch.aten.view %267, %268 : !torch.tensor, !torch.list<int> -> !torch.tensor
%270 = torch.aten.add.Tensor %269, %196, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%271 = torch.prim.GetAttr %arg0["_param_constant8"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%272 = torch.prim.GetAttr %arg0["_param_constant9"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%273 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_1, %result1_2, %result2_3 = torch.aten.native_layer_norm %270, %273, %271, %272, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%274 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%275 = torch.aten.view %result0_1, %274 : !torch.tensor, !torch.list<int> -> !torch.tensor
%276 = torch.prim.GetAttr %arg0["_param_constant10"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%277 = torch.prim.GetAttr %arg0["_param_constant11"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%278 = torch.aten.addmm %276, %275, %277, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%279 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%280 = torch.aten.view %278, %279 : !torch.tensor, !torch.list<int> -> !torch.tensor
%281 = torch.aten.mul.Scalar %280, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%282 = torch.aten.pow.Tensor_Scalar %280, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%283 = torch.aten.mul.Scalar %282, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%284 = torch.aten.add.Tensor %280, %283, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%285 = torch.aten.mul.Scalar %284, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%286 = torch.aten.tanh %285 : !torch.tensor -> !torch.tensor
%287 = torch.aten.add.Scalar %286, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%288 = torch.aten.mul.Tensor %281, %287 : !torch.tensor, !torch.tensor -> !torch.tensor
%289 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%290 = torch.aten.view %288, %289 : !torch.tensor, !torch.list<int> -> !torch.tensor
%291 = torch.prim.GetAttr %arg0["_param_constant12"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%292 = torch.prim.GetAttr %arg0["_param_constant13"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%293 = torch.aten.addmm %291, %290, %292, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%294 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%295 = torch.aten.view %293, %294 : !torch.tensor, !torch.list<int> -> !torch.tensor
%296 = torch.aten.add.Tensor %270, %295, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%297 = torch.prim.GetAttr %arg0["_param_constant14"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%298 = torch.prim.GetAttr %arg0["_param_constant15"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%299 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_4, %result1_5, %result2_6 = torch.aten.native_layer_norm %296, %299, %297, %298, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%300 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%301 = torch.aten.view %result0_4, %300 : !torch.tensor, !torch.list<int> -> !torch.tensor
%302 = torch.prim.GetAttr %arg0["_param_constant16"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%303 = torch.prim.GetAttr %arg0["_param_constant17"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%304 = torch.aten.addmm %302, %301, %303, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%305 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%306 = torch.aten.view %304, %305 : !torch.tensor, !torch.list<int> -> !torch.tensor
%307 = torch.aten.slice.Tensor %306, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%308 = torch.aten.slice.Tensor %306, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%309 = torch.aten.slice.Tensor %306, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%310 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%311 = torch.aten.view %307, %310 : !torch.tensor, !torch.list<int> -> !torch.tensor
%312 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%313 = torch.aten.permute %311, %312 : !torch.tensor, !torch.list<int> -> !torch.tensor
%314 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%315 = torch.aten.view %308, %314 : !torch.tensor, !torch.list<int> -> !torch.tensor
%316 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%317 = torch.aten.permute %315, %316 : !torch.tensor, !torch.list<int> -> !torch.tensor
%318 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%319 = torch.aten.view %309, %318 : !torch.tensor, !torch.list<int> -> !torch.tensor
%320 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%321 = torch.aten.permute %319, %320 : !torch.tensor, !torch.list<int> -> !torch.tensor
%322 = torch.aten.transpose.int %317, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%323 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%324 = torch.aten.expand %313, %323, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%325 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%326 = torch.aten.view %324, %325 : !torch.tensor, !torch.list<int> -> !torch.tensor
%327 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%328 = torch.aten.expand %322, %327, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%329 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%330 = torch.aten.view %328, %329 : !torch.tensor, !torch.list<int> -> !torch.tensor
%331 = torch.aten.bmm %326, %330 : !torch.tensor, !torch.tensor -> !torch.tensor
%332 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%333 = torch.aten._unsafe_view %331, %332 : !torch.tensor, !torch.list<int> -> !torch.tensor
%334 = torch.prim.GetAttr %arg0["_tensor_constant3"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%335 = torch.aten.lift_fresh_copy %334 : !torch.tensor -> !torch.tensor
%336 = torch.aten.div.Tensor %333, %335 : !torch.tensor, !torch.tensor -> !torch.tensor
%337 = torch.prim.GetAttr %arg0["_tensor_constant4"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%338 = torch.aten.slice.Tensor %337, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%339 = torch.aten.slice.Tensor %338, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%340 = torch.aten.slice.Tensor %339, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%341 = torch.aten.slice.Tensor %340, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%342 = torch.aten._to_copy %341, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%343 = torch.prim.GetAttr %arg0["_tensor_constant5"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%344 = torch.aten.lift_fresh_copy %343 : !torch.tensor -> !torch.tensor
%345 = torch.aten.where.self %342, %336, %344 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%346 = torch.aten._softmax %345, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%347 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%348 = torch.aten.expand %346, %347, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%349 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%350 = torch.aten.view %348, %349 : !torch.tensor, !torch.list<int> -> !torch.tensor
%351 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%352 = torch.aten.expand %321, %351, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%353 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%354 = torch.aten.view %352, %353 : !torch.tensor, !torch.list<int> -> !torch.tensor
%355 = torch.aten.bmm %350, %354 : !torch.tensor, !torch.tensor -> !torch.tensor
%356 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%357 = torch.aten._unsafe_view %355, %356 : !torch.tensor, !torch.list<int> -> !torch.tensor
%358 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%359 = torch.aten.permute %357, %358 : !torch.tensor, !torch.list<int> -> !torch.tensor
%360 = torch.aten.clone %359, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%361 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%362 = torch.aten.view %360, %361 : !torch.tensor, !torch.list<int> -> !torch.tensor
%363 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%364 = torch.aten.view %362, %363 : !torch.tensor, !torch.list<int> -> !torch.tensor
%365 = torch.prim.GetAttr %arg0["_param_constant18"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%366 = torch.prim.GetAttr %arg0["_param_constant19"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%367 = torch.aten.addmm %365, %364, %366, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%368 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%369 = torch.aten.view %367, %368 : !torch.tensor, !torch.list<int> -> !torch.tensor
%370 = torch.aten.add.Tensor %369, %296, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%371 = torch.prim.GetAttr %arg0["_param_constant20"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%372 = torch.prim.GetAttr %arg0["_param_constant21"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%373 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_7, %result1_8, %result2_9 = torch.aten.native_layer_norm %370, %373, %371, %372, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%374 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%375 = torch.aten.view %result0_7, %374 : !torch.tensor, !torch.list<int> -> !torch.tensor
%376 = torch.prim.GetAttr %arg0["_param_constant22"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%377 = torch.prim.GetAttr %arg0["_param_constant23"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%378 = torch.aten.addmm %376, %375, %377, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%379 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%380 = torch.aten.view %378, %379 : !torch.tensor, !torch.list<int> -> !torch.tensor
%381 = torch.aten.mul.Scalar %380, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%382 = torch.aten.pow.Tensor_Scalar %380, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%383 = torch.aten.mul.Scalar %382, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%384 = torch.aten.add.Tensor %380, %383, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%385 = torch.aten.mul.Scalar %384, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%386 = torch.aten.tanh %385 : !torch.tensor -> !torch.tensor
%387 = torch.aten.add.Scalar %386, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%388 = torch.aten.mul.Tensor %381, %387 : !torch.tensor, !torch.tensor -> !torch.tensor
%389 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%390 = torch.aten.view %388, %389 : !torch.tensor, !torch.list<int> -> !torch.tensor
%391 = torch.prim.GetAttr %arg0["_param_constant24"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%392 = torch.prim.GetAttr %arg0["_param_constant25"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%393 = torch.aten.addmm %391, %390, %392, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%394 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%395 = torch.aten.view %393, %394 : !torch.tensor, !torch.list<int> -> !torch.tensor
%396 = torch.aten.add.Tensor %370, %395, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%397 = torch.prim.GetAttr %arg0["_param_constant26"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%398 = torch.prim.GetAttr %arg0["_param_constant27"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%399 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_10, %result1_11, %result2_12 = torch.aten.native_layer_norm %396, %399, %397, %398, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%400 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%401 = torch.aten.view %result0_10, %400 : !torch.tensor, !torch.list<int> -> !torch.tensor
%402 = torch.prim.GetAttr %arg0["_param_constant28"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%403 = torch.prim.GetAttr %arg0["_param_constant29"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%404 = torch.aten.addmm %402, %401, %403, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%405 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%406 = torch.aten.view %404, %405 : !torch.tensor, !torch.list<int> -> !torch.tensor
%407 = torch.aten.slice.Tensor %406, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%408 = torch.aten.slice.Tensor %406, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%409 = torch.aten.slice.Tensor %406, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%410 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%411 = torch.aten.view %407, %410 : !torch.tensor, !torch.list<int> -> !torch.tensor
%412 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%413 = torch.aten.permute %411, %412 : !torch.tensor, !torch.list<int> -> !torch.tensor
%414 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%415 = torch.aten.view %408, %414 : !torch.tensor, !torch.list<int> -> !torch.tensor
%416 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%417 = torch.aten.permute %415, %416 : !torch.tensor, !torch.list<int> -> !torch.tensor
%418 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%419 = torch.aten.view %409, %418 : !torch.tensor, !torch.list<int> -> !torch.tensor
%420 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%421 = torch.aten.permute %419, %420 : !torch.tensor, !torch.list<int> -> !torch.tensor
%422 = torch.aten.transpose.int %417, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%423 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%424 = torch.aten.expand %413, %423, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%425 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%426 = torch.aten.view %424, %425 : !torch.tensor, !torch.list<int> -> !torch.tensor
%427 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%428 = torch.aten.expand %422, %427, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%429 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%430 = torch.aten.view %428, %429 : !torch.tensor, !torch.list<int> -> !torch.tensor
%431 = torch.aten.bmm %426, %430 : !torch.tensor, !torch.tensor -> !torch.tensor
%432 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%433 = torch.aten._unsafe_view %431, %432 : !torch.tensor, !torch.list<int> -> !torch.tensor
%434 = torch.prim.GetAttr %arg0["_tensor_constant6"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%435 = torch.aten.lift_fresh_copy %434 : !torch.tensor -> !torch.tensor
%436 = torch.aten.div.Tensor %433, %435 : !torch.tensor, !torch.tensor -> !torch.tensor
%437 = torch.prim.GetAttr %arg0["_tensor_constant7"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%438 = torch.aten.slice.Tensor %437, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%439 = torch.aten.slice.Tensor %438, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%440 = torch.aten.slice.Tensor %439, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%441 = torch.aten.slice.Tensor %440, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%442 = torch.aten._to_copy %441, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%443 = torch.prim.GetAttr %arg0["_tensor_constant8"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%444 = torch.aten.lift_fresh_copy %443 : !torch.tensor -> !torch.tensor
%445 = torch.aten.where.self %442, %436, %444 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%446 = torch.aten._softmax %445, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%447 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%448 = torch.aten.expand %446, %447, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%449 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%450 = torch.aten.view %448, %449 : !torch.tensor, !torch.list<int> -> !torch.tensor
%451 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%452 = torch.aten.expand %421, %451, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%453 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%454 = torch.aten.view %452, %453 : !torch.tensor, !torch.list<int> -> !torch.tensor
%455 = torch.aten.bmm %450, %454 : !torch.tensor, !torch.tensor -> !torch.tensor
%456 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%457 = torch.aten._unsafe_view %455, %456 : !torch.tensor, !torch.list<int> -> !torch.tensor
%458 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%459 = torch.aten.permute %457, %458 : !torch.tensor, !torch.list<int> -> !torch.tensor
%460 = torch.aten.clone %459, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%461 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%462 = torch.aten.view %460, %461 : !torch.tensor, !torch.list<int> -> !torch.tensor
%463 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%464 = torch.aten.view %462, %463 : !torch.tensor, !torch.list<int> -> !torch.tensor
%465 = torch.prim.GetAttr %arg0["_param_constant30"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%466 = torch.prim.GetAttr %arg0["_param_constant31"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%467 = torch.aten.addmm %465, %464, %466, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%468 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%469 = torch.aten.view %467, %468 : !torch.tensor, !torch.list<int> -> !torch.tensor
%470 = torch.aten.add.Tensor %469, %396, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%471 = torch.prim.GetAttr %arg0["_param_constant32"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%472 = torch.prim.GetAttr %arg0["_param_constant33"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%473 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_13, %result1_14, %result2_15 = torch.aten.native_layer_norm %470, %473, %471, %472, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%474 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%475 = torch.aten.view %result0_13, %474 : !torch.tensor, !torch.list<int> -> !torch.tensor
%476 = torch.prim.GetAttr %arg0["_param_constant34"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%477 = torch.prim.GetAttr %arg0["_param_constant35"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%478 = torch.aten.addmm %476, %475, %477, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%479 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%480 = torch.aten.view %478, %479 : !torch.tensor, !torch.list<int> -> !torch.tensor
%481 = torch.aten.mul.Scalar %480, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%482 = torch.aten.pow.Tensor_Scalar %480, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%483 = torch.aten.mul.Scalar %482, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%484 = torch.aten.add.Tensor %480, %483, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%485 = torch.aten.mul.Scalar %484, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%486 = torch.aten.tanh %485 : !torch.tensor -> !torch.tensor
%487 = torch.aten.add.Scalar %486, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%488 = torch.aten.mul.Tensor %481, %487 : !torch.tensor, !torch.tensor -> !torch.tensor
%489 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%490 = torch.aten.view %488, %489 : !torch.tensor, !torch.list<int> -> !torch.tensor
%491 = torch.prim.GetAttr %arg0["_param_constant36"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%492 = torch.prim.GetAttr %arg0["_param_constant37"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%493 = torch.aten.addmm %491, %490, %492, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%494 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%495 = torch.aten.view %493, %494 : !torch.tensor, !torch.list<int> -> !torch.tensor
%496 = torch.aten.add.Tensor %470, %495, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%497 = torch.prim.GetAttr %arg0["_param_constant38"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%498 = torch.prim.GetAttr %arg0["_param_constant39"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%499 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_16, %result1_17, %result2_18 = torch.aten.native_layer_norm %496, %499, %497, %498, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%500 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%501 = torch.aten.view %result0_16, %500 : !torch.tensor, !torch.list<int> -> !torch.tensor
%502 = torch.prim.GetAttr %arg0["_param_constant40"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%503 = torch.prim.GetAttr %arg0["_param_constant41"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%504 = torch.aten.addmm %502, %501, %503, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%505 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%506 = torch.aten.view %504, %505 : !torch.tensor, !torch.list<int> -> !torch.tensor
%507 = torch.aten.slice.Tensor %506, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%508 = torch.aten.slice.Tensor %506, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%509 = torch.aten.slice.Tensor %506, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%510 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%511 = torch.aten.view %507, %510 : !torch.tensor, !torch.list<int> -> !torch.tensor
%512 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%513 = torch.aten.permute %511, %512 : !torch.tensor, !torch.list<int> -> !torch.tensor
%514 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%515 = torch.aten.view %508, %514 : !torch.tensor, !torch.list<int> -> !torch.tensor
%516 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%517 = torch.aten.permute %515, %516 : !torch.tensor, !torch.list<int> -> !torch.tensor
%518 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%519 = torch.aten.view %509, %518 : !torch.tensor, !torch.list<int> -> !torch.tensor
%520 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%521 = torch.aten.permute %519, %520 : !torch.tensor, !torch.list<int> -> !torch.tensor
%522 = torch.aten.transpose.int %517, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%523 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%524 = torch.aten.expand %513, %523, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%525 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%526 = torch.aten.view %524, %525 : !torch.tensor, !torch.list<int> -> !torch.tensor
%527 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%528 = torch.aten.expand %522, %527, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%529 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%530 = torch.aten.view %528, %529 : !torch.tensor, !torch.list<int> -> !torch.tensor
%531 = torch.aten.bmm %526, %530 : !torch.tensor, !torch.tensor -> !torch.tensor
%532 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%533 = torch.aten._unsafe_view %531, %532 : !torch.tensor, !torch.list<int> -> !torch.tensor
%534 = torch.prim.GetAttr %arg0["_tensor_constant9"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%535 = torch.aten.lift_fresh_copy %534 : !torch.tensor -> !torch.tensor
%536 = torch.aten.div.Tensor %533, %535 : !torch.tensor, !torch.tensor -> !torch.tensor
%537 = torch.prim.GetAttr %arg0["_tensor_constant10"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%538 = torch.aten.slice.Tensor %537, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%539 = torch.aten.slice.Tensor %538, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%540 = torch.aten.slice.Tensor %539, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%541 = torch.aten.slice.Tensor %540, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%542 = torch.aten._to_copy %541, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%543 = torch.prim.GetAttr %arg0["_tensor_constant11"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%544 = torch.aten.lift_fresh_copy %543 : !torch.tensor -> !torch.tensor
%545 = torch.aten.where.self %542, %536, %544 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%546 = torch.aten._softmax %545, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%547 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%548 = torch.aten.expand %546, %547, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%549 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%550 = torch.aten.view %548, %549 : !torch.tensor, !torch.list<int> -> !torch.tensor
%551 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%552 = torch.aten.expand %521, %551, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%553 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%554 = torch.aten.view %552, %553 : !torch.tensor, !torch.list<int> -> !torch.tensor
%555 = torch.aten.bmm %550, %554 : !torch.tensor, !torch.tensor -> !torch.tensor
%556 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%557 = torch.aten._unsafe_view %555, %556 : !torch.tensor, !torch.list<int> -> !torch.tensor
%558 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%559 = torch.aten.permute %557, %558 : !torch.tensor, !torch.list<int> -> !torch.tensor
%560 = torch.aten.clone %559, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%561 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%562 = torch.aten.view %560, %561 : !torch.tensor, !torch.list<int> -> !torch.tensor
%563 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%564 = torch.aten.view %562, %563 : !torch.tensor, !torch.list<int> -> !torch.tensor
%565 = torch.prim.GetAttr %arg0["_param_constant42"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%566 = torch.prim.GetAttr %arg0["_param_constant43"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%567 = torch.aten.addmm %565, %564, %566, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%568 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%569 = torch.aten.view %567, %568 : !torch.tensor, !torch.list<int> -> !torch.tensor
%570 = torch.aten.add.Tensor %569, %496, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%571 = torch.prim.GetAttr %arg0["_param_constant44"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%572 = torch.prim.GetAttr %arg0["_param_constant45"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%573 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_19, %result1_20, %result2_21 = torch.aten.native_layer_norm %570, %573, %571, %572, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%574 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%575 = torch.aten.view %result0_19, %574 : !torch.tensor, !torch.list<int> -> !torch.tensor
%576 = torch.prim.GetAttr %arg0["_param_constant46"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%577 = torch.prim.GetAttr %arg0["_param_constant47"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%578 = torch.aten.addmm %576, %575, %577, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%579 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%580 = torch.aten.view %578, %579 : !torch.tensor, !torch.list<int> -> !torch.tensor
%581 = torch.aten.mul.Scalar %580, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%582 = torch.aten.pow.Tensor_Scalar %580, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%583 = torch.aten.mul.Scalar %582, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%584 = torch.aten.add.Tensor %580, %583, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%585 = torch.aten.mul.Scalar %584, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%586 = torch.aten.tanh %585 : !torch.tensor -> !torch.tensor
%587 = torch.aten.add.Scalar %586, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%588 = torch.aten.mul.Tensor %581, %587 : !torch.tensor, !torch.tensor -> !torch.tensor
%589 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%590 = torch.aten.view %588, %589 : !torch.tensor, !torch.list<int> -> !torch.tensor
%591 = torch.prim.GetAttr %arg0["_param_constant48"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%592 = torch.prim.GetAttr %arg0["_param_constant49"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%593 = torch.aten.addmm %591, %590, %592, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%594 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%595 = torch.aten.view %593, %594 : !torch.tensor, !torch.list<int> -> !torch.tensor
%596 = torch.aten.add.Tensor %570, %595, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%597 = torch.prim.GetAttr %arg0["_param_constant50"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%598 = torch.prim.GetAttr %arg0["_param_constant51"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%599 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_22, %result1_23, %result2_24 = torch.aten.native_layer_norm %596, %599, %597, %598, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%600 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%601 = torch.aten.view %result0_22, %600 : !torch.tensor, !torch.list<int> -> !torch.tensor
%602 = torch.prim.GetAttr %arg0["_param_constant52"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%603 = torch.prim.GetAttr %arg0["_param_constant53"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%604 = torch.aten.addmm %602, %601, %603, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%605 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%606 = torch.aten.view %604, %605 : !torch.tensor, !torch.list<int> -> !torch.tensor
%607 = torch.aten.slice.Tensor %606, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%608 = torch.aten.slice.Tensor %606, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%609 = torch.aten.slice.Tensor %606, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%610 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%611 = torch.aten.view %607, %610 : !torch.tensor, !torch.list<int> -> !torch.tensor
%612 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%613 = torch.aten.permute %611, %612 : !torch.tensor, !torch.list<int> -> !torch.tensor
%614 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%615 = torch.aten.view %608, %614 : !torch.tensor, !torch.list<int> -> !torch.tensor
%616 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%617 = torch.aten.permute %615, %616 : !torch.tensor, !torch.list<int> -> !torch.tensor
%618 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%619 = torch.aten.view %609, %618 : !torch.tensor, !torch.list<int> -> !torch.tensor
%620 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%621 = torch.aten.permute %619, %620 : !torch.tensor, !torch.list<int> -> !torch.tensor
%622 = torch.aten.transpose.int %617, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%623 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%624 = torch.aten.expand %613, %623, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%625 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%626 = torch.aten.view %624, %625 : !torch.tensor, !torch.list<int> -> !torch.tensor
%627 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%628 = torch.aten.expand %622, %627, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%629 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%630 = torch.aten.view %628, %629 : !torch.tensor, !torch.list<int> -> !torch.tensor
%631 = torch.aten.bmm %626, %630 : !torch.tensor, !torch.tensor -> !torch.tensor
%632 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%633 = torch.aten._unsafe_view %631, %632 : !torch.tensor, !torch.list<int> -> !torch.tensor
%634 = torch.prim.GetAttr %arg0["_tensor_constant12"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%635 = torch.aten.lift_fresh_copy %634 : !torch.tensor -> !torch.tensor
%636 = torch.aten.div.Tensor %633, %635 : !torch.tensor, !torch.tensor -> !torch.tensor
%637 = torch.prim.GetAttr %arg0["_tensor_constant13"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%638 = torch.aten.slice.Tensor %637, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%639 = torch.aten.slice.Tensor %638, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%640 = torch.aten.slice.Tensor %639, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%641 = torch.aten.slice.Tensor %640, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%642 = torch.aten._to_copy %641, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%643 = torch.prim.GetAttr %arg0["_tensor_constant14"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%644 = torch.aten.lift_fresh_copy %643 : !torch.tensor -> !torch.tensor
%645 = torch.aten.where.self %642, %636, %644 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%646 = torch.aten._softmax %645, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%647 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%648 = torch.aten.expand %646, %647, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%649 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%650 = torch.aten.view %648, %649 : !torch.tensor, !torch.list<int> -> !torch.tensor
%651 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%652 = torch.aten.expand %621, %651, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%653 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%654 = torch.aten.view %652, %653 : !torch.tensor, !torch.list<int> -> !torch.tensor
%655 = torch.aten.bmm %650, %654 : !torch.tensor, !torch.tensor -> !torch.tensor
%656 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%657 = torch.aten._unsafe_view %655, %656 : !torch.tensor, !torch.list<int> -> !torch.tensor
%658 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%659 = torch.aten.permute %657, %658 : !torch.tensor, !torch.list<int> -> !torch.tensor
%660 = torch.aten.clone %659, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%661 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%662 = torch.aten.view %660, %661 : !torch.tensor, !torch.list<int> -> !torch.tensor
%663 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%664 = torch.aten.view %662, %663 : !torch.tensor, !torch.list<int> -> !torch.tensor
%665 = torch.prim.GetAttr %arg0["_param_constant54"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%666 = torch.prim.GetAttr %arg0["_param_constant55"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%667 = torch.aten.addmm %665, %664, %666, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%668 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%669 = torch.aten.view %667, %668 : !torch.tensor, !torch.list<int> -> !torch.tensor
%670 = torch.aten.add.Tensor %669, %596, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%671 = torch.prim.GetAttr %arg0["_param_constant56"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%672 = torch.prim.GetAttr %arg0["_param_constant57"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%673 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_25, %result1_26, %result2_27 = torch.aten.native_layer_norm %670, %673, %671, %672, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%674 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%675 = torch.aten.view %result0_25, %674 : !torch.tensor, !torch.list<int> -> !torch.tensor
%676 = torch.prim.GetAttr %arg0["_param_constant58"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%677 = torch.prim.GetAttr %arg0["_param_constant59"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%678 = torch.aten.addmm %676, %675, %677, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%679 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%680 = torch.aten.view %678, %679 : !torch.tensor, !torch.list<int> -> !torch.tensor
%681 = torch.aten.mul.Scalar %680, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%682 = torch.aten.pow.Tensor_Scalar %680, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%683 = torch.aten.mul.Scalar %682, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%684 = torch.aten.add.Tensor %680, %683, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%685 = torch.aten.mul.Scalar %684, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%686 = torch.aten.tanh %685 : !torch.tensor -> !torch.tensor
%687 = torch.aten.add.Scalar %686, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%688 = torch.aten.mul.Tensor %681, %687 : !torch.tensor, !torch.tensor -> !torch.tensor
%689 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%690 = torch.aten.view %688, %689 : !torch.tensor, !torch.list<int> -> !torch.tensor
%691 = torch.prim.GetAttr %arg0["_param_constant60"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%692 = torch.prim.GetAttr %arg0["_param_constant61"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%693 = torch.aten.addmm %691, %690, %692, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%694 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%695 = torch.aten.view %693, %694 : !torch.tensor, !torch.list<int> -> !torch.tensor
%696 = torch.aten.add.Tensor %670, %695, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%697 = torch.prim.GetAttr %arg0["_param_constant62"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%698 = torch.prim.GetAttr %arg0["_param_constant63"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%699 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_28, %result1_29, %result2_30 = torch.aten.native_layer_norm %696, %699, %697, %698, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%700 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%701 = torch.aten.view %result0_28, %700 : !torch.tensor, !torch.list<int> -> !torch.tensor
%702 = torch.prim.GetAttr %arg0["_param_constant64"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%703 = torch.prim.GetAttr %arg0["_param_constant65"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%704 = torch.aten.addmm %702, %701, %703, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%705 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%706 = torch.aten.view %704, %705 : !torch.tensor, !torch.list<int> -> !torch.tensor
%707 = torch.aten.slice.Tensor %706, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%708 = torch.aten.slice.Tensor %706, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%709 = torch.aten.slice.Tensor %706, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%710 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%711 = torch.aten.view %707, %710 : !torch.tensor, !torch.list<int> -> !torch.tensor
%712 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%713 = torch.aten.permute %711, %712 : !torch.tensor, !torch.list<int> -> !torch.tensor
%714 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%715 = torch.aten.view %708, %714 : !torch.tensor, !torch.list<int> -> !torch.tensor
%716 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%717 = torch.aten.permute %715, %716 : !torch.tensor, !torch.list<int> -> !torch.tensor
%718 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%719 = torch.aten.view %709, %718 : !torch.tensor, !torch.list<int> -> !torch.tensor
%720 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%721 = torch.aten.permute %719, %720 : !torch.tensor, !torch.list<int> -> !torch.tensor
%722 = torch.aten.transpose.int %717, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%723 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%724 = torch.aten.expand %713, %723, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%725 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%726 = torch.aten.view %724, %725 : !torch.tensor, !torch.list<int> -> !torch.tensor
%727 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%728 = torch.aten.expand %722, %727, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%729 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%730 = torch.aten.view %728, %729 : !torch.tensor, !torch.list<int> -> !torch.tensor
%731 = torch.aten.bmm %726, %730 : !torch.tensor, !torch.tensor -> !torch.tensor
%732 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%733 = torch.aten._unsafe_view %731, %732 : !torch.tensor, !torch.list<int> -> !torch.tensor
%734 = torch.prim.GetAttr %arg0["_tensor_constant15"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%735 = torch.aten.lift_fresh_copy %734 : !torch.tensor -> !torch.tensor
%736 = torch.aten.div.Tensor %733, %735 : !torch.tensor, !torch.tensor -> !torch.tensor
%737 = torch.prim.GetAttr %arg0["_tensor_constant16"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%738 = torch.aten.slice.Tensor %737, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%739 = torch.aten.slice.Tensor %738, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%740 = torch.aten.slice.Tensor %739, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%741 = torch.aten.slice.Tensor %740, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%742 = torch.aten._to_copy %741, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%743 = torch.prim.GetAttr %arg0["_tensor_constant17"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%744 = torch.aten.lift_fresh_copy %743 : !torch.tensor -> !torch.tensor
%745 = torch.aten.where.self %742, %736, %744 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%746 = torch.aten._softmax %745, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%747 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%748 = torch.aten.expand %746, %747, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%749 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%750 = torch.aten.view %748, %749 : !torch.tensor, !torch.list<int> -> !torch.tensor
%751 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%752 = torch.aten.expand %721, %751, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%753 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%754 = torch.aten.view %752, %753 : !torch.tensor, !torch.list<int> -> !torch.tensor
%755 = torch.aten.bmm %750, %754 : !torch.tensor, !torch.tensor -> !torch.tensor
%756 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%757 = torch.aten._unsafe_view %755, %756 : !torch.tensor, !torch.list<int> -> !torch.tensor
%758 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%759 = torch.aten.permute %757, %758 : !torch.tensor, !torch.list<int> -> !torch.tensor
%760 = torch.aten.clone %759, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%761 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%762 = torch.aten.view %760, %761 : !torch.tensor, !torch.list<int> -> !torch.tensor
%763 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%764 = torch.aten.view %762, %763 : !torch.tensor, !torch.list<int> -> !torch.tensor
%765 = torch.prim.GetAttr %arg0["_param_constant66"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%766 = torch.prim.GetAttr %arg0["_param_constant67"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%767 = torch.aten.addmm %765, %764, %766, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%768 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%769 = torch.aten.view %767, %768 : !torch.tensor, !torch.list<int> -> !torch.tensor
%770 = torch.aten.add.Tensor %769, %696, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%771 = torch.prim.GetAttr %arg0["_param_constant68"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%772 = torch.prim.GetAttr %arg0["_param_constant69"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%773 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_31, %result1_32, %result2_33 = torch.aten.native_layer_norm %770, %773, %771, %772, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%774 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%775 = torch.aten.view %result0_31, %774 : !torch.tensor, !torch.list<int> -> !torch.tensor
%776 = torch.prim.GetAttr %arg0["_param_constant70"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%777 = torch.prim.GetAttr %arg0["_param_constant71"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%778 = torch.aten.addmm %776, %775, %777, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%779 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%780 = torch.aten.view %778, %779 : !torch.tensor, !torch.list<int> -> !torch.tensor
%781 = torch.aten.mul.Scalar %780, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%782 = torch.aten.pow.Tensor_Scalar %780, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%783 = torch.aten.mul.Scalar %782, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%784 = torch.aten.add.Tensor %780, %783, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%785 = torch.aten.mul.Scalar %784, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%786 = torch.aten.tanh %785 : !torch.tensor -> !torch.tensor
%787 = torch.aten.add.Scalar %786, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%788 = torch.aten.mul.Tensor %781, %787 : !torch.tensor, !torch.tensor -> !torch.tensor
%789 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%790 = torch.aten.view %788, %789 : !torch.tensor, !torch.list<int> -> !torch.tensor
%791 = torch.prim.GetAttr %arg0["_param_constant72"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%792 = torch.prim.GetAttr %arg0["_param_constant73"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%793 = torch.aten.addmm %791, %790, %792, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%794 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%795 = torch.aten.view %793, %794 : !torch.tensor, !torch.list<int> -> !torch.tensor
%796 = torch.aten.add.Tensor %770, %795, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%797 = torch.prim.GetAttr %arg0["_param_constant74"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%798 = torch.prim.GetAttr %arg0["_param_constant75"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%799 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_34, %result1_35, %result2_36 = torch.aten.native_layer_norm %796, %799, %797, %798, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%800 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%801 = torch.aten.view %result0_34, %800 : !torch.tensor, !torch.list<int> -> !torch.tensor
%802 = torch.prim.GetAttr %arg0["_param_constant76"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%803 = torch.prim.GetAttr %arg0["_param_constant77"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%804 = torch.aten.addmm %802, %801, %803, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%805 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%806 = torch.aten.view %804, %805 : !torch.tensor, !torch.list<int> -> !torch.tensor
%807 = torch.aten.slice.Tensor %806, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%808 = torch.aten.slice.Tensor %806, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%809 = torch.aten.slice.Tensor %806, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%810 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%811 = torch.aten.view %807, %810 : !torch.tensor, !torch.list<int> -> !torch.tensor
%812 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%813 = torch.aten.permute %811, %812 : !torch.tensor, !torch.list<int> -> !torch.tensor
%814 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%815 = torch.aten.view %808, %814 : !torch.tensor, !torch.list<int> -> !torch.tensor
%816 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%817 = torch.aten.permute %815, %816 : !torch.tensor, !torch.list<int> -> !torch.tensor
%818 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%819 = torch.aten.view %809, %818 : !torch.tensor, !torch.list<int> -> !torch.tensor
%820 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%821 = torch.aten.permute %819, %820 : !torch.tensor, !torch.list<int> -> !torch.tensor
%822 = torch.aten.transpose.int %817, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%823 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%824 = torch.aten.expand %813, %823, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%825 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%826 = torch.aten.view %824, %825 : !torch.tensor, !torch.list<int> -> !torch.tensor
%827 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%828 = torch.aten.expand %822, %827, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%829 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%830 = torch.aten.view %828, %829 : !torch.tensor, !torch.list<int> -> !torch.tensor
%831 = torch.aten.bmm %826, %830 : !torch.tensor, !torch.tensor -> !torch.tensor
%832 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%833 = torch.aten._unsafe_view %831, %832 : !torch.tensor, !torch.list<int> -> !torch.tensor
%834 = torch.prim.GetAttr %arg0["_tensor_constant18"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%835 = torch.aten.lift_fresh_copy %834 : !torch.tensor -> !torch.tensor
%836 = torch.aten.div.Tensor %833, %835 : !torch.tensor, !torch.tensor -> !torch.tensor
%837 = torch.prim.GetAttr %arg0["_tensor_constant19"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%838 = torch.aten.slice.Tensor %837, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%839 = torch.aten.slice.Tensor %838, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%840 = torch.aten.slice.Tensor %839, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%841 = torch.aten.slice.Tensor %840, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%842 = torch.aten._to_copy %841, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%843 = torch.prim.GetAttr %arg0["_tensor_constant20"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%844 = torch.aten.lift_fresh_copy %843 : !torch.tensor -> !torch.tensor
%845 = torch.aten.where.self %842, %836, %844 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%846 = torch.aten._softmax %845, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%847 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%848 = torch.aten.expand %846, %847, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%849 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%850 = torch.aten.view %848, %849 : !torch.tensor, !torch.list<int> -> !torch.tensor
%851 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%852 = torch.aten.expand %821, %851, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%853 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%854 = torch.aten.view %852, %853 : !torch.tensor, !torch.list<int> -> !torch.tensor
%855 = torch.aten.bmm %850, %854 : !torch.tensor, !torch.tensor -> !torch.tensor
%856 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%857 = torch.aten._unsafe_view %855, %856 : !torch.tensor, !torch.list<int> -> !torch.tensor
%858 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%859 = torch.aten.permute %857, %858 : !torch.tensor, !torch.list<int> -> !torch.tensor
%860 = torch.aten.clone %859, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%861 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%862 = torch.aten.view %860, %861 : !torch.tensor, !torch.list<int> -> !torch.tensor
%863 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%864 = torch.aten.view %862, %863 : !torch.tensor, !torch.list<int> -> !torch.tensor
%865 = torch.prim.GetAttr %arg0["_param_constant78"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%866 = torch.prim.GetAttr %arg0["_param_constant79"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%867 = torch.aten.addmm %865, %864, %866, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%868 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%869 = torch.aten.view %867, %868 : !torch.tensor, !torch.list<int> -> !torch.tensor
%870 = torch.aten.add.Tensor %869, %796, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%871 = torch.prim.GetAttr %arg0["_param_constant80"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%872 = torch.prim.GetAttr %arg0["_param_constant81"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%873 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_37, %result1_38, %result2_39 = torch.aten.native_layer_norm %870, %873, %871, %872, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%874 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%875 = torch.aten.view %result0_37, %874 : !torch.tensor, !torch.list<int> -> !torch.tensor
%876 = torch.prim.GetAttr %arg0["_param_constant82"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%877 = torch.prim.GetAttr %arg0["_param_constant83"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%878 = torch.aten.addmm %876, %875, %877, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%879 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%880 = torch.aten.view %878, %879 : !torch.tensor, !torch.list<int> -> !torch.tensor
%881 = torch.aten.mul.Scalar %880, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%882 = torch.aten.pow.Tensor_Scalar %880, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%883 = torch.aten.mul.Scalar %882, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%884 = torch.aten.add.Tensor %880, %883, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%885 = torch.aten.mul.Scalar %884, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%886 = torch.aten.tanh %885 : !torch.tensor -> !torch.tensor
%887 = torch.aten.add.Scalar %886, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%888 = torch.aten.mul.Tensor %881, %887 : !torch.tensor, !torch.tensor -> !torch.tensor
%889 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%890 = torch.aten.view %888, %889 : !torch.tensor, !torch.list<int> -> !torch.tensor
%891 = torch.prim.GetAttr %arg0["_param_constant84"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%892 = torch.prim.GetAttr %arg0["_param_constant85"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%893 = torch.aten.addmm %891, %890, %892, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%894 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%895 = torch.aten.view %893, %894 : !torch.tensor, !torch.list<int> -> !torch.tensor
%896 = torch.aten.add.Tensor %870, %895, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%897 = torch.prim.GetAttr %arg0["_param_constant86"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%898 = torch.prim.GetAttr %arg0["_param_constant87"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%899 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_40, %result1_41, %result2_42 = torch.aten.native_layer_norm %896, %899, %897, %898, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%900 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%901 = torch.aten.view %result0_40, %900 : !torch.tensor, !torch.list<int> -> !torch.tensor
%902 = torch.prim.GetAttr %arg0["_param_constant88"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%903 = torch.prim.GetAttr %arg0["_param_constant89"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%904 = torch.aten.addmm %902, %901, %903, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%905 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%906 = torch.aten.view %904, %905 : !torch.tensor, !torch.list<int> -> !torch.tensor
%907 = torch.aten.slice.Tensor %906, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%908 = torch.aten.slice.Tensor %906, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%909 = torch.aten.slice.Tensor %906, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%910 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%911 = torch.aten.view %907, %910 : !torch.tensor, !torch.list<int> -> !torch.tensor
%912 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%913 = torch.aten.permute %911, %912 : !torch.tensor, !torch.list<int> -> !torch.tensor
%914 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%915 = torch.aten.view %908, %914 : !torch.tensor, !torch.list<int> -> !torch.tensor
%916 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%917 = torch.aten.permute %915, %916 : !torch.tensor, !torch.list<int> -> !torch.tensor
%918 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%919 = torch.aten.view %909, %918 : !torch.tensor, !torch.list<int> -> !torch.tensor
%920 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%921 = torch.aten.permute %919, %920 : !torch.tensor, !torch.list<int> -> !torch.tensor
%922 = torch.aten.transpose.int %917, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%923 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%924 = torch.aten.expand %913, %923, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%925 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%926 = torch.aten.view %924, %925 : !torch.tensor, !torch.list<int> -> !torch.tensor
%927 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%928 = torch.aten.expand %922, %927, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%929 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%930 = torch.aten.view %928, %929 : !torch.tensor, !torch.list<int> -> !torch.tensor
%931 = torch.aten.bmm %926, %930 : !torch.tensor, !torch.tensor -> !torch.tensor
%932 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%933 = torch.aten._unsafe_view %931, %932 : !torch.tensor, !torch.list<int> -> !torch.tensor
%934 = torch.prim.GetAttr %arg0["_tensor_constant21"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%935 = torch.aten.lift_fresh_copy %934 : !torch.tensor -> !torch.tensor
%936 = torch.aten.div.Tensor %933, %935 : !torch.tensor, !torch.tensor -> !torch.tensor
%937 = torch.prim.GetAttr %arg0["_tensor_constant22"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%938 = torch.aten.slice.Tensor %937, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%939 = torch.aten.slice.Tensor %938, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%940 = torch.aten.slice.Tensor %939, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%941 = torch.aten.slice.Tensor %940, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%942 = torch.aten._to_copy %941, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%943 = torch.prim.GetAttr %arg0["_tensor_constant23"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%944 = torch.aten.lift_fresh_copy %943 : !torch.tensor -> !torch.tensor
%945 = torch.aten.where.self %942, %936, %944 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%946 = torch.aten._softmax %945, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%947 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%948 = torch.aten.expand %946, %947, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%949 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%950 = torch.aten.view %948, %949 : !torch.tensor, !torch.list<int> -> !torch.tensor
%951 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%952 = torch.aten.expand %921, %951, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%953 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%954 = torch.aten.view %952, %953 : !torch.tensor, !torch.list<int> -> !torch.tensor
%955 = torch.aten.bmm %950, %954 : !torch.tensor, !torch.tensor -> !torch.tensor
%956 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%957 = torch.aten._unsafe_view %955, %956 : !torch.tensor, !torch.list<int> -> !torch.tensor
%958 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%959 = torch.aten.permute %957, %958 : !torch.tensor, !torch.list<int> -> !torch.tensor
%960 = torch.aten.clone %959, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%961 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%962 = torch.aten.view %960, %961 : !torch.tensor, !torch.list<int> -> !torch.tensor
%963 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%964 = torch.aten.view %962, %963 : !torch.tensor, !torch.list<int> -> !torch.tensor
%965 = torch.prim.GetAttr %arg0["_param_constant90"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%966 = torch.prim.GetAttr %arg0["_param_constant91"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%967 = torch.aten.addmm %965, %964, %966, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%968 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%969 = torch.aten.view %967, %968 : !torch.tensor, !torch.list<int> -> !torch.tensor
%970 = torch.aten.add.Tensor %969, %896, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%971 = torch.prim.GetAttr %arg0["_param_constant92"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%972 = torch.prim.GetAttr %arg0["_param_constant93"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%973 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_43, %result1_44, %result2_45 = torch.aten.native_layer_norm %970, %973, %971, %972, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%974 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%975 = torch.aten.view %result0_43, %974 : !torch.tensor, !torch.list<int> -> !torch.tensor
%976 = torch.prim.GetAttr %arg0["_param_constant94"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%977 = torch.prim.GetAttr %arg0["_param_constant95"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%978 = torch.aten.addmm %976, %975, %977, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%979 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%980 = torch.aten.view %978, %979 : !torch.tensor, !torch.list<int> -> !torch.tensor
%981 = torch.aten.mul.Scalar %980, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%982 = torch.aten.pow.Tensor_Scalar %980, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%983 = torch.aten.mul.Scalar %982, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%984 = torch.aten.add.Tensor %980, %983, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%985 = torch.aten.mul.Scalar %984, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%986 = torch.aten.tanh %985 : !torch.tensor -> !torch.tensor
%987 = torch.aten.add.Scalar %986, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%988 = torch.aten.mul.Tensor %981, %987 : !torch.tensor, !torch.tensor -> !torch.tensor
%989 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%990 = torch.aten.view %988, %989 : !torch.tensor, !torch.list<int> -> !torch.tensor
%991 = torch.prim.GetAttr %arg0["_param_constant96"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%992 = torch.prim.GetAttr %arg0["_param_constant97"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%993 = torch.aten.addmm %991, %990, %992, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%994 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%995 = torch.aten.view %993, %994 : !torch.tensor, !torch.list<int> -> !torch.tensor
%996 = torch.aten.add.Tensor %970, %995, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%997 = torch.prim.GetAttr %arg0["_param_constant98"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%998 = torch.prim.GetAttr %arg0["_param_constant99"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%999 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_46, %result1_47, %result2_48 = torch.aten.native_layer_norm %996, %999, %997, %998, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1000 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1001 = torch.aten.view %result0_46, %1000 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1002 = torch.prim.GetAttr %arg0["_param_constant100"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1003 = torch.prim.GetAttr %arg0["_param_constant101"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1004 = torch.aten.addmm %1002, %1001, %1003, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1005 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1006 = torch.aten.view %1004, %1005 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1007 = torch.aten.slice.Tensor %1006, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1008 = torch.aten.slice.Tensor %1006, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1009 = torch.aten.slice.Tensor %1006, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1010 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1011 = torch.aten.view %1007, %1010 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1012 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1013 = torch.aten.permute %1011, %1012 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1014 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1015 = torch.aten.view %1008, %1014 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1016 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1017 = torch.aten.permute %1015, %1016 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1018 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1019 = torch.aten.view %1009, %1018 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1020 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1021 = torch.aten.permute %1019, %1020 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1022 = torch.aten.transpose.int %1017, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1023 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1024 = torch.aten.expand %1013, %1023, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1025 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1026 = torch.aten.view %1024, %1025 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1027 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1028 = torch.aten.expand %1022, %1027, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1029 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1030 = torch.aten.view %1028, %1029 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1031 = torch.aten.bmm %1026, %1030 : !torch.tensor, !torch.tensor -> !torch.tensor
%1032 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1033 = torch.aten._unsafe_view %1031, %1032 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1034 = torch.prim.GetAttr %arg0["_tensor_constant24"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1035 = torch.aten.lift_fresh_copy %1034 : !torch.tensor -> !torch.tensor
%1036 = torch.aten.div.Tensor %1033, %1035 : !torch.tensor, !torch.tensor -> !torch.tensor
%1037 = torch.prim.GetAttr %arg0["_tensor_constant25"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1038 = torch.aten.slice.Tensor %1037, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1039 = torch.aten.slice.Tensor %1038, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1040 = torch.aten.slice.Tensor %1039, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1041 = torch.aten.slice.Tensor %1040, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1042 = torch.aten._to_copy %1041, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%1043 = torch.prim.GetAttr %arg0["_tensor_constant26"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1044 = torch.aten.lift_fresh_copy %1043 : !torch.tensor -> !torch.tensor
%1045 = torch.aten.where.self %1042, %1036, %1044 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%1046 = torch.aten._softmax %1045, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%1047 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1048 = torch.aten.expand %1046, %1047, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1049 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1050 = torch.aten.view %1048, %1049 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1051 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1052 = torch.aten.expand %1021, %1051, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1053 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1054 = torch.aten.view %1052, %1053 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1055 = torch.aten.bmm %1050, %1054 : !torch.tensor, !torch.tensor -> !torch.tensor
%1056 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1057 = torch.aten._unsafe_view %1055, %1056 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1058 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1059 = torch.aten.permute %1057, %1058 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1060 = torch.aten.clone %1059, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%1061 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1062 = torch.aten.view %1060, %1061 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1063 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1064 = torch.aten.view %1062, %1063 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1065 = torch.prim.GetAttr %arg0["_param_constant102"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1066 = torch.prim.GetAttr %arg0["_param_constant103"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1067 = torch.aten.addmm %1065, %1064, %1066, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1068 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1069 = torch.aten.view %1067, %1068 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1070 = torch.aten.add.Tensor %1069, %996, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1071 = torch.prim.GetAttr %arg0["_param_constant104"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1072 = torch.prim.GetAttr %arg0["_param_constant105"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1073 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_49, %result1_50, %result2_51 = torch.aten.native_layer_norm %1070, %1073, %1071, %1072, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1074 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1075 = torch.aten.view %result0_49, %1074 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1076 = torch.prim.GetAttr %arg0["_param_constant106"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1077 = torch.prim.GetAttr %arg0["_param_constant107"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1078 = torch.aten.addmm %1076, %1075, %1077, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1079 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1080 = torch.aten.view %1078, %1079 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1081 = torch.aten.mul.Scalar %1080, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1082 = torch.aten.pow.Tensor_Scalar %1080, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%1083 = torch.aten.mul.Scalar %1082, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%1084 = torch.aten.add.Tensor %1080, %1083, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1085 = torch.aten.mul.Scalar %1084, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1086 = torch.aten.tanh %1085 : !torch.tensor -> !torch.tensor
%1087 = torch.aten.add.Scalar %1086, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%1088 = torch.aten.mul.Tensor %1081, %1087 : !torch.tensor, !torch.tensor -> !torch.tensor
%1089 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%1090 = torch.aten.view %1088, %1089 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1091 = torch.prim.GetAttr %arg0["_param_constant108"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1092 = torch.prim.GetAttr %arg0["_param_constant109"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1093 = torch.aten.addmm %1091, %1090, %1092, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1094 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1095 = torch.aten.view %1093, %1094 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1096 = torch.aten.add.Tensor %1070, %1095, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1097 = torch.prim.GetAttr %arg0["_param_constant110"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1098 = torch.prim.GetAttr %arg0["_param_constant111"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1099 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_52, %result1_53, %result2_54 = torch.aten.native_layer_norm %1096, %1099, %1097, %1098, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1100 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1101 = torch.aten.view %result0_52, %1100 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1102 = torch.prim.GetAttr %arg0["_param_constant112"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1103 = torch.prim.GetAttr %arg0["_param_constant113"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1104 = torch.aten.addmm %1102, %1101, %1103, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1105 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1106 = torch.aten.view %1104, %1105 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1107 = torch.aten.slice.Tensor %1106, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1108 = torch.aten.slice.Tensor %1106, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1109 = torch.aten.slice.Tensor %1106, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1110 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1111 = torch.aten.view %1107, %1110 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1112 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1113 = torch.aten.permute %1111, %1112 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1114 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1115 = torch.aten.view %1108, %1114 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1116 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1117 = torch.aten.permute %1115, %1116 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1118 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1119 = torch.aten.view %1109, %1118 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1120 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1121 = torch.aten.permute %1119, %1120 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1122 = torch.aten.transpose.int %1117, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1123 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1124 = torch.aten.expand %1113, %1123, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1125 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1126 = torch.aten.view %1124, %1125 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1127 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1128 = torch.aten.expand %1122, %1127, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1129 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1130 = torch.aten.view %1128, %1129 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1131 = torch.aten.bmm %1126, %1130 : !torch.tensor, !torch.tensor -> !torch.tensor
%1132 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1133 = torch.aten._unsafe_view %1131, %1132 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1134 = torch.prim.GetAttr %arg0["_tensor_constant27"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1135 = torch.aten.lift_fresh_copy %1134 : !torch.tensor -> !torch.tensor
%1136 = torch.aten.div.Tensor %1133, %1135 : !torch.tensor, !torch.tensor -> !torch.tensor
%1137 = torch.prim.GetAttr %arg0["_tensor_constant28"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1138 = torch.aten.slice.Tensor %1137, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1139 = torch.aten.slice.Tensor %1138, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1140 = torch.aten.slice.Tensor %1139, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1141 = torch.aten.slice.Tensor %1140, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1142 = torch.aten._to_copy %1141, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%1143 = torch.prim.GetAttr %arg0["_tensor_constant29"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1144 = torch.aten.lift_fresh_copy %1143 : !torch.tensor -> !torch.tensor
%1145 = torch.aten.where.self %1142, %1136, %1144 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%1146 = torch.aten._softmax %1145, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%1147 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1148 = torch.aten.expand %1146, %1147, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1149 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1150 = torch.aten.view %1148, %1149 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1151 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1152 = torch.aten.expand %1121, %1151, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1153 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1154 = torch.aten.view %1152, %1153 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1155 = torch.aten.bmm %1150, %1154 : !torch.tensor, !torch.tensor -> !torch.tensor
%1156 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1157 = torch.aten._unsafe_view %1155, %1156 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1158 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1159 = torch.aten.permute %1157, %1158 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1160 = torch.aten.clone %1159, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%1161 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1162 = torch.aten.view %1160, %1161 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1163 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1164 = torch.aten.view %1162, %1163 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1165 = torch.prim.GetAttr %arg0["_param_constant114"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1166 = torch.prim.GetAttr %arg0["_param_constant115"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1167 = torch.aten.addmm %1165, %1164, %1166, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1168 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1169 = torch.aten.view %1167, %1168 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1170 = torch.aten.add.Tensor %1169, %1096, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1171 = torch.prim.GetAttr %arg0["_param_constant116"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1172 = torch.prim.GetAttr %arg0["_param_constant117"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1173 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_55, %result1_56, %result2_57 = torch.aten.native_layer_norm %1170, %1173, %1171, %1172, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1174 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1175 = torch.aten.view %result0_55, %1174 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1176 = torch.prim.GetAttr %arg0["_param_constant118"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1177 = torch.prim.GetAttr %arg0["_param_constant119"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1178 = torch.aten.addmm %1176, %1175, %1177, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1179 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1180 = torch.aten.view %1178, %1179 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1181 = torch.aten.mul.Scalar %1180, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1182 = torch.aten.pow.Tensor_Scalar %1180, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%1183 = torch.aten.mul.Scalar %1182, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%1184 = torch.aten.add.Tensor %1180, %1183, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1185 = torch.aten.mul.Scalar %1184, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1186 = torch.aten.tanh %1185 : !torch.tensor -> !torch.tensor
%1187 = torch.aten.add.Scalar %1186, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%1188 = torch.aten.mul.Tensor %1181, %1187 : !torch.tensor, !torch.tensor -> !torch.tensor
%1189 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%1190 = torch.aten.view %1188, %1189 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1191 = torch.prim.GetAttr %arg0["_param_constant120"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1192 = torch.prim.GetAttr %arg0["_param_constant121"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1193 = torch.aten.addmm %1191, %1190, %1192, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1194 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1195 = torch.aten.view %1193, %1194 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1196 = torch.aten.add.Tensor %1170, %1195, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1197 = torch.prim.GetAttr %arg0["_param_constant122"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1198 = torch.prim.GetAttr %arg0["_param_constant123"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1199 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_58, %result1_59, %result2_60 = torch.aten.native_layer_norm %1196, %1199, %1197, %1198, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1200 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1201 = torch.aten.view %result0_58, %1200 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1202 = torch.prim.GetAttr %arg0["_param_constant124"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1203 = torch.prim.GetAttr %arg0["_param_constant125"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1204 = torch.aten.addmm %1202, %1201, %1203, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1205 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1206 = torch.aten.view %1204, %1205 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1207 = torch.aten.slice.Tensor %1206, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1208 = torch.aten.slice.Tensor %1206, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1209 = torch.aten.slice.Tensor %1206, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1210 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1211 = torch.aten.view %1207, %1210 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1212 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1213 = torch.aten.permute %1211, %1212 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1214 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1215 = torch.aten.view %1208, %1214 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1216 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1217 = torch.aten.permute %1215, %1216 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1218 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1219 = torch.aten.view %1209, %1218 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1220 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1221 = torch.aten.permute %1219, %1220 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1222 = torch.aten.transpose.int %1217, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1223 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1224 = torch.aten.expand %1213, %1223, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1225 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1226 = torch.aten.view %1224, %1225 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1227 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1228 = torch.aten.expand %1222, %1227, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1229 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1230 = torch.aten.view %1228, %1229 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1231 = torch.aten.bmm %1226, %1230 : !torch.tensor, !torch.tensor -> !torch.tensor
%1232 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1233 = torch.aten._unsafe_view %1231, %1232 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1234 = torch.prim.GetAttr %arg0["_tensor_constant30"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1235 = torch.aten.lift_fresh_copy %1234 : !torch.tensor -> !torch.tensor
%1236 = torch.aten.div.Tensor %1233, %1235 : !torch.tensor, !torch.tensor -> !torch.tensor
%1237 = torch.prim.GetAttr %arg0["_tensor_constant31"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1238 = torch.aten.slice.Tensor %1237, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1239 = torch.aten.slice.Tensor %1238, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1240 = torch.aten.slice.Tensor %1239, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1241 = torch.aten.slice.Tensor %1240, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1242 = torch.aten._to_copy %1241, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%1243 = torch.prim.GetAttr %arg0["_tensor_constant32"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1244 = torch.aten.lift_fresh_copy %1243 : !torch.tensor -> !torch.tensor
%1245 = torch.aten.where.self %1242, %1236, %1244 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%1246 = torch.aten._softmax %1245, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%1247 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1248 = torch.aten.expand %1246, %1247, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1249 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1250 = torch.aten.view %1248, %1249 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1251 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1252 = torch.aten.expand %1221, %1251, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1253 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1254 = torch.aten.view %1252, %1253 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1255 = torch.aten.bmm %1250, %1254 : !torch.tensor, !torch.tensor -> !torch.tensor
%1256 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1257 = torch.aten._unsafe_view %1255, %1256 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1258 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1259 = torch.aten.permute %1257, %1258 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1260 = torch.aten.clone %1259, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%1261 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1262 = torch.aten.view %1260, %1261 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1263 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1264 = torch.aten.view %1262, %1263 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1265 = torch.prim.GetAttr %arg0["_param_constant126"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1266 = torch.prim.GetAttr %arg0["_param_constant127"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1267 = torch.aten.addmm %1265, %1264, %1266, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1268 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1269 = torch.aten.view %1267, %1268 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1270 = torch.aten.add.Tensor %1269, %1196, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1271 = torch.prim.GetAttr %arg0["_param_constant128"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1272 = torch.prim.GetAttr %arg0["_param_constant129"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1273 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_61, %result1_62, %result2_63 = torch.aten.native_layer_norm %1270, %1273, %1271, %1272, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1274 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1275 = torch.aten.view %result0_61, %1274 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1276 = torch.prim.GetAttr %arg0["_param_constant130"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1277 = torch.prim.GetAttr %arg0["_param_constant131"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1278 = torch.aten.addmm %1276, %1275, %1277, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1279 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1280 = torch.aten.view %1278, %1279 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1281 = torch.aten.mul.Scalar %1280, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1282 = torch.aten.pow.Tensor_Scalar %1280, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%1283 = torch.aten.mul.Scalar %1282, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%1284 = torch.aten.add.Tensor %1280, %1283, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1285 = torch.aten.mul.Scalar %1284, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1286 = torch.aten.tanh %1285 : !torch.tensor -> !torch.tensor
%1287 = torch.aten.add.Scalar %1286, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%1288 = torch.aten.mul.Tensor %1281, %1287 : !torch.tensor, !torch.tensor -> !torch.tensor
%1289 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%1290 = torch.aten.view %1288, %1289 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1291 = torch.prim.GetAttr %arg0["_param_constant132"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1292 = torch.prim.GetAttr %arg0["_param_constant133"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1293 = torch.aten.addmm %1291, %1290, %1292, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1294 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1295 = torch.aten.view %1293, %1294 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1296 = torch.aten.add.Tensor %1270, %1295, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1297 = torch.prim.GetAttr %arg0["_param_constant134"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1298 = torch.prim.GetAttr %arg0["_param_constant135"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1299 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_64, %result1_65, %result2_66 = torch.aten.native_layer_norm %1296, %1299, %1297, %1298, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1300 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1301 = torch.aten.view %result0_64, %1300 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1302 = torch.prim.GetAttr %arg0["_param_constant136"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1303 = torch.prim.GetAttr %arg0["_param_constant137"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1304 = torch.aten.addmm %1302, %1301, %1303, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1305 = torch.prim.ListConstruct %int1, %int5, %int2304 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1306 = torch.aten.view %1304, %1305 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1307 = torch.aten.slice.Tensor %1306, %int2, %int0, %int768, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1308 = torch.aten.slice.Tensor %1306, %int2, %int768, %int1536, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1309 = torch.aten.slice.Tensor %1306, %int2, %int1536, %int2304, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1310 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1311 = torch.aten.view %1307, %1310 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1312 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1313 = torch.aten.permute %1311, %1312 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1314 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1315 = torch.aten.view %1308, %1314 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1316 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1317 = torch.aten.permute %1315, %1316 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1318 = torch.prim.ListConstruct %int1, %int5, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1319 = torch.aten.view %1309, %1318 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1320 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1321 = torch.aten.permute %1319, %1320 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1322 = torch.aten.transpose.int %1317, %int-1, %int-2 : !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1323 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1324 = torch.aten.expand %1313, %1323, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1325 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1326 = torch.aten.view %1324, %1325 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1327 = torch.prim.ListConstruct %int1, %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1328 = torch.aten.expand %1322, %1327, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1329 = torch.prim.ListConstruct %int12, %int64, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1330 = torch.aten.view %1328, %1329 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1331 = torch.aten.bmm %1326, %1330 : !torch.tensor, !torch.tensor -> !torch.tensor
%1332 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1333 = torch.aten._unsafe_view %1331, %1332 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1334 = torch.prim.GetAttr %arg0["_tensor_constant33"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1335 = torch.aten.lift_fresh_copy %1334 : !torch.tensor -> !torch.tensor
%1336 = torch.aten.div.Tensor %1333, %1335 : !torch.tensor, !torch.tensor -> !torch.tensor
%1337 = torch.prim.GetAttr %arg0["_tensor_constant34"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1338 = torch.aten.slice.Tensor %1337, %int0, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1339 = torch.aten.slice.Tensor %1338, %int1, %int0, %int9223372036854775807, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1340 = torch.aten.slice.Tensor %1339, %int2, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1341 = torch.aten.slice.Tensor %1340, %int3, %int0, %int5, %int1 : !torch.tensor, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.tensor
%1342 = torch.aten._to_copy %1341, %int11, %none_0, %none_0, %none_0, %false, %none_0 : !torch.tensor, !torch.int, !torch.none, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.tensor
%1343 = torch.prim.GetAttr %arg0["_tensor_constant35"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1344 = torch.aten.lift_fresh_copy %1343 : !torch.tensor -> !torch.tensor
%1345 = torch.aten.where.self %1342, %1336, %1344 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tensor
%1346 = torch.aten._softmax %1345, %int-1, %false : !torch.tensor, !torch.int, !torch.bool -> !torch.tensor
%1347 = torch.prim.ListConstruct %int1, %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1348 = torch.aten.expand %1346, %1347, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1349 = torch.prim.ListConstruct %int12, %int5, %int5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1350 = torch.aten.view %1348, %1349 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1351 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1352 = torch.aten.expand %1321, %1351, %false : !torch.tensor, !torch.list<int>, !torch.bool -> !torch.tensor
%1353 = torch.prim.ListConstruct %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1354 = torch.aten.view %1352, %1353 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1355 = torch.aten.bmm %1350, %1354 : !torch.tensor, !torch.tensor -> !torch.tensor
%1356 = torch.prim.ListConstruct %int1, %int12, %int5, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1357 = torch.aten._unsafe_view %1355, %1356 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1358 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1359 = torch.aten.permute %1357, %1358 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1360 = torch.aten.clone %1359, %int0 : !torch.tensor, !torch.int -> !torch.tensor
%1361 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1362 = torch.aten.view %1360, %1361 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1363 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1364 = torch.aten.view %1362, %1363 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1365 = torch.prim.GetAttr %arg0["_param_constant138"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1366 = torch.prim.GetAttr %arg0["_param_constant139"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1367 = torch.aten.addmm %1365, %1364, %1366, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1368 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1369 = torch.aten.view %1367, %1368 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1370 = torch.aten.add.Tensor %1369, %1296, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1371 = torch.prim.GetAttr %arg0["_param_constant140"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1372 = torch.prim.GetAttr %arg0["_param_constant141"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1373 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_67, %result1_68, %result2_69 = torch.aten.native_layer_norm %1370, %1373, %1371, %1372, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1374 = torch.prim.ListConstruct %int-1, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1375 = torch.aten.view %result0_67, %1374 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1376 = torch.prim.GetAttr %arg0["_param_constant142"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1377 = torch.prim.GetAttr %arg0["_param_constant143"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1378 = torch.aten.addmm %1376, %1375, %1377, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1379 = torch.prim.ListConstruct %int1, %int5, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1380 = torch.aten.view %1378, %1379 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1381 = torch.aten.mul.Scalar %1380, %float5.000000e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1382 = torch.aten.pow.Tensor_Scalar %1380, %float3.000000e00 : !torch.tensor, !torch.float -> !torch.tensor
%1383 = torch.aten.mul.Scalar %1382, %float4.471500e-02 : !torch.tensor, !torch.float -> !torch.tensor
%1384 = torch.aten.add.Tensor %1380, %1383, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1385 = torch.aten.mul.Scalar %1384, %float7.978850e-01 : !torch.tensor, !torch.float -> !torch.tensor
%1386 = torch.aten.tanh %1385 : !torch.tensor -> !torch.tensor
%1387 = torch.aten.add.Scalar %1386, %float1.000000e00, %int1 : !torch.tensor, !torch.float, !torch.int -> !torch.tensor
%1388 = torch.aten.mul.Tensor %1381, %1387 : !torch.tensor, !torch.tensor -> !torch.tensor
%1389 = torch.prim.ListConstruct %int-1, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
%1390 = torch.aten.view %1388, %1389 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1391 = torch.prim.GetAttr %arg0["_param_constant144"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1392 = torch.prim.GetAttr %arg0["_param_constant145"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1393 = torch.aten.addmm %1391, %1390, %1392, %int1, %int1 : !torch.tensor, !torch.tensor, !torch.tensor, !torch.int, !torch.int -> !torch.tensor
%1394 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1395 = torch.aten.view %1393, %1394 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1396 = torch.aten.add.Tensor %1370, %1395, %int1 : !torch.tensor, !torch.tensor, !torch.int -> !torch.tensor
%1397 = torch.prim.GetAttr %arg0["_param_constant146"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1398 = torch.prim.GetAttr %arg0["_param_constant147"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1399 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
%result0_70, %result1_71, %result2_72 = torch.aten.native_layer_norm %1396, %1399, %1397, %1398, %float1.000000e-05 : !torch.tensor, !torch.list<int>, !torch.tensor, !torch.tensor, !torch.float -> !torch.tensor, !torch.tensor, !torch.tensor
%1400 = torch.prim.ListConstruct %int1, %int5, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1401 = torch.aten.view %result0_70, %1400 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1402 = torch.prim.GetAttr %arg0["_param_constant148"] : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda"> -> !torch.tensor
%1403 = torch.aten.t %1402 : !torch.tensor -> !torch.tensor
%1404 = torch.prim.ListConstruct %int5, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
%1405 = torch.aten.view %1401, %1404 : !torch.tensor, !torch.list<int> -> !torch.tensor
%1406 = torch.aten.mm %1405, %1403 : !torch.tensor, !torch.tensor -> !torch.tensor
%1407 = torch.prim.ListConstruct %int1, %int5, %int50257 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1408 = torch.aten._unsafe_view %1406, %1407 : !torch.tensor, !torch.list<int> -> !torch.tensor
return %1408 : !torch.tensor
}
// Class-type declaration for the imported FX graph module `_lambda`.
// Emitted by the torch-mlir module importer: every parameter and captured
// constant of the traced Python module becomes a private !torch.tensor
// attribute here, read in `forward` via torch.prim.GetAttr. The parameter
// count and the 768/3072/2304 shapes used in `forward` match a 12-layer
// GPT-2 (hidden=768, heads=12, vocab=50257) -- NOTE(review): model identity
// inferred from shapes, confirm against the exporting script.
torch.class_type @__torch__.torch.fx.graph_module._lambda {
// _param_constant0.._param_constant148: learned parameters lifted to module
// attributes by FX tracing (embeddings, per-block layer norms, attention and
// MLP projections, final LM-head weight). Declared untyped (!torch.tensor);
// concrete shapes come from the module-level torch.tensor.literal values
// that populate these slots.
torch.attr private "_param_constant0" : !torch.tensor
torch.attr private "_param_constant1" : !torch.tensor
torch.attr private "_param_constant2" : !torch.tensor
torch.attr private "_param_constant3" : !torch.tensor
torch.attr private "_param_constant4" : !torch.tensor
torch.attr private "_param_constant5" : !torch.tensor
torch.attr private "_param_constant6" : !torch.tensor
torch.attr private "_param_constant7" : !torch.tensor
torch.attr private "_param_constant8" : !torch.tensor
torch.attr private "_param_constant9" : !torch.tensor
torch.attr private "_param_constant10" : !torch.tensor
torch.attr private "_param_constant11" : !torch.tensor
torch.attr private "_param_constant12" : !torch.tensor
torch.attr private "_param_constant13" : !torch.tensor
torch.attr private "_param_constant14" : !torch.tensor
torch.attr private "_param_constant15" : !torch.tensor
torch.attr private "_param_constant16" : !torch.tensor
torch.attr private "_param_constant17" : !torch.tensor
torch.attr private "_param_constant18" : !torch.tensor
torch.attr private "_param_constant19" : !torch.tensor
torch.attr private "_param_constant20" : !torch.tensor
torch.attr private "_param_constant21" : !torch.tensor
torch.attr private "_param_constant22" : !torch.tensor
torch.attr private "_param_constant23" : !torch.tensor
torch.attr private "_param_constant24" : !torch.tensor
torch.attr private "_param_constant25" : !torch.tensor
torch.attr private "_param_constant26" : !torch.tensor
torch.attr private "_param_constant27" : !torch.tensor
torch.attr private "_param_constant28" : !torch.tensor
torch.attr private "_param_constant29" : !torch.tensor
torch.attr private "_param_constant30" : !torch.tensor
torch.attr private "_param_constant31" : !torch.tensor
torch.attr private "_param_constant32" : !torch.tensor
torch.attr private "_param_constant33" : !torch.tensor
torch.attr private "_param_constant34" : !torch.tensor
torch.attr private "_param_constant35" : !torch.tensor
torch.attr private "_param_constant36" : !torch.tensor
torch.attr private "_param_constant37" : !torch.tensor
torch.attr private "_param_constant38" : !torch.tensor
torch.attr private "_param_constant39" : !torch.tensor
torch.attr private "_param_constant40" : !torch.tensor
torch.attr private "_param_constant41" : !torch.tensor
torch.attr private "_param_constant42" : !torch.tensor
torch.attr private "_param_constant43" : !torch.tensor
torch.attr private "_param_constant44" : !torch.tensor
torch.attr private "_param_constant45" : !torch.tensor
torch.attr private "_param_constant46" : !torch.tensor
torch.attr private "_param_constant47" : !torch.tensor
torch.attr private "_param_constant48" : !torch.tensor
torch.attr private "_param_constant49" : !torch.tensor
torch.attr private "_param_constant50" : !torch.tensor
torch.attr private "_param_constant51" : !torch.tensor
torch.attr private "_param_constant52" : !torch.tensor
torch.attr private "_param_constant53" : !torch.tensor
torch.attr private "_param_constant54" : !torch.tensor
torch.attr private "_param_constant55" : !torch.tensor
torch.attr private "_param_constant56" : !torch.tensor
torch.attr private "_param_constant57" : !torch.tensor
torch.attr private "_param_constant58" : !torch.tensor
torch.attr private "_param_constant59" : !torch.tensor
torch.attr private "_param_constant60" : !torch.tensor
torch.attr private "_param_constant61" : !torch.tensor
torch.attr private "_param_constant62" : !torch.tensor
torch.attr private "_param_constant63" : !torch.tensor
torch.attr private "_param_constant64" : !torch.tensor
torch.attr private "_param_constant65" : !torch.tensor
torch.attr private "_param_constant66" : !torch.tensor
torch.attr private "_param_constant67" : !torch.tensor
torch.attr private "_param_constant68" : !torch.tensor
torch.attr private "_param_constant69" : !torch.tensor
torch.attr private "_param_constant70" : !torch.tensor
torch.attr private "_param_constant71" : !torch.tensor
torch.attr private "_param_constant72" : !torch.tensor
torch.attr private "_param_constant73" : !torch.tensor
torch.attr private "_param_constant74" : !torch.tensor
torch.attr private "_param_constant75" : !torch.tensor
torch.attr private "_param_constant76" : !torch.tensor
torch.attr private "_param_constant77" : !torch.tensor
torch.attr private "_param_constant78" : !torch.tensor
torch.attr private "_param_constant79" : !torch.tensor
torch.attr private "_param_constant80" : !torch.tensor
torch.attr private "_param_constant81" : !torch.tensor
torch.attr private "_param_constant82" : !torch.tensor
torch.attr private "_param_constant83" : !torch.tensor
torch.attr private "_param_constant84" : !torch.tensor
torch.attr private "_param_constant85" : !torch.tensor
torch.attr private "_param_constant86" : !torch.tensor
torch.attr private "_param_constant87" : !torch.tensor
torch.attr private "_param_constant88" : !torch.tensor
torch.attr private "_param_constant89" : !torch.tensor
torch.attr private "_param_constant90" : !torch.tensor
torch.attr private "_param_constant91" : !torch.tensor
torch.attr private "_param_constant92" : !torch.tensor
torch.attr private "_param_constant93" : !torch.tensor
torch.attr private "_param_constant94" : !torch.tensor
torch.attr private "_param_constant95" : !torch.tensor
torch.attr private "_param_constant96" : !torch.tensor
torch.attr private "_param_constant97" : !torch.tensor
torch.attr private "_param_constant98" : !torch.tensor
torch.attr private "_param_constant99" : !torch.tensor
torch.attr private "_param_constant100" : !torch.tensor
torch.attr private "_param_constant101" : !torch.tensor
torch.attr private "_param_constant102" : !torch.tensor
torch.attr private "_param_constant103" : !torch.tensor
torch.attr private "_param_constant104" : !torch.tensor
torch.attr private "_param_constant105" : !torch.tensor
torch.attr private "_param_constant106" : !torch.tensor
torch.attr private "_param_constant107" : !torch.tensor
torch.attr private "_param_constant108" : !torch.tensor
torch.attr private "_param_constant109" : !torch.tensor
torch.attr private "_param_constant110" : !torch.tensor
torch.attr private "_param_constant111" : !torch.tensor
torch.attr private "_param_constant112" : !torch.tensor
torch.attr private "_param_constant113" : !torch.tensor
torch.attr private "_param_constant114" : !torch.tensor
torch.attr private "_param_constant115" : !torch.tensor
torch.attr private "_param_constant116" : !torch.tensor
torch.attr private "_param_constant117" : !torch.tensor
torch.attr private "_param_constant118" : !torch.tensor
torch.attr private "_param_constant119" : !torch.tensor
torch.attr private "_param_constant120" : !torch.tensor
torch.attr private "_param_constant121" : !torch.tensor
torch.attr private "_param_constant122" : !torch.tensor
torch.attr private "_param_constant123" : !torch.tensor
torch.attr private "_param_constant124" : !torch.tensor
torch.attr private "_param_constant125" : !torch.tensor
torch.attr private "_param_constant126" : !torch.tensor
torch.attr private "_param_constant127" : !torch.tensor
torch.attr private "_param_constant128" : !torch.tensor
torch.attr private "_param_constant129" : !torch.tensor
torch.attr private "_param_constant130" : !torch.tensor
torch.attr private "_param_constant131" : !torch.tensor
torch.attr private "_param_constant132" : !torch.tensor
torch.attr private "_param_constant133" : !torch.tensor
torch.attr private "_param_constant134" : !torch.tensor
torch.attr private "_param_constant135" : !torch.tensor
torch.attr private "_param_constant136" : !torch.tensor
torch.attr private "_param_constant137" : !torch.tensor
torch.attr private "_param_constant138" : !torch.tensor
torch.attr private "_param_constant139" : !torch.tensor
torch.attr private "_param_constant140" : !torch.tensor
torch.attr private "_param_constant141" : !torch.tensor
torch.attr private "_param_constant142" : !torch.tensor
torch.attr private "_param_constant143" : !torch.tensor
torch.attr private "_param_constant144" : !torch.tensor
torch.attr private "_param_constant145" : !torch.tensor
torch.attr private "_param_constant146" : !torch.tensor
torch.attr private "_param_constant147" : !torch.tensor
torch.attr private "_param_constant148" : !torch.tensor
// _tensor_constant0.._tensor_constant35: non-parameter tensors captured by
// the trace. In `forward` these are used in triplets per attention block
// (e.g. _tensor_constant33/34/35: scaling divisor, causal-mask source that
// is sliced and cast to bool, and the masked-fill value for
// torch.aten.where.self) -- 36 constants / 3 per block = 12 blocks.
torch.attr private "_tensor_constant0" : !torch.tensor
torch.attr private "_tensor_constant1" : !torch.tensor
torch.attr private "_tensor_constant2" : !torch.tensor
torch.attr private "_tensor_constant3" : !torch.tensor
torch.attr private "_tensor_constant4" : !torch.tensor
torch.attr private "_tensor_constant5" : !torch.tensor
torch.attr private "_tensor_constant6" : !torch.tensor
torch.attr private "_tensor_constant7" : !torch.tensor
torch.attr private "_tensor_constant8" : !torch.tensor
torch.attr private "_tensor_constant9" : !torch.tensor
torch.attr private "_tensor_constant10" : !torch.tensor
torch.attr private "_tensor_constant11" : !torch.tensor
torch.attr private "_tensor_constant12" : !torch.tensor
torch.attr private "_tensor_constant13" : !torch.tensor
torch.attr private "_tensor_constant14" : !torch.tensor
torch.attr private "_tensor_constant15" : !torch.tensor
torch.attr private "_tensor_constant16" : !torch.tensor
torch.attr private "_tensor_constant17" : !torch.tensor
torch.attr private "_tensor_constant18" : !torch.tensor
torch.attr private "_tensor_constant19" : !torch.tensor
torch.attr private "_tensor_constant20" : !torch.tensor
torch.attr private "_tensor_constant21" : !torch.tensor
torch.attr private "_tensor_constant22" : !torch.tensor
torch.attr private "_tensor_constant23" : !torch.tensor
torch.attr private "_tensor_constant24" : !torch.tensor
torch.attr private "_tensor_constant25" : !torch.tensor
torch.attr private "_tensor_constant26" : !torch.tensor
torch.attr private "_tensor_constant27" : !torch.tensor
torch.attr private "_tensor_constant28" : !torch.tensor
torch.attr private "_tensor_constant29" : !torch.tensor
torch.attr private "_tensor_constant30" : !torch.tensor
torch.attr private "_tensor_constant31" : !torch.tensor
torch.attr private "_tensor_constant32" : !torch.tensor
torch.attr private "_tensor_constant33" : !torch.tensor
torch.attr private "_tensor_constant34" : !torch.tensor
torch.attr private "_tensor_constant35" : !torch.tensor
// Standard nn.Module bookkeeping carried over by the importer.
torch.attr private "training" : !torch.bool
torch.attr private "_is_full_backward_hook" : !torch.optional<bool>
// Python source text of the FX graph, exposed through __code_getter.
torch.attr private "_code" : !torch.str
// Methods bound to this class type; bodies are the func.func private
// definitions earlier in the file.
torch.method private "__code_getter", @__torch__.torch.fx.graph_module._lambda.__code_getter
torch.method "forward", @__torch__.torch.fx.graph_module._lambda.forward
}
// Module-level weight literals; the actual data is elided as external
// dense_resource blobs, only shapes/dtypes remain. These values populate the
// `_param_constant*` attributes of the class type (presumably in declaration
// order, via torch.nn_module later in the file -- NOTE(review): the
// nn_module initializer is outside this view, confirm the pairing).
// Shapes follow a GPT-2 layout: per transformer block, 12 literals in the
// order ln_1 {weight,bias}, attn qkv {bias[2304], weight[768x2304]},
// attn proj {bias[768], weight[768x768]}, ln_2 {weight,bias},
// mlp fc {bias[3072], weight[768x3072]}, mlp proj {bias[768],
// weight[3072x768]} -- inferred from shapes, TODO confirm.
// %0: presumably the token embedding table (vocab 50257 x hidden 768).
%0 = torch.tensor.literal(dense_resource<__elided__> : tensor<50257x768xf32>) : !torch.tensor<[50257,768],f32>
// %1: presumably the learned positional embedding (max positions 1024).
%1 = torch.tensor.literal(dense_resource<__elided__> : tensor<1024x768xf32>) : !torch.tensor<[1024,768],f32>
// Block 0 parameters (%2-%13).
%2 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%3 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%4 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%5 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%6 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%7 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%8 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%9 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%10 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%11 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%12 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%13 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
// Block 1 parameters (%14-%25).
%14 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%15 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%16 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%17 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%18 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%19 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%20 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%21 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%22 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%23 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%24 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%25 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
// Block 2 parameters (%26-%37).
%26 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%27 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%28 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%29 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%30 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%31 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%32 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%33 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%34 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%35 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%36 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%37 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
// Block 3 parameters (%38-%49).
%38 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%39 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%40 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%41 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%42 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%43 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%44 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%45 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%46 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%47 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%48 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%49 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
// Block 4 parameters (%50-...; the run continues past this view).
%50 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%51 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%52 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%53 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%54 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%55 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%56 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%57 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%58 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%59 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%60 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%61 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%62 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%63 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%64 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%65 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%66 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%67 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%68 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%69 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%70 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%71 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%72 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%73 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%74 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%75 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%76 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%77 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%78 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%79 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%80 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%81 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%82 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%83 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%84 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%85 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%86 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%87 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%88 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%89 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%90 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%91 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%92 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%93 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%94 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%95 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%96 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%97 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%98 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%99 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%100 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%101 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%102 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%103 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%104 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%105 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%106 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%107 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%108 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%109 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%110 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%111 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%112 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%113 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%114 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%115 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%116 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%117 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%118 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%119 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%120 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%121 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%122 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%123 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%124 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%125 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%126 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%127 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%128 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%129 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%130 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%131 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%132 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%133 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%134 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%135 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%136 = torch.tensor.literal(dense_resource<__elided__> : tensor<2304xf32>) : !torch.tensor<[2304],f32>
%137 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x2304xf32>) : !torch.tensor<[768,2304],f32>
%138 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%139 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x768xf32>) : !torch.tensor<[768,768],f32>
%140 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%141 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%142 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072xf32>) : !torch.tensor<[3072],f32>
%143 = torch.tensor.literal(dense_resource<__elided__> : tensor<768x3072xf32>) : !torch.tensor<[768,3072],f32>
%144 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%145 = torch.tensor.literal(dense_resource<__elided__> : tensor<3072x768xf32>) : !torch.tensor<[3072,768],f32>
%146 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%147 = torch.tensor.literal(dense_resource<__elided__> : tensor<768xf32>) : !torch.tensor<[768],f32>
%148 = torch.tensor.literal(dense_resource<__elided__> : tensor<50257x768xf32>) : !torch.tensor<[50257,768],f32>
%149 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%150 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%151 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%152 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%153 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%154 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%155 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%156 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%157 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%158 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%159 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%160 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%161 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%162 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%163 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%164 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%165 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%166 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%167 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%168 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%169 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%170 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%171 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%172 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%173 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%174 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%175 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%176 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%177 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%178 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%179 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%180 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%181 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%182 = torch.tensor.literal(dense<8.000000e+00> : tensor<f32>) : !torch.tensor<[],f32>
%183 = torch.tensor.literal(dense_resource<__elided__> : tensor<1x1x1024x1024xui8>) : !torch.tensor<[1,1,1024,1024],ui8>
%184 = torch.tensor.literal(dense<-3.40282347E+38> : tensor<f32>) : !torch.tensor<[],f32>
%true = torch.constant.bool true
%none = torch.constant.none
%str = torch.constant.str "\0A\0A\0Adef forward(self, arg0_1):\0A view = torch.ops.aten.view(arg0_1, [-1, 5]); arg0_1 = None\0A arange = torch.ops.aten.arange(0, 5, dtype = torch.int64, device = device(type='cpu'), pin_memory = False)\0A unsqueeze = torch.ops.aten.unsqueeze(arange, 0); arange = None\0A view_1 = torch.ops.aten.view(unsqueeze, [-1, 5]); unsqueeze = None\0A _param_constant0 = self._param_constant0\0A embedding = torch.ops.aten.embedding(_param_constant0, view); _param_constant0 = view = None\0A _param_constant1 = self._param_constant1\0A embedding_1 = torch.ops.aten.embedding(_param_constant1, view_1); _param_constant1 = view_1 = None\0A add = torch.ops.aten.add(embedding, embedding_1); embedding = embedding_1 = None\0A _param_constant2 = self._param_constant2\0A _param_constant3 = self._param_constant3\0A native_layer_norm = torch.ops.aten.native_layer_norm(add, [768], _param_constant2, _param_constant3, 1e-05); _param_constant2 = _param_constant3 = None\0A getitem = native_layer_norm[0]\0A getitem_1 = native_layer_norm[1]\0A getitem_2 = native_layer_norm[2]; native_layer_norm = None\0A view_2 = torch.ops.aten.view(getitem, [-1, 768]); getitem = None\0A _param_constant4 = self._param_constant4\0A _param_constant5 = self._param_constant5\0A addmm = torch.ops.aten.addmm(_param_constant4, view_2, _param_constant5); _param_constant4 = view_2 = _param_constant5 = None\0A view_3 = torch.ops.aten.view(addmm, [1, 5, 2304]); addmm = None\0A slice_1 = torch.ops.aten.slice(view_3, 2, 0, 768)\0A slice_2 = torch.ops.aten.slice(view_3, 2, 768, 1536)\0A slice_3 = torch.ops.aten.slice(view_3, 2, 1536, 2304); view_3 = None\0A view_4 = torch.ops.aten.view(slice_1, [1, 5, 12, 64]); slice_1 = None\0A permute = torch.ops.aten.permute(view_4, [0, 2, 1, 3]); view_4 = None\0A view_5 = torch.ops.aten.view(slice_2, [1, 5, 12, 64]); slice_2 = None\0A permute_1 = torch.ops.aten.permute(view_5, [0, 2, 1, 3]); view_5 = None\0A view_6 = torch.ops.aten.view(slice_3, [1, 5, 12, 
64]); slice_3 = None\0A permute_2 = torch.ops.aten.permute(view_6, [0, 2, 1, 3]); view_6 = None\0A transpose = torch.ops.aten.transpose(permute_1, -1, -2); permute_1 = None\0A expand = torch.ops.aten.expand(permute, [1, 12, 5, 64]); permute = None\0A view_7 = torch.ops.aten.view(expand, [12, 5, 64]); expand = None\0A expand_1 = torch.ops.aten.expand(transpose, [1, 12, 64, 5]); transpose = None\0A view_8 = torch.ops.aten.view(expand_1, [12, 64, 5]); expand_1 = None\0A bmm = torch.ops.aten.bmm(view_7, view_8); view_7 = view_8 = None\0A _unsafe_view = torch.ops.aten._unsafe_view(bmm, [1, 12, 5, 5]); bmm = None\0A _tensor_constant0 = self._tensor_constant0\0A lift_fresh_copy = torch.ops.aten.lift_fresh_copy(_tensor_constant0); _tensor_constant0 = None\0A div = torch.ops.aten.div(_unsafe_view, lift_fresh_copy); _unsafe_view = lift_fresh_copy = None\0A _tensor_constant1 = self._tensor_constant1\0A slice_4 = torch.ops.aten.slice(_tensor_constant1, 0, 0, 9223372036854775807); _tensor_constant1 = None\0A slice_5 = torch.ops.aten.slice(slice_4, 1, 0, 9223372036854775807); slice_4 = None\0A slice_6 = torch.ops.aten.slice(slice_5, 2, 0, 5); slice_5 = None\0A slice_7 = torch.ops.aten.slice(slice_6, 3, 0, 5); slice_6 = None\0A _to_copy = torch.ops.aten._to_copy(slice_7, dtype = torch.bool); slice_7 = None\0A _tensor_constant2 = self._tensor_constant2\0A lift_fresh_copy_1 = torch.ops.aten.lift_fresh_copy(_tensor_constant2); _tensor_constant2 = None\0A where = torch.ops.aten.where(_to_copy, div, lift_fresh_copy_1); _to_copy = div = lift_fresh_copy_1 = None\0A _softmax = torch.ops.aten._softmax(where, -1, False); where = None\0A detach = torch.ops.aten.detach(_softmax)\0A expand_2 = torch.ops.aten.expand(_softmax, [1, 12, 5, 5]); _softmax = None\0A view_9 = torch.ops.aten.view(expand_2, [12, 5, 5]); expand_2 = None\0A expand_3 = torch.ops.aten.expand(permute_2, [1, 12, 5, 64]); permute_2 = None\0A view_10 = torch.ops.aten.view(expand_3, [12, 5, 64]); expand_3 = None\0A bmm_1 = 
torch.ops.aten.bmm(view_9, view_10); view_9 = view_10 = None\0A _unsafe_view_1 = torch.ops.aten._unsafe_view(bmm_1, [1, 12, 5, 64]); bmm_1 = None\0A permute_3 = torch.ops.aten.permute(_unsafe_view_1, [0, 2, 1, 3]); _unsafe_view_1 = None\0A clone = torch.ops.aten.clone(permute_3, memory_format = torch.contiguous_format); permute_3 = None\0A view_11 = torch.ops.aten.view(clone, [1, 5, 768]); clone = None\0A view_12 = torch.ops.aten.view(view_11, [-1, 768]); view_11 = None\0A _param_constant6 = self._param_constant6\0A _param_constant7 = self._param_constant7\0A addmm_1 = torch.ops.aten.addmm(_param_constant6, view_12, _param_constant7); _param_constant6 = view_12 = _param_constant7 = None\0A view_13 = torch.ops.aten.view(addmm_1, [1, 5, 768]); addmm_1 = None\0A add_1 = torch.ops.aten.add(view_13, add); view_13 = add = None\0A _param_constant8 = self._param_constant8\0A _param_constant9 = self._param_constant9\0A native_layer_norm_1 = torch.ops.aten.native_layer_norm(add_1, [768], _param_constant8, _param_constant9, 1e-05); _param_constant8 = _param_constant9 = None\0A getitem_3 = native_layer_norm_1[0]\0A getitem_4 = native_layer_norm_1[1]\0A getitem_5 = native_layer_norm_1[2]; native_layer_norm_1 = None\0A view_14 = torch.ops.aten.view(getitem_3, [-1, 768]); getitem_3 = None\0A _param_constant10 = self._param_constant10\0A _param_constant11 = self._param_constant11\0A addmm_2 = torch.ops.aten.addmm(_param_constant10, view_14, _param_constant11); _param_constant10 = view_14 = _param_constant11 = None\0A view_15 = torch.ops.aten.view(addmm_2, [1, 5, 3072]); addmm_2 = None\0A mul = torch.ops.aten.mul(view_15, 0.5)\0A pow_1 = torch.ops.aten.pow(view_15, 3.0)\0A mul_1 = torch.ops.aten.mul(pow_1, 0.044715); pow_1 = None\0A add_2 = torch.ops.aten.add(view_15, mul_1); view_15 = mul_1 = None\0A mul_2 = torch.ops.aten.mul(add_2, 0.7978845608028654); add_2 = None\0A tanh = torch.ops.aten.tanh(mul_2); mul_2 = None\0A detach_1 = torch.ops.aten.detach(tanh)\0A add_3 = 
torch.ops.aten.add(tanh, 1.0); tanh = None\0A mul_3 = torch.ops.aten.mul(mul, add_3); mul = add_3 = None\0A view_16 = torch.ops.aten.view(mul_3, [-1, 3072]); mul_3 = None\0A _param_constant12 = self._param_constant12\0A _param_constant13 = self._param_constant13\0A addmm_3 = torch.ops.aten.addmm(_param_constant12, view_16, _param_constant13); _param_constant12 = view_16 = _param_constant13 = None\0A view_17 = torch.ops.aten.view(addmm_3, [1, 5, 768]); addmm_3 = None\0A add_4 = torch.ops.aten.add(add_1, view_17); add_1 = view_17 = None\0A _param_constant14 = self._param_constant14\0A _param_constant15 = self._param_constant15\0A native_layer_norm_2 = torch.ops.aten.native_layer_norm(add_4, [768], _param_constant14, _param_constant15, 1e-05); _param_constant14 = _param_constant15 = None\0A getitem_6 = native_layer_norm_2[0]\0A getitem_7 = native_layer_norm_2[1]\0A getitem_8 = native_layer_norm_2[2]; native_layer_norm_2 = None\0A view_18 = torch.ops.aten.view(getitem_6, [-1, 768]); getitem_6 = None\0A _param_constant16 = self._param_constant16\0A _param_constant17 = self._param_constant17\0A addmm_4 = torch.ops.aten.addmm(_param_constant16, view_18, _param_constant17); _param_constant16 = view_18 = _param_constant17 = None\0A view_19 = torch.ops.aten.view(addmm_4, [1, 5, 2304]); addmm_4 = None\0A slice_8 = torch.ops.aten.slice(view_19, 2, 0, 768)\0A slice_9 = torch.ops.aten.slice(view_19, 2, 768, 1536)\0A slice_10 = torch.ops.aten.slice(view_19, 2, 1536, 2304); view_19 = None\0A view_20 = torch.ops.aten.view(slice_8, [1, 5, 12, 64]); slice_8 = None\0A permute_4 = torch.ops.aten.permute(view_20, [0, 2, 1, 3]); view_20 = None\0A view_21 = torch.ops.aten.view(slice_9, [1, 5, 12, 64]); slice_9 = None\0A permute_5 = torch.ops.aten.permute(view_21, [0, 2, 1, 3]); view_21 = None\0A view_22 = torch.ops.aten.view(slice_10, [1, 5, 12, 64]); slice_10 = None\0A permute_6 = torch.ops.aten.permute(view_22, [0, 2, 1, 3]); view_22 = None\0A transpose_1 = 
torch.ops.aten.transpose(permute_5, -1, -2); permute_5 = None\0A expand_4 = torch.ops.aten.expand(permute_4, [1, 12, 5, 64]); permute_4 = None\0A view_23 = torch.ops.aten.view(expand_4, [12, 5, 64]); expand_4 = None\0A expand_5 = torch.ops.aten.expand(transpose_1, [1, 12, 64, 5]); transpose_1 = None\0A view_24 = torch.ops.aten.view(expand_5, [12, 64, 5]); expand_5 = None\0A bmm_2 = torch.ops.aten.bmm(view_23, view_24); view_23 = view_24 = None\0A _unsafe_view_2 = torch.ops.aten._unsafe_view(bmm_2, [1, 12, 5, 5]); bmm_2 = None\0A _tensor_constant3 = self._tensor_constant3\0A lift_fresh_copy_2 = torch.ops.aten.lift_fresh_copy(_tensor_constant3); _tensor_constant3 = None\0A div_1 = torch.ops.aten.div(_unsafe_view_2, lift_fresh_copy_2); _unsafe_view_2 = lift_fresh_copy_2 = None\0A _tensor_constant4 = self._tensor_constant4\0A slice_11 = torch.ops.aten.slice(_tensor_constant4, 0, 0, 9223372036854775807); _tensor_constant4 = None\0A slice_12 = torch.ops.aten.slice(slice_11, 1, 0, 9223372036854775807); slice_11 = None\0A slice_13 = torch.ops.aten.slice(slice_12, 2, 0, 5); slice_12 = None\0A slice_14 = torch.ops.aten.slice(slice_13, 3, 0, 5); slice_13 = None\0A _to_copy_1 = torch.ops.aten._to_copy(slice_14, dtype = torch.bool); slice_14 = None\0A _tensor_constant5 = self._tensor_constant5\0A lift_fresh_copy_3 = torch.ops.aten.lift_fresh_copy(_tensor_constant5); _tensor_constant5 = None\0A where_1 = torch.ops.aten.where(_to_copy_1, div_1, lift_fresh_copy_3); _to_copy_1 = div_1 = lift_fresh_copy_3 = None\0A _softmax_1 = torch.ops.aten._softmax(where_1, -1, False); where_1 = None\0A detach_2 = torch.ops.aten.detach(_softmax_1)\0A expand_6 = torch.ops.aten.expand(_softmax_1, [1, 12, 5, 5]); _softmax_1 = None\0A view_25 = torch.ops.aten.view(expand_6, [12, 5, 5]); expand_6 = None\0A expand_7 = torch.ops.aten.expand(permute_6, [1, 12, 5, 64]); permute_6 = None\0A view_26 = torch.ops.aten.view(expand_7, [12, 5, 64]); expand_7 = None\0A bmm_3 = torch.ops.aten.bmm(view_25, 
view_26); view_25 = view_26 = None\0A _unsafe_view_3 = torch.ops.aten._unsafe_view(bmm_3, [1, 12, 5, 64]); bmm_3 = None\0A permute_7 = torch.ops.aten.permute(_unsafe_view_3, [0, 2, 1, 3]); _unsafe_view_3 = None\0A clone_1 = torch.ops.aten.clone(permute_7, memory_format = torch.contiguous_format); permute_7 = None\0A view_27 = torch.ops.aten.view(clone_1, [1, 5, 768]); clone_1 = None\0A view_28 = torch.ops.aten.view(view_27, [-1, 768]); view_27 = None\0A _param_constant18 = self._param_constant18\0A _param_constant19 = self._param_constant19\0A addmm_5 = torch.ops.aten.addmm(_param_constant18, view_28, _param_constant19); _param_constant18 = view_28 = _param_constant19 = None\0A view_29 = torch.ops.aten.view(addmm_5, [1, 5, 768]); addmm_5 = None\0A add_5 = torch.ops.aten.add(view_29, add_4); view_29 = add_4 = None\0A _param_constant20 = self._param_constant20\0A _param_constant21 = self._param_constant21\0A native_layer_norm_3 = torch.ops.aten.native_layer_norm(add_5, [768], _param_constant20, _param_constant21, 1e-05); _param_constant20 = _param_constant21 = None\0A getitem_9 = native_layer_norm_3[0]\0A getitem_10 = native_layer_norm_3[1]\0A getitem_11 = native_layer_norm_3[2]; native_layer_norm_3 = None\0A view_30 = torch.ops.aten.view(getitem_9, [-1, 768]); getitem_9 = None\0A _param_constant22 = self._param_constant22\0A _param_constant23 = self._param_constant23\0A addmm_6 = torch.ops.aten.addmm(_param_constant22, view_30, _param_constant23); _param_constant22 = view_30 = _param_constant23 = None\0A view_31 = torch.ops.aten.view(addmm_6, [1, 5, 3072]); addmm_6 = None\0A mul_4 = torch.ops.aten.mul(view_31, 0.5)\0A pow_2 = torch.ops.aten.pow(view_31, 3.0)\0A mul_5 = torch.ops.aten.mul(pow_2, 0.044715); pow_2 = None\0A add_6 = torch.ops.aten.add(view_31, mul_5); view_31 = mul_5 = None\0A mul_6 = torch.ops.aten.mul(add_6, 0.7978845608028654); add_6 = None\0A tanh_1 = torch.ops.aten.tanh(mul_6); mul_6 = None\0A detach_3 = torch.ops.aten.detach(tanh_1)\0A add_7 = 
torch.ops.aten.add(tanh_1, 1.0); tanh_1 = None\0A mul_7 = torch.ops.aten.mul(mul_4, add_7); mul_4 = add_7 = None\0A view_32 = torch.ops.aten.view(mul_7, [-1, 3072]); mul_7 = None\0A _param_constant24 = self._param_constant24\0A _param_constant25 = self._param_constant25\0A addmm_7 = torch.ops.aten.addmm(_param_constant24, view_32, _param_constant25); _param_constant24 = view_32 = _param_constant25 = None\0A view_33 = torch.ops.aten.view(addmm_7, [1, 5, 768]); addmm_7 = None\0A add_8 = torch.ops.aten.add(add_5, view_33); add_5 = view_33 = None\0A _param_constant26 = self._param_constant26\0A _param_constant27 = self._param_constant27\0A native_layer_norm_4 = torch.ops.aten.native_layer_norm(add_8, [768], _param_constant26, _param_constant27, 1e-05); _param_constant26 = _param_constant27 = None\0A getitem_12 = native_layer_norm_4[0]\0A getitem_13 = native_layer_norm_4[1]\0A getitem_14 = native_layer_norm_4[2]; native_layer_norm_4 = None\0A view_34 = torch.ops.aten.view(getitem_12, [-1, 768]); getitem_12 = None\0A _param_constant28 = self._param_constant28\0A _param_constant29 = self._param_constant29\0A addmm_8 = torch.ops.aten.addmm(_param_constant28, view_34, _param_constant29); _param_constant28 = view_34 = _param_constant29 = None\0A view_35 = torch.ops.aten.view(addmm_8, [1, 5, 2304]); addmm_8 = None\0A slice_15 = torch.ops.aten.slice(view_35, 2, 0, 768)\0A slice_16 = torch.ops.aten.slice(view_35, 2, 768, 1536)\0A slice_17 = torch.ops.aten.slice(view_35, 2, 1536, 2304); view_35 = None\0A view_36 = torch.ops.aten.view(slice_15, [1, 5, 12, 64]); slice_15 = None\0A permute_8 = torch.ops.aten.permute(view_36, [0, 2, 1, 3]); view_36 = None\0A view_37 = torch.ops.aten.view(slice_16, [1, 5, 12, 64]); slice_16 = None\0A permute_9 = torch.ops.aten.permute(view_37, [0, 2, 1, 3]); view_37 = None\0A view_38 = torch.ops.aten.view(slice_17, [1, 5, 12, 64]); slice_17 = None\0A permute_10 = torch.ops.aten.permute(view_38, [0, 2, 1, 3]); view_38 = None\0A transpose_2 = 
torch.ops.aten.transpose(permute_9, -1, -2); permute_9 = None\0A expand_8 = torch.ops.aten.expand(permute_8, [1, 12, 5, 64]); permute_8 = None\0A view_39 = torch.ops.aten.view(expand_8, [12, 5, 64]); expand_8 = None\0A expand_9 = torch.ops.aten.expand(transpose_2, [1, 12, 64, 5]); transpose_2 = None\0A view_40 = torch.ops.aten.view(expand_9, [12, 64, 5]); expand_9 = None\0A bmm_4 = torch.ops.aten.bmm(view_39, view_40); view_39 = view_40 = None\0A _unsafe_view_4 = torch.ops.aten._unsafe_view(bmm_4, [1, 12, 5, 5]); bmm_4 = None\0A _tensor_constant6 = self._tensor_constant6\0A lift_fresh_copy_4 = torch.ops.aten.lift_fresh_copy(_tensor_constant6); _tensor_constant6 = None\0A div_2 = torch.ops.aten.div(_unsafe_view_4, lift_fresh_copy_4); _unsafe_view_4 = lift_fresh_copy_4 = None\0A _tensor_constant7 = self._tensor_constant7\0A slice_18 = torch.ops.aten.slice(_tensor_constant7, 0, 0, 9223372036854775807); _tensor_constant7 = None\0A slice_19 = torch.ops.aten.slice(slice_18, 1, 0, 9223372036854775807); slice_18 = None\0A slice_20 = torch.ops.aten.slice(slice_19, 2, 0, 5); slice_19 = None\0A slice_21 = torch.ops.aten.slice(slice_20, 3, 0, 5); slice_20 = None\0A _to_copy_2 = torch.ops.aten._to_copy(slice_21, dtype = torch.bool); slice_21 = None\0A _tensor_constant8 = self._tensor_constant8\0A lift_fresh_copy_5 = torch.ops.aten.lift_fresh_copy(_tensor_constant8); _tensor_constant8 = None\0A where_2 = torch.ops.aten.where(_to_copy_2, div_2, lift_fresh_copy_5); _to_copy_2 = div_2 = lift_fresh_copy_5 = None\0A _softmax_2 = torch.ops.aten._softmax(where_2, -1, False); where_2 = None\0A detach_4 = torch.ops.aten.detach(_softmax_2)\0A expand_10 = torch.ops.aten.expand(_softmax_2, [1, 12, 5, 5]); _softmax_2 = None\0A view_41 = torch.ops.aten.view(expand_10, [12, 5, 5]); expand_10 = None\0A expand_11 = torch.ops.aten.expand(permute_10, [1, 12, 5, 64]); permute_10 = None\0A view_42 = torch.ops.aten.view(expand_11, [12, 5, 64]); expand_11 = None\0A bmm_5 = torch.ops.aten.bmm(view_41, 
view_42); view_41 = view_42 = None\0A _unsafe_view_5 = torch.ops.aten._unsafe_view(bmm_5, [1, 12, 5, 64]); bmm_5 = None\0A permute_11 = torch.ops.aten.permute(_unsafe_view_5, [0, 2, 1, 3]); _unsafe_view_5 = None\0A clone_2 = torch.ops.aten.clone(permute_11, memory_format = torch.contiguous_format); permute_11 = None\0A view_43 = torch.ops.aten.view(clone_2, [1, 5, 768]); clone_2 = None\0A view_44 = torch.ops.aten.view(view_43, [-1, 768]); view_43 = None\0A _param_constant30 = self._param_constant30\0A _param_constant31 = self._param_constant31\0A addmm_9 = torch.ops.aten.addmm(_param_constant30, view_44, _param_constant31); _param_constant30 = view_44 = _param_constant31 = None\0A view_45 = torch.ops.aten.view(addmm_9, [1, 5, 768]); addmm_9 = None\0A add_9 = torch.ops.aten.add(view_45, add_8); view_45 = add_8 = None\0A _param_constant32 = self._param_constant32\0A _param_constant33 = self._param_constant33\0A native_layer_norm_5 = torch.ops.aten.native_layer_norm(add_9, [768], _param_constant32, _param_constant33, 1e-05); _param_constant32 = _param_constant33 = None\0A getitem_15 = native_layer_norm_5[0]\0A getitem_16 = native_layer_norm_5[1]\0A getitem_17 = native_layer_norm_5[2]; native_layer_norm_5 = None\0A view_46 = torch.ops.aten.view(getitem_15, [-1, 768]); getitem_15 = None\0A _param_constant34 = self._param_constant34\0A _param_constant35 = self._param_constant35\0A addmm_10 = torch.ops.aten.addmm(_param_constant34, view_46, _param_constant35); _param_constant34 = view_46 = _param_constant35 = None\0A view_47 = torch.ops.aten.view(addmm_10, [1, 5, 3072]); addmm_10 = None\0A mul_8 = torch.ops.aten.mul(view_47, 0.5)\0A pow_3 = torch.ops.aten.pow(view_47, 3.0)\0A mul_9 = torch.ops.aten.mul(pow_3, 0.044715); pow_3 = None\0A add_10 = torch.ops.aten.add(view_47, mul_9); view_47 = mul_9 = None\0A mul_10 = torch.ops.aten.mul(add_10, 0.7978845608028654); add_10 = None\0A tanh_2 = torch.ops.aten.tanh(mul_10); mul_10 = None\0A detach_5 = 
torch.ops.aten.detach(tanh_2)\0A add_11 = torch.ops.aten.add(tanh_2, 1.0); tanh_2 = None\0A mul_11 = torch.ops.aten.mul(mul_8, add_11); mul_8 = add_11 = None\0A view_48 = torch.ops.aten.view(mul_11, [-1, 3072]); mul_11 = None\0A _param_constant36 = self._param_constant36\0A _param_constant37 = self._param_constant37\0A addmm_11 = torch.ops.aten.addmm(_param_constant36, view_48, _param_constant37); _param_constant36 = view_48 = _param_constant37 = None\0A view_49 = torch.ops.aten.view(addmm_11, [1, 5, 768]); addmm_11 = None\0A add_12 = torch.ops.aten.add(add_9, view_49); add_9 = view_49 = None\0A _param_constant38 = self._param_constant38\0A _param_constant39 = self._param_constant39\0A native_layer_norm_6 = torch.ops.aten.native_layer_norm(add_12, [768], _param_constant38, _param_constant39, 1e-05); _param_constant38 = _param_constant39 = None\0A getitem_18 = native_layer_norm_6[0]\0A getitem_19 = native_layer_norm_6[1]\0A getitem_20 = native_layer_norm_6[2]; native_layer_norm_6 = None\0A view_50 = torch.ops.aten.view(getitem_18, [-1, 768]); getitem_18 = None\0A _param_constant40 = self._param_constant40\0A _param_constant41 = self._param_constant41\0A addmm_12 = torch.ops.aten.addmm(_param_constant40, view_50, _param_constant41); _param_constant40 = view_50 = _param_constant41 = None\0A view_51 = torch.ops.aten.view(addmm_12, [1, 5, 2304]); addmm_12 = None\0A slice_22 = torch.ops.aten.slice(view_51, 2, 0, 768)\0A slice_23 = torch.ops.aten.slice(view_51, 2, 768, 1536)\0A slice_24 = torch.ops.aten.slice(view_51, 2, 1536, 2304); view_51 = None\0A view_52 = torch.ops.aten.view(slice_22, [1, 5, 12, 64]); slice_22 = None\0A permute_12 = torch.ops.aten.permute(view_52, [0, 2, 1, 3]); view_52 = None\0A view_53 = torch.ops.aten.view(slice_23, [1, 5, 12, 64]); slice_23 = None\0A permute_13 = torch.ops.aten.permute(view_53, [0, 2, 1, 3]); view_53 = None\0A view_54 = torch.ops.aten.view(slice_24, [1, 5, 12, 64]); slice_24 = None\0A permute_14 = torch.ops.aten.permute(view_54, 
[0, 2, 1, 3]); view_54 = None\0A transpose_3 = torch.ops.aten.transpose(permute_13, -1, -2); permute_13 = None\0A expand_12 = torch.ops.aten.expand(permute_12, [1, 12, 5, 64]); permute_12 = None\0A view_55 = torch.ops.aten.view(expand_12, [12, 5, 64]); expand_12 = None\0A expand_13 = torch.ops.aten.expand(transpose_3, [1, 12, 64, 5]); transpose_3 = None\0A view_56 = torch.ops.aten.view(expand_13, [12, 64, 5]); expand_13 = None\0A bmm_6 = torch.ops.aten.bmm(view_55, view_56); view_55 = view_56 = None\0A _unsafe_view_6 = torch.ops.aten._unsafe_view(bmm_6, [1, 12, 5, 5]); bmm_6 = None\0A _tensor_constant9 = self._tensor_constant9\0A lift_fresh_copy_6 = torch.ops.aten.lift_fresh_copy(_tensor_constant9); _tensor_constant9 = None\0A div_3 = torch.ops.aten.div(_unsafe_view_6, lift_fresh_copy_6); _unsafe_view_6 = lift_fresh_copy_6 = None\0A _tensor_constant10 = self._tensor_constant10\0A slice_25 = torch.ops.aten.slice(_tensor_constant10, 0, 0, 9223372036854775807); _tensor_constant10 = None\0A slice_26 = torch.ops.aten.slice(slice_25, 1, 0, 9223372036854775807); slice_25 = None\0A slice_27 = torch.ops.aten.slice(slice_26, 2, 0, 5); slice_26 = None\0A slice_28 = torch.ops.aten.slice(slice_27, 3, 0, 5); slice_27 = None\0A _to_copy_3 = torch.ops.aten._to_copy(slice_28, dtype = torch.bool); slice_28 = None\0A _tensor_constant11 = self._tensor_constant11\0A lift_fresh_copy_7 = torch.ops.aten.lift_fresh_copy(_tensor_constant11); _tensor_constant11 = None\0A where_3 = torch.ops.aten.where(_to_copy_3, div_3, lift_fresh_copy_7); _to_copy_3 = div_3 = lift_fresh_copy_7 = None\0A _softmax_3 = torch.ops.aten._softmax(where_3, -1, False); where_3 = None\0A detach_6 = torch.ops.aten.detach(_softmax_3)\0A expand_14 = torch.ops.aten.expand(_softmax_3, [1, 12, 5, 5]); _softmax_3 = None\0A view_57 = torch.ops.aten.view(expand_14, [12, 5, 5]); expand_14 = None\0A expand_15 = torch.ops.aten.expand(permute_14, [1, 12, 5, 64]); permute_14 = None\0A view_58 = torch.ops.aten.view(expand_15, [12, 
5, 64]); expand_15 = None\0A bmm_7 = torch.ops.aten.bmm(view_57, view_58); view_57 = view_58 = None\0A _unsafe_view_7 = torch.ops.aten._unsafe_view(bmm_7, [1, 12, 5, 64]); bmm_7 = None\0A permute_15 = torch.ops.aten.permute(_unsafe_view_7, [0, 2, 1, 3]); _unsafe_view_7 = None\0A clone_3 = torch.ops.aten.clone(permute_15, memory_format = torch.contiguous_format); permute_15 = None\0A view_59 = torch.ops.aten.view(clone_3, [1, 5, 768]); clone_3 = None\0A view_60 = torch.ops.aten.view(view_59, [-1, 768]); view_59 = None\0A _param_constant42 = self._param_constant42\0A _param_constant43 = self._param_constant43\0A addmm_13 = torch.ops.aten.addmm(_param_constant42, view_60, _param_constant43); _param_constant42 = view_60 = _param_constant43 = None\0A view_61 = torch.ops.aten.view(addmm_13, [1, 5, 768]); addmm_13 = None\0A add_13 = torch.ops.aten.add(view_61, add_12); view_61 = add_12 = None\0A _param_constant44 = self._param_constant44\0A _param_constant45 = self._param_constant45\0A native_layer_norm_7 = torch.ops.aten.native_layer_norm(add_13, [768], _param_constant44, _param_constant45, 1e-05); _param_constant44 = _param_constant45 = None\0A getitem_21 = native_layer_norm_7[0]\0A getitem_22 = native_layer_norm_7[1]\0A getitem_23 = native_layer_norm_7[2]; native_layer_norm_7 = None\0A view_62 = torch.ops.aten.view(getitem_21, [-1, 768]); getitem_21 = None\0A _param_constant46 = self._param_constant46\0A _param_constant47 = self._param_constant47\0A addmm_14 = torch.ops.aten.addmm(_param_constant46, view_62, _param_constant47); _param_constant46 = view_62 = _param_constant47 = None\0A view_63 = torch.ops.aten.view(addmm_14, [1, 5, 3072]); addmm_14 = None\0A mul_12 = torch.ops.aten.mul(view_63, 0.5)\0A pow_4 = torch.ops.aten.pow(view_63, 3.0)\0A mul_13 = torch.ops.aten.mul(pow_4, 0.044715); pow_4 = None\0A add_14 = torch.ops.aten.add(view_63, mul_13); view_63 = mul_13 = None\0A mul_14 = torch.ops.aten.mul(add_14, 0.7978845608028654); add_14 = None\0A tanh_3 = 
torch.ops.aten.tanh(mul_14); mul_14 = None\0A detach_7 = torch.ops.aten.detach(tanh_3)\0A add_15 = torch.ops.aten.add(tanh_3, 1.0); tanh_3 = None\0A mul_15 = torch.ops.aten.mul(mul_12, add_15); mul_12 = add_15 = None\0A view_64 = torch.ops.aten.view(mul_15, [-1, 3072]); mul_15 = None\0A _param_constant48 = self._param_constant48\0A _param_constant49 = self._param_constant49\0A addmm_15 = torch.ops.aten.addmm(_param_constant48, view_64, _param_constant49); _param_constant48 = view_64 = _param_constant49 = None\0A view_65 = torch.ops.aten.view(addmm_15, [1, 5, 768]); addmm_15 = None\0A add_16 = torch.ops.aten.add(add_13, view_65); add_13 = view_65 = None\0A _param_constant50 = self._param_constant50\0A _param_constant51 = self._param_constant51\0A native_layer_norm_8 = torch.ops.aten.native_layer_norm(add_16, [768], _param_constant50, _param_constant51, 1e-05); _param_constant50 = _param_constant51 = None\0A getitem_24 = native_layer_norm_8[0]\0A getitem_25 = native_layer_norm_8[1]\0A getitem_26 = native_layer_norm_8[2]; native_layer_norm_8 = None\0A view_66 = torch.ops.aten.view(getitem_24, [-1, 768]); getitem_24 = None\0A _param_constant52 = self._param_constant52\0A _param_constant53 = self._param_constant53\0A addmm_16 = torch.ops.aten.addmm(_param_constant52, view_66, _param_constant53); _param_constant52 = view_66 = _param_constant53 = None\0A view_67 = torch.ops.aten.view(addmm_16, [1, 5, 2304]); addmm_16 = None\0A slice_29 = torch.ops.aten.slice(view_67, 2, 0, 768)\0A slice_30 = torch.ops.aten.slice(view_67, 2, 768, 1536)\0A slice_31 = torch.ops.aten.slice(view_67, 2, 1536, 2304); view_67 = None\0A view_68 = torch.ops.aten.view(slice_29, [1, 5, 12, 64]); slice_29 = None\0A permute_16 = torch.ops.aten.permute(view_68, [0, 2, 1, 3]); view_68 = None\0A view_69 = torch.ops.aten.view(slice_30, [1, 5, 12, 64]); slice_30 = None\0A permute_17 = torch.ops.aten.permute(view_69, [0, 2, 1, 3]); view_69 = None\0A view_70 = torch.ops.aten.view(slice_31, [1, 5, 12, 64]); 
slice_31 = None\0A permute_18 = torch.ops.aten.permute(view_70, [0, 2, 1, 3]); view_70 = None\0A transpose_4 = torch.ops.aten.transpose(permute_17, -1, -2); permute_17 = None\0A expand_16 = torch.ops.aten.expand(permute_16, [1, 12, 5, 64]); permute_16 = None\0A view_71 = torch.ops.aten.view(expand_16, [12, 5, 64]); expand_16 = None\0A expand_17 = torch.ops.aten.expand(transpose_4, [1, 12, 64, 5]); transpose_4 = None\0A view_72 = torch.ops.aten.view(expand_17, [12, 64, 5]); expand_17 = None\0A bmm_8 = torch.ops.aten.bmm(view_71, view_72); view_71 = view_72 = None\0A _unsafe_view_8 = torch.ops.aten._unsafe_view(bmm_8, [1, 12, 5, 5]); bmm_8 = None\0A _tensor_constant12 = self._tensor_constant12\0A lift_fresh_copy_8 = torch.ops.aten.lift_fresh_copy(_tensor_constant12); _tensor_constant12 = None\0A div_4 = torch.ops.aten.div(_unsafe_view_8, lift_fresh_copy_8); _unsafe_view_8 = lift_fresh_copy_8 = None\0A _tensor_constant13 = self._tensor_constant13\0A slice_32 = torch.ops.aten.slice(_tensor_constant13, 0, 0, 9223372036854775807); _tensor_constant13 = None\0A slice_33 = torch.ops.aten.slice(slice_32, 1, 0, 9223372036854775807); slice_32 = None\0A slice_34 = torch.ops.aten.slice(slice_33, 2, 0, 5); slice_33 = None\0A slice_35 = torch.ops.aten.slice(slice_34, 3, 0, 5); slice_34 = None\0A _to_copy_4 = torch.ops.aten._to_copy(slice_35, dtype = torch.bool); slice_35 = None\0A _tensor_constant14 = self._tensor_constant14\0A lift_fresh_copy_9 = torch.ops.aten.lift_fresh_copy(_tensor_constant14); _tensor_constant14 = None\0A where_4 = torch.ops.aten.where(_to_copy_4, div_4, lift_fresh_copy_9); _to_copy_4 = div_4 = lift_fresh_copy_9 = None\0A _softmax_4 = torch.ops.aten._softmax(where_4, -1, False); where_4 = None\0A detach_8 = torch.ops.aten.detach(_softmax_4)\0A expand_18 = torch.ops.aten.expand(_softmax_4, [1, 12, 5, 5]); _softmax_4 = None\0A view_73 = torch.ops.aten.view(expand_18, [12, 5, 5]); expand_18 = None\0A expand_19 = torch.ops.aten.expand(permute_18, [1, 12, 5, 64]); 
permute_18 = None\0A view_74 = torch.ops.aten.view(expand_19, [12, 5, 64]); expand_19 = None\0A bmm_9 = torch.ops.aten.bmm(view_73, view_74); view_73 = view_74 = None\0A _unsafe_view_9 = torch.ops.aten._unsafe_view(bmm_9, [1, 12, 5, 64]); bmm_9 = None\0A permute_19 = torch.ops.aten.permute(_unsafe_view_9, [0, 2, 1, 3]); _unsafe_view_9 = None\0A clone_4 = torch.ops.aten.clone(permute_19, memory_format = torch.contiguous_format); permute_19 = None\0A view_75 = torch.ops.aten.view(clone_4, [1, 5, 768]); clone_4 = None\0A view_76 = torch.ops.aten.view(view_75, [-1, 768]); view_75 = None\0A _param_constant54 = self._param_constant54\0A _param_constant55 = self._param_constant55\0A addmm_17 = torch.ops.aten.addmm(_param_constant54, view_76, _param_constant55); _param_constant54 = view_76 = _param_constant55 = None\0A view_77 = torch.ops.aten.view(addmm_17, [1, 5, 768]); addmm_17 = None\0A add_17 = torch.ops.aten.add(view_77, add_16); view_77 = add_16 = None\0A _param_constant56 = self._param_constant56\0A _param_constant57 = self._param_constant57\0A native_layer_norm_9 = torch.ops.aten.native_layer_norm(add_17, [768], _param_constant56, _param_constant57, 1e-05); _param_constant56 = _param_constant57 = None\0A getitem_27 = native_layer_norm_9[0]\0A getitem_28 = native_layer_norm_9[1]\0A getitem_29 = native_layer_norm_9[2]; native_layer_norm_9 = None\0A view_78 = torch.ops.aten.view(getitem_27, [-1, 768]); getitem_27 = None\0A _param_constant58 = self._param_constant58\0A _param_constant59 = self._param_constant59\0A addmm_18 = torch.ops.aten.addmm(_param_constant58, view_78, _param_constant59); _param_constant58 = view_78 = _param_constant59 = None\0A view_79 = torch.ops.aten.view(addmm_18, [1, 5, 3072]); addmm_18 = None\0A mul_16 = torch.ops.aten.mul(view_79, 0.5)\0A pow_5 = torch.ops.aten.pow(view_79, 3.0)\0A mul_17 = torch.ops.aten.mul(pow_5, 0.044715); pow_5 = None\0A add_18 = torch.ops.aten.add(view_79, mul_17); view_79 = mul_17 = None\0A mul_18 = 
torch.ops.aten.mul(add_18, 0.7978845608028654); add_18 = None\0A tanh_4 = torch.ops.aten.tanh(mul_18); mul_18 = None\0A detach_9 = torch.ops.aten.detach(tanh_4)\0A add_19 = torch.ops.aten.add(tanh_4, 1.0); tanh_4 = None\0A mul_19 = torch.ops.aten.mul(mul_16, add_19); mul_16 = add_19 = None\0A view_80 = torch.ops.aten.view(mul_19, [-1, 3072]); mul_19 = None\0A _param_constant60 = self._param_constant60\0A _param_constant61 = self._param_constant61\0A addmm_19 = torch.ops.aten.addmm(_param_constant60, view_80, _param_constant61); _param_constant60 = view_80 = _param_constant61 = None\0A view_81 = torch.ops.aten.view(addmm_19, [1, 5, 768]); addmm_19 = None\0A add_20 = torch.ops.aten.add(add_17, view_81); add_17 = view_81 = None\0A _param_constant62 = self._param_constant62\0A _param_constant63 = self._param_constant63\0A native_layer_norm_10 = torch.ops.aten.native_layer_norm(add_20, [768], _param_constant62, _param_constant63, 1e-05); _param_constant62 = _param_constant63 = None\0A getitem_30 = native_layer_norm_10[0]\0A getitem_31 = native_layer_norm_10[1]\0A getitem_32 = native_layer_norm_10[2]; native_layer_norm_10 = None\0A view_82 = torch.ops.aten.view(getitem_30, [-1, 768]); getitem_30 = None\0A _param_constant64 = self._param_constant64\0A _param_constant65 = self._param_constant65\0A addmm_20 = torch.ops.aten.addmm(_param_constant64, view_82, _param_constant65); _param_constant64 = view_82 = _param_constant65 = None\0A view_83 = torch.ops.aten.view(addmm_20, [1, 5, 2304]); addmm_20 = None\0A slice_36 = torch.ops.aten.slice(view_83, 2, 0, 768)\0A slice_37 = torch.ops.aten.slice(view_83, 2, 768, 1536)\0A slice_38 = torch.ops.aten.slice(view_83, 2, 1536, 2304); view_83 = None\0A view_84 = torch.ops.aten.view(slice_36, [1, 5, 12, 64]); slice_36 = None\0A permute_20 = torch.ops.aten.permute(view_84, [0, 2, 1, 3]); view_84 = None\0A view_85 = torch.ops.aten.view(slice_37, [1, 5, 12, 64]); slice_37 = None\0A permute_21 = torch.ops.aten.permute(view_85, [0, 2, 1, 
3]); view_85 = None\0A view_86 = torch.ops.aten.view(slice_38, [1, 5, 12, 64]); slice_38 = None\0A permute_22 = torch.ops.aten.permute(view_86, [0, 2, 1, 3]); view_86 = None\0A transpose_5 = torch.ops.aten.transpose(permute_21, -1, -2); permute_21 = None\0A expand_20 = torch.ops.aten.expand(permute_20, [1, 12, 5, 64]); permute_20 = None\0A view_87 = torch.ops.aten.view(expand_20, [12, 5, 64]); expand_20 = None\0A expand_21 = torch.ops.aten.expand(transpose_5, [1, 12, 64, 5]); transpose_5 = None\0A view_88 = torch.ops.aten.view(expand_21, [12, 64, 5]); expand_21 = None\0A bmm_10 = torch.ops.aten.bmm(view_87, view_88); view_87 = view_88 = None\0A _unsafe_view_10 = torch.ops.aten._unsafe_view(bmm_10, [1, 12, 5, 5]); bmm_10 = None\0A _tensor_constant15 = self._tensor_constant15\0A lift_fresh_copy_10 = torch.ops.aten.lift_fresh_copy(_tensor_constant15); _tensor_constant15 = None\0A div_5 = torch.ops.aten.div(_unsafe_view_10, lift_fresh_copy_10); _unsafe_view_10 = lift_fresh_copy_10 = None\0A _tensor_constant16 = self._tensor_constant16\0A slice_39 = torch.ops.aten.slice(_tensor_constant16, 0, 0, 9223372036854775807); _tensor_constant16 = None\0A slice_40 = torch.ops.aten.slice(slice_39, 1, 0, 9223372036854775807); slice_39 = None\0A slice_41 = torch.ops.aten.slice(slice_40, 2, 0, 5); slice_40 = None\0A slice_42 = torch.ops.aten.slice(slice_41, 3, 0, 5); slice_41 = None\0A _to_copy_5 = torch.ops.aten._to_copy(slice_42, dtype = torch.bool); slice_42 = None\0A _tensor_constant17 = self._tensor_constant17\0A lift_fresh_copy_11 = torch.ops.aten.lift_fresh_copy(_tensor_constant17); _tensor_constant17 = None\0A where_5 = torch.ops.aten.where(_to_copy_5, div_5, lift_fresh_copy_11); _to_copy_5 = div_5 = lift_fresh_copy_11 = None\0A _softmax_5 = torch.ops.aten._softmax(where_5, -1, False); where_5 = None\0A detach_10 = torch.ops.aten.detach(_softmax_5)\0A expand_22 = torch.ops.aten.expand(_softmax_5, [1, 12, 5, 5]); _softmax_5 = None\0A view_89 = torch.ops.aten.view(expand_22, 
[12, 5, 5]); expand_22 = None\0A expand_23 = torch.ops.aten.expand(permute_22, [1, 12, 5, 64]); permute_22 = None\0A view_90 = torch.ops.aten.view(expand_23, [12, 5, 64]); expand_23 = None\0A bmm_11 = torch.ops.aten.bmm(view_89, view_90); view_89 = view_90 = None\0A _unsafe_view_11 = torch.ops.aten._unsafe_view(bmm_11, [1, 12, 5, 64]); bmm_11 = None\0A permute_23 = torch.ops.aten.permute(_unsafe_view_11, [0, 2, 1, 3]); _unsafe_view_11 = None\0A clone_5 = torch.ops.aten.clone(permute_23, memory_format = torch.contiguous_format); permute_23 = None\0A view_91 = torch.ops.aten.view(clone_5, [1, 5, 768]); clone_5 = None\0A view_92 = torch.ops.aten.view(view_91, [-1, 768]); view_91 = None\0A _param_constant66 = self._param_constant66\0A _param_constant67 = self._param_constant67\0A addmm_21 = torch.ops.aten.addmm(_param_constant66, view_92, _param_constant67); _param_constant66 = view_92 = _param_constant67 = None\0A view_93 = torch.ops.aten.view(addmm_21, [1, 5, 768]); addmm_21 = None\0A add_21 = torch.ops.aten.add(view_93, add_20); view_93 = add_20 = None\0A _param_constant68 = self._param_constant68\0A _param_constant69 = self._param_constant69\0A native_layer_norm_11 = torch.ops.aten.native_layer_norm(add_21, [768], _param_constant68, _param_constant69, 1e-05); _param_constant68 = _param_constant69 = None\0A getitem_33 = native_layer_norm_11[0]\0A getitem_34 = native_layer_norm_11[1]\0A getitem_35 = native_layer_norm_11[2]; native_layer_norm_11 = None\0A view_94 = torch.ops.aten.view(getitem_33, [-1, 768]); getitem_33 = None\0A _param_constant70 = self._param_constant70\0A _param_constant71 = self._param_constant71\0A addmm_22 = torch.ops.aten.addmm(_param_constant70, view_94, _param_constant71); _param_constant70 = view_94 = _param_constant71 = None\0A view_95 = torch.ops.aten.view(addmm_22, [1, 5, 3072]); addmm_22 = None\0A mul_20 = torch.ops.aten.mul(view_95, 0.5)\0A pow_6 = torch.ops.aten.pow(view_95, 3.0)\0A mul_21 = torch.ops.aten.mul(pow_6, 0.044715); pow_6 = 
None\0A add_22 = torch.ops.aten.add(view_95, mul_21); view_95 = mul_21 = None\0A mul_22 = torch.ops.aten.mul(add_22, 0.7978845608028654); add_22 = None\0A tanh_5 = torch.ops.aten.tanh(mul_22); mul_22 = None\0A detach_11 = torch.ops.aten.detach(tanh_5)\0A add_23 = torch.ops.aten.add(tanh_5, 1.0); tanh_5 = None\0A mul_23 = torch.ops.aten.mul(mul_20, add_23); mul_20 = add_23 = None\0A view_96 = torch.ops.aten.view(mul_23, [-1, 3072]); mul_23 = None\0A _param_constant72 = self._param_constant72\0A _param_constant73 = self._param_constant73\0A addmm_23 = torch.ops.aten.addmm(_param_constant72, view_96, _param_constant73); _param_constant72 = view_96 = _param_constant73 = None\0A view_97 = torch.ops.aten.view(addmm_23, [1, 5, 768]); addmm_23 = None\0A add_24 = torch.ops.aten.add(add_21, view_97); add_21 = view_97 = None\0A _param_constant74 = self._param_constant74\0A _param_constant75 = self._param_constant75\0A native_layer_norm_12 = torch.ops.aten.native_layer_norm(add_24, [768], _param_constant74, _param_constant75, 1e-05); _param_constant74 = _param_constant75 = None\0A getitem_36 = native_layer_norm_12[0]\0A getitem_37 = native_layer_norm_12[1]\0A getitem_38 = native_layer_norm_12[2]; native_layer_norm_12 = None\0A view_98 = torch.ops.aten.view(getitem_36, [-1, 768]); getitem_36 = None\0A _param_constant76 = self._param_constant76\0A _param_constant77 = self._param_constant77\0A addmm_24 = torch.ops.aten.addmm(_param_constant76, view_98, _param_constant77); _param_constant76 = view_98 = _param_constant77 = None\0A view_99 = torch.ops.aten.view(addmm_24, [1, 5, 2304]); addmm_24 = None\0A slice_43 = torch.ops.aten.slice(view_99, 2, 0, 768)\0A slice_44 = torch.ops.aten.slice(view_99, 2, 768, 1536)\0A slice_45 = torch.ops.aten.slice(view_99, 2, 1536, 2304); view_99 = None\0A view_100 = torch.ops.aten.view(slice_43, [1, 5, 12, 64]); slice_43 = None\0A permute_24 = torch.ops.aten.permute(view_100, [0, 2, 1, 3]); view_100 = None\0A view_101 = torch.ops.aten.view(slice_44, 
[1, 5, 12, 64]); slice_44 = None\0A permute_25 = torch.ops.aten.permute(view_101, [0, 2, 1, 3]); view_101 = None\0A view_102 = torch.ops.aten.view(slice_45, [1, 5, 12, 64]); slice_45 = None\0A permute_26 = torch.ops.aten.permute(view_102, [0, 2, 1, 3]); view_102 = None\0A transpose_6 = torch.ops.aten.transpose(permute_25, -1, -2); permute_25 = None\0A expand_24 = torch.ops.aten.expand(permute_24, [1, 12, 5, 64]); permute_24 = None\0A view_103 = torch.ops.aten.view(expand_24, [12, 5, 64]); expand_24 = None\0A expand_25 = torch.ops.aten.expand(transpose_6, [1, 12, 64, 5]); transpose_6 = None\0A view_104 = torch.ops.aten.view(expand_25, [12, 64, 5]); expand_25 = None\0A bmm_12 = torch.ops.aten.bmm(view_103, view_104); view_103 = view_104 = None\0A _unsafe_view_12 = torch.ops.aten._unsafe_view(bmm_12, [1, 12, 5, 5]); bmm_12 = None\0A _tensor_constant18 = self._tensor_constant18\0A lift_fresh_copy_12 = torch.ops.aten.lift_fresh_copy(_tensor_constant18); _tensor_constant18 = None\0A div_6 = torch.ops.aten.div(_unsafe_view_12, lift_fresh_copy_12); _unsafe_view_12 = lift_fresh_copy_12 = None\0A _tensor_constant19 = self._tensor_constant19\0A slice_46 = torch.ops.aten.slice(_tensor_constant19, 0, 0, 9223372036854775807); _tensor_constant19 = None\0A slice_47 = torch.ops.aten.slice(slice_46, 1, 0, 9223372036854775807); slice_46 = None\0A slice_48 = torch.ops.aten.slice(slice_47, 2, 0, 5); slice_47 = None\0A slice_49 = torch.ops.aten.slice(slice_48, 3, 0, 5); slice_48 = None\0A _to_copy_6 = torch.ops.aten._to_copy(slice_49, dtype = torch.bool); slice_49 = None\0A _tensor_constant20 = self._tensor_constant20\0A lift_fresh_copy_13 = torch.ops.aten.lift_fresh_copy(_tensor_constant20); _tensor_constant20 = None\0A where_6 = torch.ops.aten.where(_to_copy_6, div_6, lift_fresh_copy_13); _to_copy_6 = div_6 = lift_fresh_copy_13 = None\0A _softmax_6 = torch.ops.aten._softmax(where_6, -1, False); where_6 = None\0A detach_12 = torch.ops.aten.detach(_softmax_6)\0A expand_26 = 
torch.ops.aten.expand(_softmax_6, [1, 12, 5, 5]); _softmax_6 = None\0A view_105 = torch.ops.aten.view(expand_26, [12, 5, 5]); expand_26 = None\0A expand_27 = torch.ops.aten.expand(permute_26, [1, 12, 5, 64]); permute_26 = None\0A view_106 = torch.ops.aten.view(expand_27, [12, 5, 64]); expand_27 = None\0A bmm_13 = torch.ops.aten.bmm(view_105, view_106); view_105 = view_106 = None\0A _unsafe_view_13 = torch.ops.aten._unsafe_view(bmm_13, [1, 12, 5, 64]); bmm_13 = None\0A permute_27 = torch.ops.aten.permute(_unsafe_view_13, [0, 2, 1, 3]); _unsafe_view_13 = None\0A clone_6 = torch.ops.aten.clone(permute_27, memory_format = torch.contiguous_format); permute_27 = None\0A view_107 = torch.ops.aten.view(clone_6, [1, 5, 768]); clone_6 = None\0A view_108 = torch.ops.aten.view(view_107, [-1, 768]); view_107 = None\0A _param_constant78 = self._param_constant78\0A _param_constant79 = self._param_constant79\0A addmm_25 = torch.ops.aten.addmm(_param_constant78, view_108, _param_constant79); _param_constant78 = view_108 = _param_constant79 = None\0A view_109 = torch.ops.aten.view(addmm_25, [1, 5, 768]); addmm_25 = None\0A add_25 = torch.ops.aten.add(view_109, add_24); view_109 = add_24 = None\0A _param_constant80 = self._param_constant80\0A _param_constant81 = self._param_constant81\0A native_layer_norm_13 = torch.ops.aten.native_layer_norm(add_25, [768], _param_constant80, _param_constant81, 1e-05); _param_constant80 = _param_constant81 = None\0A getitem_39 = native_layer_norm_13[0]\0A getitem_40 = native_layer_norm_13[1]\0A getitem_41 = native_layer_norm_13[2]; native_layer_norm_13 = None\0A view_110 = torch.ops.aten.view(getitem_39, [-1, 768]); getitem_39 = None\0A _param_constant82 = self._param_constant82\0A _param_constant83 = self._param_constant83\0A addmm_26 = torch.ops.aten.addmm(_param_constant82, view_110, _param_constant83); _param_constant82 = view_110 = _param_constant83 = None\0A view_111 = torch.ops.aten.view(addmm_26, [1, 5, 3072]); addmm_26 = None\0A mul_24 = 
torch.ops.aten.mul(view_111, 0.5)\0A pow_7 = torch.ops.aten.pow(view_111, 3.0)\0A mul_25 = torch.ops.aten.mul(pow_7, 0.044715); pow_7 = None\0A add_26 = torch.ops.aten.add(view_111, mul_25); view_111 = mul_25 = None\0A mul_26 = torch.ops.aten.mul(add_26, 0.7978845608028654); add_26 = None\0A tanh_6 = torch.ops.aten.tanh(mul_26); mul_26 = None\0A detach_13 = torch.ops.aten.detach(tanh_6)\0A add_27 = torch.ops.aten.add(tanh_6, 1.0); tanh_6 = None\0A mul_27 = torch.ops.aten.mul(mul_24, add_27); mul_24 = add_27 = None\0A view_112 = torch.ops.aten.view(mul_27, [-1, 3072]); mul_27 = None\0A _param_constant84 = self._param_constant84\0A _param_constant85 = self._param_constant85\0A addmm_27 = torch.ops.aten.addmm(_param_constant84, view_112, _param_constant85); _param_constant84 = view_112 = _param_constant85 = None\0A view_113 = torch.ops.aten.view(addmm_27, [1, 5, 768]); addmm_27 = None\0A add_28 = torch.ops.aten.add(add_25, view_113); add_25 = view_113 = None\0A _param_constant86 = self._param_constant86\0A _param_constant87 = self._param_constant87\0A native_layer_norm_14 = torch.ops.aten.native_layer_norm(add_28, [768], _param_constant86, _param_constant87, 1e-05); _param_constant86 = _param_constant87 = None\0A getitem_42 = native_layer_norm_14[0]\0A getitem_43 = native_layer_norm_14[1]\0A getitem_44 = native_layer_norm_14[2]; native_layer_norm_14 = None\0A view_114 = torch.ops.aten.view(getitem_42, [-1, 768]); getitem_42 = None\0A _param_constant88 = self._param_constant88\0A _param_constant89 = self._param_constant89\0A addmm_28 = torch.ops.aten.addmm(_param_constant88, view_114, _param_constant89); _param_constant88 = view_114 = _param_constant89 = None\0A view_115 = torch.ops.aten.view(addmm_28, [1, 5, 2304]); addmm_28 = None\0A slice_50 = torch.ops.aten.slice(view_115, 2, 0, 768)\0A slice_51 = torch.ops.aten.slice(view_115, 2, 768, 1536)\0A slice_52 = torch.ops.aten.slice(view_115, 2, 1536, 2304); view_115 = None\0A view_116 = torch.ops.aten.view(slice_50, [1, 
5, 12, 64]); slice_50 = None\0A permute_28 = torch.ops.aten.permute(view_116, [0, 2, 1, 3]); view_116 = None\0A view_117 = torch.ops.aten.view(slice_51, [1, 5, 12, 64]); slice_51 = None\0A permute_29 = torch.ops.aten.permute(view_117, [0, 2, 1, 3]); view_117 = None\0A view_118 = torch.ops.aten.view(slice_52, [1, 5, 12, 64]); slice_52 = None\0A permute_30 = torch.ops.aten.permute(view_118, [0, 2, 1, 3]); view_118 = None\0A transpose_7 = torch.ops.aten.transpose(permute_29, -1, -2); permute_29 = None\0A expand_28 = torch.ops.aten.expand(permute_28, [1, 12, 5, 64]); permute_28 = None\0A view_119 = torch.ops.aten.view(expand_28, [12, 5, 64]); expand_28 = None\0A expand_29 = torch.ops.aten.expand(transpose_7, [1, 12, 64, 5]); transpose_7 = None\0A view_120 = torch.ops.aten.view(expand_29, [12, 64, 5]); expand_29 = None\0A bmm_14 = torch.ops.aten.bmm(view_119, view_120); view_119 = view_120 = None\0A _unsafe_view_14 = torch.ops.aten._unsafe_view(bmm_14, [1, 12, 5, 5]); bmm_14 = None\0A _tensor_constant21 = self._tensor_constant21\0A lift_fresh_copy_14 = torch.ops.aten.lift_fresh_copy(_tensor_constant21); _tensor_constant21 = None\0A div_7 = torch.ops.aten.div(_unsafe_view_14, lift_fresh_copy_14); _unsafe_view_14 = lift_fresh_copy_14 = None\0A _tensor_constant22 = self._tensor_constant22\0A slice_53 = torch.ops.aten.slice(_tensor_constant22, 0, 0, 9223372036854775807); _tensor_constant22 = None\0A slice_54 = torch.ops.aten.slice(slice_53, 1, 0, 9223372036854775807); slice_53 = None\0A slice_55 = torch.ops.aten.slice(slice_54, 2, 0, 5); slice_54 = None\0A slice_56 = torch.ops.aten.slice(slice_55, 3, 0, 5); slice_55 = None\0A _to_copy_7 = torch.ops.aten._to_copy(slice_56, dtype = torch.bool); slice_56 = None\0A _tensor_constant23 = self._tensor_constant23\0A lift_fresh_copy_15 = torch.ops.aten.lift_fresh_copy(_tensor_constant23); _tensor_constant23 = None\0A where_7 = torch.ops.aten.where(_to_copy_7, div_7, lift_fresh_copy_15); _to_copy_7 = div_7 = lift_fresh_copy_15 = 
None\0A _softmax_7 = torch.ops.aten._softmax(where_7, -1, False); where_7 = None\0A detach_14 = torch.ops.aten.detach(_softmax_7)\0A expand_30 = torch.ops.aten.expand(_softmax_7, [1, 12, 5, 5]); _softmax_7 = None\0A view_121 = torch.ops.aten.view(expand_30, [12, 5, 5]); expand_30 = None\0A expand_31 = torch.ops.aten.expand(permute_30, [1, 12, 5, 64]); permute_30 = None\0A view_122 = torch.ops.aten.view(expand_31, [12, 5, 64]); expand_31 = None\0A bmm_15 = torch.ops.aten.bmm(view_121, view_122); view_121 = view_122 = None\0A _unsafe_view_15 = torch.ops.aten._unsafe_view(bmm_15, [1, 12, 5, 64]); bmm_15 = None\0A permute_31 = torch.ops.aten.permute(_unsafe_view_15, [0, 2, 1, 3]); _unsafe_view_15 = None\0A clone_7 = torch.ops.aten.clone(permute_31, memory_format = torch.contiguous_format); permute_31 = None\0A view_123 = torch.ops.aten.view(clone_7, [1, 5, 768]); clone_7 = None\0A view_124 = torch.ops.aten.view(view_123, [-1, 768]); view_123 = None\0A _param_constant90 = self._param_constant90\0A _param_constant91 = self._param_constant91\0A addmm_29 = torch.ops.aten.addmm(_param_constant90, view_124, _param_constant91); _param_constant90 = view_124 = _param_constant91 = None\0A view_125 = torch.ops.aten.view(addmm_29, [1, 5, 768]); addmm_29 = None\0A add_29 = torch.ops.aten.add(view_125, add_28); view_125 = add_28 = None\0A _param_constant92 = self._param_constant92\0A _param_constant93 = self._param_constant93\0A native_layer_norm_15 = torch.ops.aten.native_layer_norm(add_29, [768], _param_constant92, _param_constant93, 1e-05); _param_constant92 = _param_constant93 = None\0A getitem_45 = native_layer_norm_15[0]\0A getitem_46 = native_layer_norm_15[1]\0A getitem_47 = native_layer_norm_15[2]; native_layer_norm_15 = None\0A view_126 = torch.ops.aten.view(getitem_45, [-1, 768]); getitem_45 = None\0A _param_constant94 = self._param_constant94\0A _param_constant95 = self._param_constant95\0A addmm_30 = torch.ops.aten.addmm(_param_constant94, view_126, _param_constant95); 
_param_constant94 = view_126 = _param_constant95 = None\0A view_127 = torch.ops.aten.view(addmm_30, [1, 5, 3072]); addmm_30 = None\0A mul_28 = torch.ops.aten.mul(view_127, 0.5)\0A pow_8 = torch.ops.aten.pow(view_127, 3.0)\0A mul_29 = torch.ops.aten.mul(pow_8, 0.044715); pow_8 = None\0A add_30 = torch.ops.aten.add(view_127, mul_29); view_127 = mul_29 = None\0A mul_30 = torch.ops.aten.mul(add_30, 0.7978845608028654); add_30 = None\0A tanh_7 = torch.ops.aten.tanh(mul_30); mul_30 = None\0A detach_15 = torch.ops.aten.detach(tanh_7)\0A add_31 = torch.ops.aten.add(tanh_7, 1.0); tanh_7 = None\0A mul_31 = torch.ops.aten.mul(mul_28, add_31); mul_28 = add_31 = None\0A view_128 = torch.ops.aten.view(mul_31, [-1, 3072]); mul_31 = None\0A _param_constant96 = self._param_constant96\0A _param_constant97 = self._param_constant97\0A addmm_31 = torch.ops.aten.addmm(_param_constant96, view_128, _param_constant97); _param_constant96 = view_128 = _param_constant97 = None\0A view_129 = torch.ops.aten.view(addmm_31, [1, 5, 768]); addmm_31 = None\0A add_32 = torch.ops.aten.add(add_29, view_129); add_29 = view_129 = None\0A _param_constant98 = self._param_constant98\0A _param_constant99 = self._param_constant99\0A native_layer_norm_16 = torch.ops.aten.native_layer_norm(add_32, [768], _param_constant98, _param_constant99, 1e-05); _param_constant98 = _param_constant99 = None\0A getitem_48 = native_layer_norm_16[0]\0A getitem_49 = native_layer_norm_16[1]\0A getitem_50 = native_layer_norm_16[2]; native_layer_norm_16 = None\0A view_130 = torch.ops.aten.view(getitem_48, [-1, 768]); getitem_48 = None\0A _param_constant100 = self._param_constant100\0A _param_constant101 = self._param_constant101\0A addmm_32 = torch.ops.aten.addmm(_param_constant100, view_130, _param_constant101); _param_constant100 = view_130 = _param_constant101 = None\0A view_131 = torch.ops.aten.view(addmm_32, [1, 5, 2304]); addmm_32 = None\0A slice_57 = torch.ops.aten.slice(view_131, 2, 0, 768)\0A slice_58 = 
torch.ops.aten.slice(view_131, 2, 768, 1536)\0A slice_59 = torch.ops.aten.slice(view_131, 2, 1536, 2304); view_131 = None\0A view_132 = torch.ops.aten.view(slice_57, [1, 5, 12, 64]); slice_57 = None\0A permute_32 = torch.ops.aten.permute(view_132, [0, 2, 1, 3]); view_132 = None\0A view_133 = torch.ops.aten.view(slice_58, [1, 5, 12, 64]); slice_58 = None\0A permute_33 = torch.ops.aten.permute(view_133, [0, 2, 1, 3]); view_133 = None\0A view_134 = torch.ops.aten.view(slice_59, [1, 5, 12, 64]); slice_59 = None\0A permute_34 = torch.ops.aten.permute(view_134, [0, 2, 1, 3]); view_134 = None\0A transpose_8 = torch.ops.aten.transpose(permute_33, -1, -2); permute_33 = None\0A expand_32 = torch.ops.aten.expand(permute_32, [1, 12, 5, 64]); permute_32 = None\0A view_135 = torch.ops.aten.view(expand_32, [12, 5, 64]); expand_32 = None\0A expand_33 = torch.ops.aten.expand(transpose_8, [1, 12, 64, 5]); transpose_8 = None\0A view_136 = torch.ops.aten.view(expand_33, [12, 64, 5]); expand_33 = None\0A bmm_16 = torch.ops.aten.bmm(view_135, view_136); view_135 = view_136 = None\0A _unsafe_view_16 = torch.ops.aten._unsafe_view(bmm_16, [1, 12, 5, 5]); bmm_16 = None\0A _tensor_constant24 = self._tensor_constant24\0A lift_fresh_copy_16 = torch.ops.aten.lift_fresh_copy(_tensor_constant24); _tensor_constant24 = None\0A div_8 = torch.ops.aten.div(_unsafe_view_16, lift_fresh_copy_16); _unsafe_view_16 = lift_fresh_copy_16 = None\0A _tensor_constant25 = self._tensor_constant25\0A slice_60 = torch.ops.aten.slice(_tensor_constant25, 0, 0, 9223372036854775807); _tensor_constant25 = None\0A slice_61 = torch.ops.aten.slice(slice_60, 1, 0, 9223372036854775807); slice_60 = None\0A slice_62 = torch.ops.aten.slice(slice_61, 2, 0, 5); slice_61 = None\0A slice_63 = torch.ops.aten.slice(slice_62, 3, 0, 5); slice_62 = None\0A _to_copy_8 = torch.ops.aten._to_copy(slice_63, dtype = torch.bool); slice_63 = None\0A _tensor_constant26 = self._tensor_constant26\0A lift_fresh_copy_17 = 
torch.ops.aten.lift_fresh_copy(_tensor_constant26); _tensor_constant26 = None\0A where_8 = torch.ops.aten.where(_to_copy_8, div_8, lift_fresh_copy_17); _to_copy_8 = div_8 = lift_fresh_copy_17 = None\0A _softmax_8 = torch.ops.aten._softmax(where_8, -1, False); where_8 = None\0A detach_16 = torch.ops.aten.detach(_softmax_8)\0A expand_34 = torch.ops.aten.expand(_softmax_8, [1, 12, 5, 5]); _softmax_8 = None\0A view_137 = torch.ops.aten.view(expand_34, [12, 5, 5]); expand_34 = None\0A expand_35 = torch.ops.aten.expand(permute_34, [1, 12, 5, 64]); permute_34 = None\0A view_138 = torch.ops.aten.view(expand_35, [12, 5, 64]); expand_35 = None\0A bmm_17 = torch.ops.aten.bmm(view_137, view_138); view_137 = view_138 = None\0A _unsafe_view_17 = torch.ops.aten._unsafe_view(bmm_17, [1, 12, 5, 64]); bmm_17 = None\0A permute_35 = torch.ops.aten.permute(_unsafe_view_17, [0, 2, 1, 3]); _unsafe_view_17 = None\0A clone_8 = torch.ops.aten.clone(permute_35, memory_format = torch.contiguous_format); permute_35 = None\0A view_139 = torch.ops.aten.view(clone_8, [1, 5, 768]); clone_8 = None\0A view_140 = torch.ops.aten.view(view_139, [-1, 768]); view_139 = None\0A _param_constant102 = self._param_constant102\0A _param_constant103 = self._param_constant103\0A addmm_33 = torch.ops.aten.addmm(_param_constant102, view_140, _param_constant103); _param_constant102 = view_140 = _param_constant103 = None\0A view_141 = torch.ops.aten.view(addmm_33, [1, 5, 768]); addmm_33 = None\0A add_33 = torch.ops.aten.add(view_141, add_32); view_141 = add_32 = None\0A _param_constant104 = self._param_constant104\0A _param_constant105 = self._param_constant105\0A native_layer_norm_17 = torch.ops.aten.native_layer_norm(add_33, [768], _param_constant104, _param_constant105, 1e-05); _param_constant104 = _param_constant105 = None\0A getitem_51 = native_layer_norm_17[0]\0A getitem_52 = native_layer_norm_17[1]\0A getitem_53 = native_layer_norm_17[2]; native_layer_norm_17 = None\0A view_142 = 
torch.ops.aten.view(getitem_51, [-1, 768]); getitem_51 = None\0A _param_constant106 = self._param_constant106\0A _param_constant107 = self._param_constant107\0A addmm_34 = torch.ops.aten.addmm(_param_constant106, view_142, _param_constant107); _param_constant106 = view_142 = _param_constant107 = None\0A view_143 = torch.ops.aten.view(addmm_34, [1, 5, 3072]); addmm_34 = None\0A mul_32 = torch.ops.aten.mul(view_143, 0.5)\0A pow_9 = torch.ops.aten.pow(view_143, 3.0)\0A mul_33 = torch.ops.aten.mul(pow_9, 0.044715); pow_9 = None\0A add_34 = torch.ops.aten.add(view_143, mul_33); view_143 = mul_33 = None\0A mul_34 = torch.ops.aten.mul(add_34, 0.7978845608028654); add_34 = None\0A tanh_8 = torch.ops.aten.tanh(mul_34); mul_34 = None\0A detach_17 = torch.ops.aten.detach(tanh_8)\0A add_35 = torch.ops.aten.add(tanh_8, 1.0); tanh_8 = None\0A mul_35 = torch.ops.aten.mul(mul_32, add_35); mul_32 = add_35 = None\0A view_144 = torch.ops.aten.view(mul_35, [-1, 3072]); mul_35 = None\0A _param_constant108 = self._param_constant108\0A _param_constant109 = self._param_constant109\0A addmm_35 = torch.ops.aten.addmm(_param_constant108, view_144, _param_constant109); _param_constant108 = view_144 = _param_constant109 = None\0A view_145 = torch.ops.aten.view(addmm_35, [1, 5, 768]); addmm_35 = None\0A add_36 = torch.ops.aten.add(add_33, view_145); add_33 = view_145 = None\0A _param_constant110 = self._param_constant110\0A _param_constant111 = self._param_constant111\0A native_layer_norm_18 = torch.ops.aten.native_layer_norm(add_36, [768], _param_constant110, _param_constant111, 1e-05); _param_constant110 = _param_constant111 = None\0A getitem_54 = native_layer_norm_18[0]\0A getitem_55 = native_layer_norm_18[1]\0A getitem_56 = native_layer_norm_18[2]; native_layer_norm_18 = None\0A view_146 = torch.ops.aten.view(getitem_54, [-1, 768]); getitem_54 = None\0A _param_constant112 = self._param_constant112\0A _param_constant113 = self._param_constant113\0A addmm_36 = 
torch.ops.aten.addmm(_param_constant112, view_146, _param_constant113); _param_constant112 = view_146 = _param_constant113 = None\0A view_147 = torch.ops.aten.view(addmm_36, [1, 5, 2304]); addmm_36 = None\0A slice_64 = torch.ops.aten.slice(view_147, 2, 0, 768)\0A slice_65 = torch.ops.aten.slice(view_147, 2, 768, 1536)\0A slice_66 = torch.ops.aten.slice(view_147, 2, 1536, 2304); view_147 = None\0A view_148 = torch.ops.aten.view(slice_64, [1, 5, 12, 64]); slice_64 = None\0A permute_36 = torch.ops.aten.permute(view_148, [0, 2, 1, 3]); view_148 = None\0A view_149 = torch.ops.aten.view(slice_65, [1, 5, 12, 64]); slice_65 = None\0A permute_37 = torch.ops.aten.permute(view_149, [0, 2, 1, 3]); view_149 = None\0A view_150 = torch.ops.aten.view(slice_66, [1, 5, 12, 64]); slice_66 = None\0A permute_38 = torch.ops.aten.permute(view_150, [0, 2, 1, 3]); view_150 = None\0A transpose_9 = torch.ops.aten.transpose(permute_37, -1, -2); permute_37 = None\0A expand_36 = torch.ops.aten.expand(permute_36, [1, 12, 5, 64]); permute_36 = None\0A view_151 = torch.ops.aten.view(expand_36, [12, 5, 64]); expand_36 = None\0A expand_37 = torch.ops.aten.expand(transpose_9, [1, 12, 64, 5]); transpose_9 = None\0A view_152 = torch.ops.aten.view(expand_37, [12, 64, 5]); expand_37 = None\0A bmm_18 = torch.ops.aten.bmm(view_151, view_152); view_151 = view_152 = None\0A _unsafe_view_18 = torch.ops.aten._unsafe_view(bmm_18, [1, 12, 5, 5]); bmm_18 = None\0A _tensor_constant27 = self._tensor_constant27\0A lift_fresh_copy_18 = torch.ops.aten.lift_fresh_copy(_tensor_constant27); _tensor_constant27 = None\0A div_9 = torch.ops.aten.div(_unsafe_view_18, lift_fresh_copy_18); _unsafe_view_18 = lift_fresh_copy_18 = None\0A _tensor_constant28 = self._tensor_constant28\0A slice_67 = torch.ops.aten.slice(_tensor_constant28, 0, 0, 9223372036854775807); _tensor_constant28 = None\0A slice_68 = torch.ops.aten.slice(slice_67, 1, 0, 9223372036854775807); slice_67 = None\0A slice_69 = torch.ops.aten.slice(slice_68, 2, 0, 5); 
slice_68 = None\0A slice_70 = torch.ops.aten.slice(slice_69, 3, 0, 5); slice_69 = None\0A _to_copy_9 = torch.ops.aten._to_copy(slice_70, dtype = torch.bool); slice_70 = None\0A _tensor_constant29 = self._tensor_constant29\0A lift_fresh_copy_19 = torch.ops.aten.lift_fresh_copy(_tensor_constant29); _tensor_constant29 = None\0A where_9 = torch.ops.aten.where(_to_copy_9, div_9, lift_fresh_copy_19); _to_copy_9 = div_9 = lift_fresh_copy_19 = None\0A _softmax_9 = torch.ops.aten._softmax(where_9, -1, False); where_9 = None\0A detach_18 = torch.ops.aten.detach(_softmax_9)\0A expand_38 = torch.ops.aten.expand(_softmax_9, [1, 12, 5, 5]); _softmax_9 = None\0A view_153 = torch.ops.aten.view(expand_38, [12, 5, 5]); expand_38 = None\0A expand_39 = torch.ops.aten.expand(permute_38, [1, 12, 5, 64]); permute_38 = None\0A view_154 = torch.ops.aten.view(expand_39, [12, 5, 64]); expand_39 = None\0A bmm_19 = torch.ops.aten.bmm(view_153, view_154); view_153 = view_154 = None\0A _unsafe_view_19 = torch.ops.aten._unsafe_view(bmm_19, [1, 12, 5, 64]); bmm_19 = None\0A permute_39 = torch.ops.aten.permute(_unsafe_view_19, [0, 2, 1, 3]); _unsafe_view_19 = None\0A clone_9 = torch.ops.aten.clone(permute_39, memory_format = torch.contiguous_format); permute_39 = None\0A view_155 = torch.ops.aten.view(clone_9, [1, 5, 768]); clone_9 = None\0A view_156 = torch.ops.aten.view(view_155, [-1, 768]); view_155 = None\0A _param_constant114 = self._param_constant114\0A _param_constant115 = self._param_constant115\0A addmm_37 = torch.ops.aten.addmm(_param_constant114, view_156, _param_constant115); _param_constant114 = view_156 = _param_constant115 = None\0A view_157 = torch.ops.aten.view(addmm_37, [1, 5, 768]); addmm_37 = None\0A add_37 = torch.ops.aten.add(view_157, add_36); view_157 = add_36 = None\0A _param_constant116 = self._param_constant116\0A _param_constant117 = self._param_constant117\0A native_layer_norm_19 = torch.ops.aten.native_layer_norm(add_37, [768], _param_constant116, _param_constant117, 
1e-05); _param_constant116 = _param_constant117 = None\0A getitem_57 = native_layer_norm_19[0]\0A getitem_58 = native_layer_norm_19[1]\0A getitem_59 = native_layer_norm_19[2]; native_layer_norm_19 = None\0A view_158 = torch.ops.aten.view(getitem_57, [-1, 768]); getitem_57 = None\0A _param_constant118 = self._param_constant118\0A _param_constant119 = self._param_constant119\0A addmm_38 = torch.ops.aten.addmm(_param_constant118, view_158, _param_constant119); _param_constant118 = view_158 = _param_constant119 = None\0A view_159 = torch.ops.aten.view(addmm_38, [1, 5, 3072]); addmm_38 = None\0A mul_36 = torch.ops.aten.mul(view_159, 0.5)\0A pow_10 = torch.ops.aten.pow(view_159, 3.0)\0A mul_37 = torch.ops.aten.mul(pow_10, 0.044715); pow_10 = None\0A add_38 = torch.ops.aten.add(view_159, mul_37); view_159 = mul_37 = None\0A mul_38 = torch.ops.aten.mul(add_38, 0.7978845608028654); add_38 = None\0A tanh_9 = torch.ops.aten.tanh(mul_38); mul_38 = None\0A detach_19 = torch.ops.aten.detach(tanh_9)\0A add_39 = torch.ops.aten.add(tanh_9, 1.0); tanh_9 = None\0A mul_39 = torch.ops.aten.mul(mul_36, add_39); mul_36 = add_39 = None\0A view_160 = torch.ops.aten.view(mul_39, [-1, 3072]); mul_39 = None\0A _param_constant120 = self._param_constant120\0A _param_constant121 = self._param_constant121\0A addmm_39 = torch.ops.aten.addmm(_param_constant120, view_160, _param_constant121); _param_constant120 = view_160 = _param_constant121 = None\0A view_161 = torch.ops.aten.view(addmm_39, [1, 5, 768]); addmm_39 = None\0A add_40 = torch.ops.aten.add(add_37, view_161); add_37 = view_161 = None\0A _param_constant122 = self._param_constant122\0A _param_constant123 = self._param_constant123\0A native_layer_norm_20 = torch.ops.aten.native_layer_norm(add_40, [768], _param_constant122, _param_constant123, 1e-05); _param_constant122 = _param_constant123 = None\0A getitem_60 = native_layer_norm_20[0]\0A getitem_61 = native_layer_norm_20[1]\0A getitem_62 = native_layer_norm_20[2]; native_layer_norm_20 = 
None\0A view_162 = torch.ops.aten.view(getitem_60, [-1, 768]); getitem_60 = None\0A _param_constant124 = self._param_constant124\0A _param_constant125 = self._param_constant125\0A addmm_40 = torch.ops.aten.addmm(_param_constant124, view_162, _param_constant125); _param_constant124 = view_162 = _param_constant125 = None\0A view_163 = torch.ops.aten.view(addmm_40, [1, 5, 2304]); addmm_40 = None\0A slice_71 = torch.ops.aten.slice(view_163, 2, 0, 768)\0A slice_72 = torch.ops.aten.slice(view_163, 2, 768, 1536)\0A slice_73 = torch.ops.aten.slice(view_163, 2, 1536, 2304); view_163 = None\0A view_164 = torch.ops.aten.view(slice_71, [1, 5, 12, 64]); slice_71 = None\0A permute_40 = torch.ops.aten.permute(view_164, [0, 2, 1, 3]); view_164 = None\0A view_165 = torch.ops.aten.view(slice_72, [1, 5, 12, 64]); slice_72 = None\0A permute_41 = torch.ops.aten.permute(view_165, [0, 2, 1, 3]); view_165 = None\0A view_166 = torch.ops.aten.view(slice_73, [1, 5, 12, 64]); slice_73 = None\0A permute_42 = torch.ops.aten.permute(view_166, [0, 2, 1, 3]); view_166 = None\0A transpose_10 = torch.ops.aten.transpose(permute_41, -1, -2); permute_41 = None\0A expand_40 = torch.ops.aten.expand(permute_40, [1, 12, 5, 64]); permute_40 = None\0A view_167 = torch.ops.aten.view(expand_40, [12, 5, 64]); expand_40 = None\0A expand_41 = torch.ops.aten.expand(transpose_10, [1, 12, 64, 5]); transpose_10 = None\0A view_168 = torch.ops.aten.view(expand_41, [12, 64, 5]); expand_41 = None\0A bmm_20 = torch.ops.aten.bmm(view_167, view_168); view_167 = view_168 = None\0A _unsafe_view_20 = torch.ops.aten._unsafe_view(bmm_20, [1, 12, 5, 5]); bmm_20 = None\0A _tensor_constant30 = self._tensor_constant30\0A lift_fresh_copy_20 = torch.ops.aten.lift_fresh_copy(_tensor_constant30); _tensor_constant30 = None\0A div_10 = torch.ops.aten.div(_unsafe_view_20, lift_fresh_copy_20); _unsafe_view_20 = lift_fresh_copy_20 = None\0A _tensor_constant31 = self._tensor_constant31\0A slice_74 = torch.ops.aten.slice(_tensor_constant31, 0, 
0, 9223372036854775807); _tensor_constant31 = None\0A slice_75 = torch.ops.aten.slice(slice_74, 1, 0, 9223372036854775807); slice_74 = None\0A slice_76 = torch.ops.aten.slice(slice_75, 2, 0, 5); slice_75 = None\0A slice_77 = torch.ops.aten.slice(slice_76, 3, 0, 5); slice_76 = None\0A _to_copy_10 = torch.ops.aten._to_copy(slice_77, dtype = torch.bool); slice_77 = None\0A _tensor_constant32 = self._tensor_constant32\0A lift_fresh_copy_21 = torch.ops.aten.lift_fresh_copy(_tensor_constant32); _tensor_constant32 = None\0A where_10 = torch.ops.aten.where(_to_copy_10, div_10, lift_fresh_copy_21); _to_copy_10 = div_10 = lift_fresh_copy_21 = None\0A _softmax_10 = torch.ops.aten._softmax(where_10, -1, False); where_10 = None\0A detach_20 = torch.ops.aten.detach(_softmax_10)\0A expand_42 = torch.ops.aten.expand(_softmax_10, [1, 12, 5, 5]); _softmax_10 = None\0A view_169 = torch.ops.aten.view(expand_42, [12, 5, 5]); expand_42 = None\0A expand_43 = torch.ops.aten.expand(permute_42, [1, 12, 5, 64]); permute_42 = None\0A view_170 = torch.ops.aten.view(expand_43, [12, 5, 64]); expand_43 = None\0A bmm_21 = torch.ops.aten.bmm(view_169, view_170); view_169 = view_170 = None\0A _unsafe_view_21 = torch.ops.aten._unsafe_view(bmm_21, [1, 12, 5, 64]); bmm_21 = None\0A permute_43 = torch.ops.aten.permute(_unsafe_view_21, [0, 2, 1, 3]); _unsafe_view_21 = None\0A clone_10 = torch.ops.aten.clone(permute_43, memory_format = torch.contiguous_format); permute_43 = None\0A view_171 = torch.ops.aten.view(clone_10, [1, 5, 768]); clone_10 = None\0A view_172 = torch.ops.aten.view(view_171, [-1, 768]); view_171 = None\0A _param_constant126 = self._param_constant126\0A _param_constant127 = self._param_constant127\0A addmm_41 = torch.ops.aten.addmm(_param_constant126, view_172, _param_constant127); _param_constant126 = view_172 = _param_constant127 = None\0A view_173 = torch.ops.aten.view(addmm_41, [1, 5, 768]); addmm_41 = None\0A add_41 = torch.ops.aten.add(view_173, add_40); view_173 = add_40 = 
None\0A _param_constant128 = self._param_constant128\0A _param_constant129 = self._param_constant129\0A native_layer_norm_21 = torch.ops.aten.native_layer_norm(add_41, [768], _param_constant128, _param_constant129, 1e-05); _param_constant128 = _param_constant129 = None\0A getitem_63 = native_layer_norm_21[0]\0A getitem_64 = native_layer_norm_21[1]\0A getitem_65 = native_layer_norm_21[2]; native_layer_norm_21 = None\0A view_174 = torch.ops.aten.view(getitem_63, [-1, 768]); getitem_63 = None\0A _param_constant130 = self._param_constant130\0A _param_constant131 = self._param_constant131\0A addmm_42 = torch.ops.aten.addmm(_param_constant130, view_174, _param_constant131); _param_constant130 = view_174 = _param_constant131 = None\0A view_175 = torch.ops.aten.view(addmm_42, [1, 5, 3072]); addmm_42 = None\0A mul_40 = torch.ops.aten.mul(view_175, 0.5)\0A pow_11 = torch.ops.aten.pow(view_175, 3.0)\0A mul_41 = torch.ops.aten.mul(pow_11, 0.044715); pow_11 = None\0A add_42 = torch.ops.aten.add(view_175, mul_41); view_175 = mul_41 = None\0A mul_42 = torch.ops.aten.mul(add_42, 0.7978845608028654); add_42 = None\0A tanh_10 = torch.ops.aten.tanh(mul_42); mul_42 = None\0A detach_21 = torch.ops.aten.detach(tanh_10)\0A add_43 = torch.ops.aten.add(tanh_10, 1.0); tanh_10 = None\0A mul_43 = torch.ops.aten.mul(mul_40, add_43); mul_40 = add_43 = None\0A view_176 = torch.ops.aten.view(mul_43, [-1, 3072]); mul_43 = None\0A _param_constant132 = self._param_constant132\0A _param_constant133 = self._param_constant133\0A addmm_43 = torch.ops.aten.addmm(_param_constant132, view_176, _param_constant133); _param_constant132 = view_176 = _param_constant133 = None\0A view_177 = torch.ops.aten.view(addmm_43, [1, 5, 768]); addmm_43 = None\0A add_44 = torch.ops.aten.add(add_41, view_177); add_41 = view_177 = None\0A _param_constant134 = self._param_constant134\0A _param_constant135 = self._param_constant135\0A native_layer_norm_22 = torch.ops.aten.native_layer_norm(add_44, [768], _param_constant134, 
_param_constant135, 1e-05); _param_constant134 = _param_constant135 = None\0A getitem_66 = native_layer_norm_22[0]\0A getitem_67 = native_layer_norm_22[1]\0A getitem_68 = native_layer_norm_22[2]; native_layer_norm_22 = None\0A view_178 = torch.ops.aten.view(getitem_66, [-1, 768]); getitem_66 = None\0A _param_constant136 = self._param_constant136\0A _param_constant137 = self._param_constant137\0A addmm_44 = torch.ops.aten.addmm(_param_constant136, view_178, _param_constant137); _param_constant136 = view_178 = _param_constant137 = None\0A view_179 = torch.ops.aten.view(addmm_44, [1, 5, 2304]); addmm_44 = None\0A slice_78 = torch.ops.aten.slice(view_179, 2, 0, 768)\0A slice_79 = torch.ops.aten.slice(view_179, 2, 768, 1536)\0A slice_80 = torch.ops.aten.slice(view_179, 2, 1536, 2304); view_179 = None\0A view_180 = torch.ops.aten.view(slice_78, [1, 5, 12, 64]); slice_78 = None\0A permute_44 = torch.ops.aten.permute(view_180, [0, 2, 1, 3]); view_180 = None\0A view_181 = torch.ops.aten.view(slice_79, [1, 5, 12, 64]); slice_79 = None\0A permute_45 = torch.ops.aten.permute(view_181, [0, 2, 1, 3]); view_181 = None\0A view_182 = torch.ops.aten.view(slice_80, [1, 5, 12, 64]); slice_80 = None\0A permute_46 = torch.ops.aten.permute(view_182, [0, 2, 1, 3]); view_182 = None\0A transpose_11 = torch.ops.aten.transpose(permute_45, -1, -2); permute_45 = None\0A expand_44 = torch.ops.aten.expand(permute_44, [1, 12, 5, 64]); permute_44 = None\0A view_183 = torch.ops.aten.view(expand_44, [12, 5, 64]); expand_44 = None\0A expand_45 = torch.ops.aten.expand(transpose_11, [1, 12, 64, 5]); transpose_11 = None\0A view_184 = torch.ops.aten.view(expand_45, [12, 64, 5]); expand_45 = None\0A bmm_22 = torch.ops.aten.bmm(view_183, view_184); view_183 = view_184 = None\0A _unsafe_view_22 = torch.ops.aten._unsafe_view(bmm_22, [1, 12, 5, 5]); bmm_22 = None\0A _tensor_constant33 = self._tensor_constant33\0A lift_fresh_copy_22 = torch.ops.aten.lift_fresh_copy(_tensor_constant33); _tensor_constant33 = 
None\0A div_11 = torch.ops.aten.div(_unsafe_view_22, lift_fresh_copy_22); _unsafe_view_22 = lift_fresh_copy_22 = None\0A _tensor_constant34 = self._tensor_constant34\0A slice_81 = torch.ops.aten.slice(_tensor_constant34, 0, 0, 9223372036854775807); _tensor_constant34 = None\0A slice_82 = torch.ops.aten.slice(slice_81, 1, 0, 9223372036854775807); slice_81 = None\0A slice_83 = torch.ops.aten.slice(slice_82, 2, 0, 5); slice_82 = None\0A slice_84 = torch.ops.aten.slice(slice_83, 3, 0, 5); slice_83 = None\0A _to_copy_11 = torch.ops.aten._to_copy(slice_84, dtype = torch.bool); slice_84 = None\0A _tensor_constant35 = self._tensor_constant35\0A lift_fresh_copy_23 = torch.ops.aten.lift_fresh_copy(_tensor_constant35); _tensor_constant35 = None\0A where_11 = torch.ops.aten.where(_to_copy_11, div_11, lift_fresh_copy_23); _to_copy_11 = div_11 = lift_fresh_copy_23 = None\0A _softmax_11 = torch.ops.aten._softmax(where_11, -1, False); where_11 = None\0A detach_22 = torch.ops.aten.detach(_softmax_11)\0A expand_46 = torch.ops.aten.expand(_softmax_11, [1, 12, 5, 5]); _softmax_11 = None\0A view_185 = torch.ops.aten.view(expand_46, [12, 5, 5]); expand_46 = None\0A expand_47 = torch.ops.aten.expand(permute_46, [1, 12, 5, 64]); permute_46 = None\0A view_186 = torch.ops.aten.view(expand_47, [12, 5, 64]); expand_47 = None\0A bmm_23 = torch.ops.aten.bmm(view_185, view_186); view_185 = view_186 = None\0A _unsafe_view_23 = torch.ops.aten._unsafe_view(bmm_23, [1, 12, 5, 64]); bmm_23 = None\0A permute_47 = torch.ops.aten.permute(_unsafe_view_23, [0, 2, 1, 3]); _unsafe_view_23 = None\0A clone_11 = torch.ops.aten.clone(permute_47, memory_format = torch.contiguous_format); permute_47 = None\0A view_187 = torch.ops.aten.view(clone_11, [1, 5, 768]); clone_11 = None\0A view_188 = torch.ops.aten.view(view_187, [-1, 768]); view_187 = None\0A _param_constant138 = self._param_constant138\0A _param_constant139 = self._param_constant139\0A addmm_45 = torch.ops.aten.addmm(_param_constant138, view_188, 
_param_constant139); _param_constant138 = view_188 = _param_constant139 = None\0A view_189 = torch.ops.aten.view(addmm_45, [1, 5, 768]); addmm_45 = None\0A add_45 = torch.ops.aten.add(view_189, add_44); view_189 = add_44 = None\0A _param_constant140 = self._param_constant140\0A _param_constant141 = self._param_constant141\0A native_layer_norm_23 = torch.ops.aten.native_layer_norm(add_45, [768], _param_constant140, _param_constant141, 1e-05); _param_constant140 = _param_constant141 = None\0A getitem_69 = native_layer_norm_23[0]\0A getitem_70 = native_layer_norm_23[1]\0A getitem_71 = native_layer_norm_23[2]; native_layer_norm_23 = None\0A view_190 = torch.ops.aten.view(getitem_69, [-1, 768]); getitem_69 = None\0A _param_constant142 = self._param_constant142\0A _param_constant143 = self._param_constant143\0A addmm_46 = torch.ops.aten.addmm(_param_constant142, view_190, _param_constant143); _param_constant142 = view_190 = _param_constant143 = None\0A view_191 = torch.ops.aten.view(addmm_46, [1, 5, 3072]); addmm_46 = None\0A mul_44 = torch.ops.aten.mul(view_191, 0.5)\0A pow_12 = torch.ops.aten.pow(view_191, 3.0)\0A mul_45 = torch.ops.aten.mul(pow_12, 0.044715); pow_12 = None\0A add_46 = torch.ops.aten.add(view_191, mul_45); view_191 = mul_45 = None\0A mul_46 = torch.ops.aten.mul(add_46, 0.7978845608028654); add_46 = None\0A tanh_11 = torch.ops.aten.tanh(mul_46); mul_46 = None\0A detach_23 = torch.ops.aten.detach(tanh_11)\0A add_47 = torch.ops.aten.add(tanh_11, 1.0); tanh_11 = None\0A mul_47 = torch.ops.aten.mul(mul_44, add_47); mul_44 = add_47 = None\0A view_192 = torch.ops.aten.view(mul_47, [-1, 3072]); mul_47 = None\0A _param_constant144 = self._param_constant144\0A _param_constant145 = self._param_constant145\0A addmm_47 = torch.ops.aten.addmm(_param_constant144, view_192, _param_constant145); _param_constant144 = view_192 = _param_constant145 = None\0A view_193 = torch.ops.aten.view(addmm_47, [1, 5, 768]); addmm_47 = None\0A add_48 = torch.ops.aten.add(add_45, 
view_193); add_45 = view_193 = None\0A _param_constant146 = self._param_constant146\0A _param_constant147 = self._param_constant147\0A native_layer_norm_24 = torch.ops.aten.native_layer_norm(add_48, [768], _param_constant146, _param_constant147, 1e-05); add_48 = _param_constant146 = _param_constant147 = None\0A getitem_72 = native_layer_norm_24[0]\0A getitem_73 = native_layer_norm_24[1]\0A getitem_74 = native_layer_norm_24[2]; native_layer_norm_24 = None\0A view_194 = torch.ops.aten.view(getitem_72, [1, 5, 768]); getitem_72 = None\0A _param_constant148 = self._param_constant148\0A t = torch.ops.aten.t(_param_constant148); _param_constant148 = None\0A view_195 = torch.ops.aten.view(view_194, [5, 768]); view_194 = None\0A mm = torch.ops.aten.mm(view_195, t); view_195 = t = None\0A _unsafe_view_24 = torch.ops.aten._unsafe_view(mm, [1, 5, 50257]); mm = None\0A return _unsafe_view_24\0A "
%185 = torch.nn_module {
torch.slot "_param_constant0", %0 : !torch.tensor<[50257,768],f32>
torch.slot "_param_constant1", %1 : !torch.tensor<[1024,768],f32>
torch.slot "_param_constant2", %2 : !torch.tensor<[768],f32>
torch.slot "_param_constant3", %3 : !torch.tensor<[768],f32>
torch.slot "_param_constant4", %4 : !torch.tensor<[2304],f32>
torch.slot "_param_constant5", %5 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant6", %6 : !torch.tensor<[768],f32>
torch.slot "_param_constant7", %7 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant8", %8 : !torch.tensor<[768],f32>
torch.slot "_param_constant9", %9 : !torch.tensor<[768],f32>
torch.slot "_param_constant10", %10 : !torch.tensor<[3072],f32>
torch.slot "_param_constant11", %11 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant12", %12 : !torch.tensor<[768],f32>
torch.slot "_param_constant13", %13 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant14", %14 : !torch.tensor<[768],f32>
torch.slot "_param_constant15", %15 : !torch.tensor<[768],f32>
torch.slot "_param_constant16", %16 : !torch.tensor<[2304],f32>
torch.slot "_param_constant17", %17 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant18", %18 : !torch.tensor<[768],f32>
torch.slot "_param_constant19", %19 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant20", %20 : !torch.tensor<[768],f32>
torch.slot "_param_constant21", %21 : !torch.tensor<[768],f32>
torch.slot "_param_constant22", %22 : !torch.tensor<[3072],f32>
torch.slot "_param_constant23", %23 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant24", %24 : !torch.tensor<[768],f32>
torch.slot "_param_constant25", %25 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant26", %26 : !torch.tensor<[768],f32>
torch.slot "_param_constant27", %27 : !torch.tensor<[768],f32>
torch.slot "_param_constant28", %28 : !torch.tensor<[2304],f32>
torch.slot "_param_constant29", %29 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant30", %30 : !torch.tensor<[768],f32>
torch.slot "_param_constant31", %31 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant32", %32 : !torch.tensor<[768],f32>
torch.slot "_param_constant33", %33 : !torch.tensor<[768],f32>
torch.slot "_param_constant34", %34 : !torch.tensor<[3072],f32>
torch.slot "_param_constant35", %35 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant36", %36 : !torch.tensor<[768],f32>
torch.slot "_param_constant37", %37 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant38", %38 : !torch.tensor<[768],f32>
torch.slot "_param_constant39", %39 : !torch.tensor<[768],f32>
torch.slot "_param_constant40", %40 : !torch.tensor<[2304],f32>
torch.slot "_param_constant41", %41 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant42", %42 : !torch.tensor<[768],f32>
torch.slot "_param_constant43", %43 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant44", %44 : !torch.tensor<[768],f32>
torch.slot "_param_constant45", %45 : !torch.tensor<[768],f32>
torch.slot "_param_constant46", %46 : !torch.tensor<[3072],f32>
torch.slot "_param_constant47", %47 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant48", %48 : !torch.tensor<[768],f32>
torch.slot "_param_constant49", %49 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant50", %50 : !torch.tensor<[768],f32>
torch.slot "_param_constant51", %51 : !torch.tensor<[768],f32>
torch.slot "_param_constant52", %52 : !torch.tensor<[2304],f32>
torch.slot "_param_constant53", %53 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant54", %54 : !torch.tensor<[768],f32>
torch.slot "_param_constant55", %55 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant56", %56 : !torch.tensor<[768],f32>
torch.slot "_param_constant57", %57 : !torch.tensor<[768],f32>
torch.slot "_param_constant58", %58 : !torch.tensor<[3072],f32>
torch.slot "_param_constant59", %59 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant60", %60 : !torch.tensor<[768],f32>
torch.slot "_param_constant61", %61 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant62", %62 : !torch.tensor<[768],f32>
torch.slot "_param_constant63", %63 : !torch.tensor<[768],f32>
torch.slot "_param_constant64", %64 : !torch.tensor<[2304],f32>
torch.slot "_param_constant65", %65 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant66", %66 : !torch.tensor<[768],f32>
torch.slot "_param_constant67", %67 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant68", %68 : !torch.tensor<[768],f32>
torch.slot "_param_constant69", %69 : !torch.tensor<[768],f32>
torch.slot "_param_constant70", %70 : !torch.tensor<[3072],f32>
torch.slot "_param_constant71", %71 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant72", %72 : !torch.tensor<[768],f32>
torch.slot "_param_constant73", %73 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant74", %74 : !torch.tensor<[768],f32>
torch.slot "_param_constant75", %75 : !torch.tensor<[768],f32>
torch.slot "_param_constant76", %76 : !torch.tensor<[2304],f32>
torch.slot "_param_constant77", %77 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant78", %78 : !torch.tensor<[768],f32>
torch.slot "_param_constant79", %79 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant80", %80 : !torch.tensor<[768],f32>
torch.slot "_param_constant81", %81 : !torch.tensor<[768],f32>
torch.slot "_param_constant82", %82 : !torch.tensor<[3072],f32>
torch.slot "_param_constant83", %83 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant84", %84 : !torch.tensor<[768],f32>
torch.slot "_param_constant85", %85 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant86", %86 : !torch.tensor<[768],f32>
torch.slot "_param_constant87", %87 : !torch.tensor<[768],f32>
torch.slot "_param_constant88", %88 : !torch.tensor<[2304],f32>
torch.slot "_param_constant89", %89 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant90", %90 : !torch.tensor<[768],f32>
torch.slot "_param_constant91", %91 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant92", %92 : !torch.tensor<[768],f32>
torch.slot "_param_constant93", %93 : !torch.tensor<[768],f32>
torch.slot "_param_constant94", %94 : !torch.tensor<[3072],f32>
torch.slot "_param_constant95", %95 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant96", %96 : !torch.tensor<[768],f32>
torch.slot "_param_constant97", %97 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant98", %98 : !torch.tensor<[768],f32>
torch.slot "_param_constant99", %99 : !torch.tensor<[768],f32>
torch.slot "_param_constant100", %100 : !torch.tensor<[2304],f32>
torch.slot "_param_constant101", %101 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant102", %102 : !torch.tensor<[768],f32>
torch.slot "_param_constant103", %103 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant104", %104 : !torch.tensor<[768],f32>
torch.slot "_param_constant105", %105 : !torch.tensor<[768],f32>
torch.slot "_param_constant106", %106 : !torch.tensor<[3072],f32>
torch.slot "_param_constant107", %107 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant108", %108 : !torch.tensor<[768],f32>
torch.slot "_param_constant109", %109 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant110", %110 : !torch.tensor<[768],f32>
torch.slot "_param_constant111", %111 : !torch.tensor<[768],f32>
torch.slot "_param_constant112", %112 : !torch.tensor<[2304],f32>
torch.slot "_param_constant113", %113 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant114", %114 : !torch.tensor<[768],f32>
torch.slot "_param_constant115", %115 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant116", %116 : !torch.tensor<[768],f32>
torch.slot "_param_constant117", %117 : !torch.tensor<[768],f32>
torch.slot "_param_constant118", %118 : !torch.tensor<[3072],f32>
torch.slot "_param_constant119", %119 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant120", %120 : !torch.tensor<[768],f32>
torch.slot "_param_constant121", %121 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant122", %122 : !torch.tensor<[768],f32>
torch.slot "_param_constant123", %123 : !torch.tensor<[768],f32>
torch.slot "_param_constant124", %124 : !torch.tensor<[2304],f32>
torch.slot "_param_constant125", %125 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant126", %126 : !torch.tensor<[768],f32>
torch.slot "_param_constant127", %127 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant128", %128 : !torch.tensor<[768],f32>
torch.slot "_param_constant129", %129 : !torch.tensor<[768],f32>
torch.slot "_param_constant130", %130 : !torch.tensor<[3072],f32>
torch.slot "_param_constant131", %131 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant132", %132 : !torch.tensor<[768],f32>
torch.slot "_param_constant133", %133 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant134", %134 : !torch.tensor<[768],f32>
torch.slot "_param_constant135", %135 : !torch.tensor<[768],f32>
torch.slot "_param_constant136", %136 : !torch.tensor<[2304],f32>
torch.slot "_param_constant137", %137 : !torch.tensor<[768,2304],f32>
torch.slot "_param_constant138", %138 : !torch.tensor<[768],f32>
torch.slot "_param_constant139", %139 : !torch.tensor<[768,768],f32>
torch.slot "_param_constant140", %140 : !torch.tensor<[768],f32>
torch.slot "_param_constant141", %141 : !torch.tensor<[768],f32>
torch.slot "_param_constant142", %142 : !torch.tensor<[3072],f32>
torch.slot "_param_constant143", %143 : !torch.tensor<[768,3072],f32>
torch.slot "_param_constant144", %144 : !torch.tensor<[768],f32>
torch.slot "_param_constant145", %145 : !torch.tensor<[3072,768],f32>
torch.slot "_param_constant146", %146 : !torch.tensor<[768],f32>
torch.slot "_param_constant147", %147 : !torch.tensor<[768],f32>
torch.slot "_param_constant148", %148 : !torch.tensor<[50257,768],f32>
torch.slot "_tensor_constant0", %149 : !torch.tensor<[],f32>
torch.slot "_tensor_constant1", %150 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant2", %151 : !torch.tensor<[],f32>
torch.slot "_tensor_constant3", %152 : !torch.tensor<[],f32>
torch.slot "_tensor_constant4", %153 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant5", %154 : !torch.tensor<[],f32>
torch.slot "_tensor_constant6", %155 : !torch.tensor<[],f32>
torch.slot "_tensor_constant7", %156 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant8", %157 : !torch.tensor<[],f32>
torch.slot "_tensor_constant9", %158 : !torch.tensor<[],f32>
torch.slot "_tensor_constant10", %159 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant11", %160 : !torch.tensor<[],f32>
torch.slot "_tensor_constant12", %161 : !torch.tensor<[],f32>
torch.slot "_tensor_constant13", %162 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant14", %163 : !torch.tensor<[],f32>
torch.slot "_tensor_constant15", %164 : !torch.tensor<[],f32>
torch.slot "_tensor_constant16", %165 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant17", %166 : !torch.tensor<[],f32>
torch.slot "_tensor_constant18", %167 : !torch.tensor<[],f32>
torch.slot "_tensor_constant19", %168 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant20", %169 : !torch.tensor<[],f32>
torch.slot "_tensor_constant21", %170 : !torch.tensor<[],f32>
torch.slot "_tensor_constant22", %171 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant23", %172 : !torch.tensor<[],f32>
torch.slot "_tensor_constant24", %173 : !torch.tensor<[],f32>
torch.slot "_tensor_constant25", %174 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant26", %175 : !torch.tensor<[],f32>
torch.slot "_tensor_constant27", %176 : !torch.tensor<[],f32>
torch.slot "_tensor_constant28", %177 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant29", %178 : !torch.tensor<[],f32>
torch.slot "_tensor_constant30", %179 : !torch.tensor<[],f32>
torch.slot "_tensor_constant31", %180 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant32", %181 : !torch.tensor<[],f32>
torch.slot "_tensor_constant33", %182 : !torch.tensor<[],f32>
torch.slot "_tensor_constant34", %183 : !torch.tensor<[1,1,1024,1024],ui8>
torch.slot "_tensor_constant35", %184 : !torch.tensor<[],f32>
torch.slot "training", %true : !torch.bool
torch.slot "_is_full_backward_hook", %none : !torch.none
torch.slot "_code", %str : !torch.str
} : !torch.nn.Module<"__torch__.torch.fx.graph_module._lambda">
}
@AmosLewis
Copy link
Author

torch-mlir-opt -pass-pipeline='builtin.module(torchscript-module-to-torch-backend-pipeline{backend-legal-ops=torch.aten.flatten.using_ints})' /tmp/gpt2_torch_raw_elide.mlir --mlir-print-ir-after-all > gpt2_tosa_ramiro.mlir

@AmosLewis
Copy link
Author

%884 = torch.aten.tanh %883 : !torch.vtensor<[1,5,3072],f32> -> !torch.vtensor<[1,5,3072],unk>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment