Created
January 12, 2023 17:17
-
-
Save pashu123/e1e97043c3deda71716cd8aeb367c837 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| graph(): | |
| %arg0_1 : [#users=1] = placeholder[target=arg0_1] | |
| %arg1_1 : [#users=1] = placeholder[target=arg1_1] | |
| %arg2_1 : [#users=52] = placeholder[target=arg2_1] | |
| %arg3_1 : [#users=1] = placeholder[target=arg3_1] | |
| %expand : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%arg1_1, [2]), kwargs = {}) | |
| %arange : [#users=1] = call_function[target=torch.ops.aten.arange](args = (0, 128), kwargs = {dtype: torch.float32, device: cuda:0, pin_memory: False}) | |
| %mul : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%arange, -9.210340371976184), kwargs = {}) | |
| %div : [#users=1] = call_function[target=torch.ops.aten.div](args = (%mul, 128), kwargs = {}) | |
| %exp : [#users=1] = call_function[target=torch.ops.aten.exp](args = (%div,), kwargs = {}) | |
| %slice_1 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%expand, 0, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_1, 1), kwargs = {}) | |
| %unsqueeze_1 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%exp, 0), kwargs = {}) | |
| %slice_2 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%unsqueeze_1, 1, 0, 9223372036854775807), kwargs = {}) | |
| %mul_1 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%unsqueeze, %slice_2), kwargs = {}) | |
| %mul_2 : [#users=2] = call_function[target=torch.ops.aten.mul](args = (%mul_1, 1), kwargs = {}) | |
| %sin : [#users=1] = call_function[target=torch.ops.aten.sin](args = (%mul_2,), kwargs = {}) | |
| %cos : [#users=1] = call_function[target=torch.ops.aten.cos](args = (%mul_2,), kwargs = {}) | |
| %cat : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%sin, %cos], -1), kwargs = {}) | |
| %slice_3 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%cat, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_4 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_3, 1, 128, 9223372036854775807), kwargs = {}) | |
| %slice_5 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%cat, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_6 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_5, 1, 0, 128), kwargs = {}) | |
| %cat_1 : [#users=1] = call_function[target=torch.ops.aten.cat](args = ([%slice_4, %slice_6], -1), kwargs = {}) | |
| %_to_copy : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%cat_1,), kwargs = {dtype: torch.float16}) | |
| %_param_constant0 : [#users=1] = get_attr[target=_param_constant0] | |
| %t : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant0,), kwargs = {}) | |
| %_param_constant1 : [#users=1] = get_attr[target=_param_constant1] | |
| %addmm : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant1, %_to_copy, %t), kwargs = {}) | |
| %silu : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm,), kwargs = {}) | |
| %_param_constant2 : [#users=1] = get_attr[target=_param_constant2] | |
| %t_1 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant2,), kwargs = {}) | |
| %_param_constant3 : [#users=1] = get_attr[target=_param_constant3] | |
| %addmm_1 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant3, %silu, %t_1), kwargs = {}) | |
| %_param_constant4 : [#users=1] = get_attr[target=_param_constant4] | |
| %embedding : [#users=1] = call_function[target=torch.ops.aten.embedding](args = (%_param_constant4, %arg3_1), kwargs = {}) | |
| %add : [#users=22] = call_function[target=torch.ops.aten.add](args = (%addmm_1, %embedding), kwargs = {}) | |
| %_param_constant5 : [#users=1] = get_attr[target=_param_constant5] | |
| %_param_constant6 : [#users=1] = get_attr[target=_param_constant6] | |
| %convolution : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%arg0_1, %_param_constant5, %_param_constant6, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %view : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view, torch.float32), kwargs = {}) | |
| %var_mean : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem : [#users=1] = call_function[target=operator.getitem](args = (%var_mean, 0), kwargs = {}) | |
| %getitem_1 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean, 1), kwargs = {}) | |
| %add_1 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem, 1e-05), kwargs = {}) | |
| %rsqrt : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_1,), kwargs = {}) | |
| %sub : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view, %getitem_1), kwargs = {}) | |
| %mul_3 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub, %rsqrt), kwargs = {}) | |
| %view_1 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_3, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant7 : [#users=1] = get_attr[target=_param_constant7] | |
| %unsqueeze_2 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant7, 0), kwargs = {}) | |
| %unsqueeze_3 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_2, 2), kwargs = {}) | |
| %unsqueeze_4 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_3, 3), kwargs = {}) | |
| %_param_constant8 : [#users=1] = get_attr[target=_param_constant8] | |
| %unsqueeze_5 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant8, 0), kwargs = {}) | |
| %unsqueeze_6 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_5, 2), kwargs = {}) | |
| %unsqueeze_7 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_6, 3), kwargs = {}) | |
| %mul_4 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_1, %unsqueeze_7), kwargs = {}) | |
| %add_2 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_4, %unsqueeze_4), kwargs = {}) | |
| %convert_element_type_1 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_2, torch.float16), kwargs = {}) | |
| %convert_element_type_2 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_1, torch.float16), kwargs = {}) | |
| %convert_element_type_3 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt, torch.float16), kwargs = {}) | |
| %squeeze : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_2, 3), kwargs = {}) | |
| %squeeze_1 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze, 2), kwargs = {}) | |
| %squeeze_2 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_3, 3), kwargs = {}) | |
| %squeeze_3 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_2, 2), kwargs = {}) | |
| %detach : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_1,), kwargs = {}) | |
| %detach_1 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_3,), kwargs = {}) | |
| %silu_1 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_1,), kwargs = {}) | |
| %_param_constant9 : [#users=1] = get_attr[target=_param_constant9] | |
| %_param_constant10 : [#users=1] = get_attr[target=_param_constant10] | |
| %convolution_1 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_1, %_param_constant9, %_param_constant10, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_2 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant11 : [#users=1] = get_attr[target=_param_constant11] | |
| %t_2 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant11,), kwargs = {}) | |
| %_param_constant12 : [#users=1] = get_attr[target=_param_constant12] | |
| %addmm_2 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant12, %silu_2, %t_2), kwargs = {}) | |
| %slice_7 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_2, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_8 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_7, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_8 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_8, 2), kwargs = {}) | |
| %unsqueeze_9 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_8, 3), kwargs = {}) | |
| %add_3 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_1, %unsqueeze_9), kwargs = {}) | |
| %view_2 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_3, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_4 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_2, torch.float32), kwargs = {}) | |
| %var_mean_1 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_4, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_2 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_1, 0), kwargs = {}) | |
| %getitem_3 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_1, 1), kwargs = {}) | |
| %add_4 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_2, 1e-05), kwargs = {}) | |
| %rsqrt_1 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_4,), kwargs = {}) | |
| %sub_1 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_2, %getitem_3), kwargs = {}) | |
| %mul_5 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_1, %rsqrt_1), kwargs = {}) | |
| %view_3 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_5, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant13 : [#users=1] = get_attr[target=_param_constant13] | |
| %unsqueeze_10 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant13, 0), kwargs = {}) | |
| %unsqueeze_11 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_10, 2), kwargs = {}) | |
| %unsqueeze_12 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_11, 3), kwargs = {}) | |
| %_param_constant14 : [#users=1] = get_attr[target=_param_constant14] | |
| %unsqueeze_13 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant14, 0), kwargs = {}) | |
| %unsqueeze_14 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_13, 2), kwargs = {}) | |
| %unsqueeze_15 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_14, 3), kwargs = {}) | |
| %mul_6 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_3, %unsqueeze_15), kwargs = {}) | |
| %add_5 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_6, %unsqueeze_12), kwargs = {}) | |
| %convert_element_type_5 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_5, torch.float16), kwargs = {}) | |
| %convert_element_type_6 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_3, torch.float16), kwargs = {}) | |
| %convert_element_type_7 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_1, torch.float16), kwargs = {}) | |
| %squeeze_4 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_6, 3), kwargs = {}) | |
| %squeeze_5 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_4, 2), kwargs = {}) | |
| %squeeze_6 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_7, 3), kwargs = {}) | |
| %squeeze_7 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_6, 2), kwargs = {}) | |
| %detach_2 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_5,), kwargs = {}) | |
| %detach_3 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_7,), kwargs = {}) | |
| %silu_3 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_5,), kwargs = {}) | |
| %_param_constant15 : [#users=1] = get_attr[target=_param_constant15] | |
| %_param_constant16 : [#users=1] = get_attr[target=_param_constant16] | |
| %convolution_2 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_3, %_param_constant15, %_param_constant16, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_6 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution, %convolution_2), kwargs = {}) | |
| %div_1 : [#users=3] = call_function[target=torch.ops.aten.div](args = (%add_6, 1.0), kwargs = {}) | |
| %view_4 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_1, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_8 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_4, torch.float32), kwargs = {}) | |
| %var_mean_2 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_8, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_2, 0), kwargs = {}) | |
| %getitem_5 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_2, 1), kwargs = {}) | |
| %add_7 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_4, 1e-05), kwargs = {}) | |
| %rsqrt_2 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_7,), kwargs = {}) | |
| %sub_2 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_4, %getitem_5), kwargs = {}) | |
| %mul_7 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_2, %rsqrt_2), kwargs = {}) | |
| %view_5 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_7, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant17 : [#users=1] = get_attr[target=_param_constant17] | |
| %unsqueeze_16 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant17, 0), kwargs = {}) | |
| %unsqueeze_17 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_16, 2), kwargs = {}) | |
| %unsqueeze_18 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_17, 3), kwargs = {}) | |
| %_param_constant18 : [#users=1] = get_attr[target=_param_constant18] | |
| %unsqueeze_19 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant18, 0), kwargs = {}) | |
| %unsqueeze_20 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_19, 2), kwargs = {}) | |
| %unsqueeze_21 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_20, 3), kwargs = {}) | |
| %mul_8 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_5, %unsqueeze_21), kwargs = {}) | |
| %add_8 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_8, %unsqueeze_18), kwargs = {}) | |
| %convert_element_type_9 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_8, torch.float16), kwargs = {}) | |
| %convert_element_type_10 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_5, torch.float16), kwargs = {}) | |
| %convert_element_type_11 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_2, torch.float16), kwargs = {}) | |
| %squeeze_8 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_10, 3), kwargs = {}) | |
| %squeeze_9 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_8, 2), kwargs = {}) | |
| %squeeze_10 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_11, 3), kwargs = {}) | |
| %squeeze_11 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_10, 2), kwargs = {}) | |
| %detach_4 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_9,), kwargs = {}) | |
| %detach_5 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_11,), kwargs = {}) | |
| %silu_4 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_9,), kwargs = {}) | |
| %_param_constant19 : [#users=1] = get_attr[target=_param_constant19] | |
| %_param_constant20 : [#users=1] = get_attr[target=_param_constant20] | |
| %convolution_3 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_4, %_param_constant19, %_param_constant20, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_5 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant21 : [#users=1] = get_attr[target=_param_constant21] | |
| %t_3 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant21,), kwargs = {}) | |
| %_param_constant22 : [#users=1] = get_attr[target=_param_constant22] | |
| %addmm_3 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant22, %silu_5, %t_3), kwargs = {}) | |
| %slice_9 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_3, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_10 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_9, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_22 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_10, 2), kwargs = {}) | |
| %unsqueeze_23 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_22, 3), kwargs = {}) | |
| %add_9 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_3, %unsqueeze_23), kwargs = {}) | |
| %view_6 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_9, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_12 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_6, torch.float32), kwargs = {}) | |
| %var_mean_3 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_12, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_6 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_3, 0), kwargs = {}) | |
| %getitem_7 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_3, 1), kwargs = {}) | |
| %add_10 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_6, 1e-05), kwargs = {}) | |
| %rsqrt_3 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_10,), kwargs = {}) | |
| %sub_3 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_6, %getitem_7), kwargs = {}) | |
| %mul_9 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_3, %rsqrt_3), kwargs = {}) | |
| %view_7 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_9, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant23 : [#users=1] = get_attr[target=_param_constant23] | |
| %unsqueeze_24 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant23, 0), kwargs = {}) | |
| %unsqueeze_25 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_24, 2), kwargs = {}) | |
| %unsqueeze_26 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_25, 3), kwargs = {}) | |
| %_param_constant24 : [#users=1] = get_attr[target=_param_constant24] | |
| %unsqueeze_27 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant24, 0), kwargs = {}) | |
| %unsqueeze_28 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_27, 2), kwargs = {}) | |
| %unsqueeze_29 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_28, 3), kwargs = {}) | |
| %mul_10 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_7, %unsqueeze_29), kwargs = {}) | |
| %add_11 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_10, %unsqueeze_26), kwargs = {}) | |
| %convert_element_type_13 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_11, torch.float16), kwargs = {}) | |
| %convert_element_type_14 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_7, torch.float16), kwargs = {}) | |
| %convert_element_type_15 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_3, torch.float16), kwargs = {}) | |
| %squeeze_12 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_14, 3), kwargs = {}) | |
| %squeeze_13 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_12, 2), kwargs = {}) | |
| %squeeze_14 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_15, 3), kwargs = {}) | |
| %squeeze_15 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_14, 2), kwargs = {}) | |
| %detach_6 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_13,), kwargs = {}) | |
| %detach_7 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_15,), kwargs = {}) | |
| %silu_6 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_13,), kwargs = {}) | |
| %_param_constant25 : [#users=1] = get_attr[target=_param_constant25] | |
| %_param_constant26 : [#users=1] = get_attr[target=_param_constant26] | |
| %convolution_4 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_6, %_param_constant25, %_param_constant26, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_12 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%div_1, %convolution_4), kwargs = {}) | |
| %div_2 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_12, 1.0), kwargs = {}) | |
| %_param_constant27 : [#users=1] = get_attr[target=_param_constant27] | |
| %_param_constant28 : [#users=1] = get_attr[target=_param_constant28] | |
| %convolution_5 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%div_2, %_param_constant27, %_param_constant28, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %view_8 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_5, [2, 32, 8, 4096]), kwargs = {}) | |
| %convert_element_type_16 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_8, torch.float32), kwargs = {}) | |
| %var_mean_4 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_16, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_8 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_4, 0), kwargs = {}) | |
| %getitem_9 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_4, 1), kwargs = {}) | |
| %add_13 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_8, 1e-05), kwargs = {}) | |
| %rsqrt_4 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_13,), kwargs = {}) | |
| %sub_4 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_8, %getitem_9), kwargs = {}) | |
| %mul_11 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_4, %rsqrt_4), kwargs = {}) | |
| %view_9 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_11, [2, 256, 64, 64]), kwargs = {}) | |
| %_param_constant29 : [#users=1] = get_attr[target=_param_constant29] | |
| %unsqueeze_30 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant29, 0), kwargs = {}) | |
| %unsqueeze_31 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_30, 2), kwargs = {}) | |
| %unsqueeze_32 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_31, 3), kwargs = {}) | |
| %_param_constant30 : [#users=1] = get_attr[target=_param_constant30] | |
| %unsqueeze_33 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant30, 0), kwargs = {}) | |
| %unsqueeze_34 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_33, 2), kwargs = {}) | |
| %unsqueeze_35 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_34, 3), kwargs = {}) | |
| %mul_12 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_9, %unsqueeze_35), kwargs = {}) | |
| %add_14 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_12, %unsqueeze_32), kwargs = {}) | |
| %convert_element_type_17 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_14, torch.float16), kwargs = {}) | |
| %convert_element_type_18 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_9, torch.float16), kwargs = {}) | |
| %convert_element_type_19 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_4, torch.float16), kwargs = {}) | |
| %squeeze_16 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_18, 3), kwargs = {}) | |
| %squeeze_17 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_16, 2), kwargs = {}) | |
| %squeeze_18 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_19, 3), kwargs = {}) | |
| %squeeze_19 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_18, 2), kwargs = {}) | |
| %detach_8 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_17,), kwargs = {}) | |
| %detach_9 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_19,), kwargs = {}) | |
| %silu_7 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_17,), kwargs = {}) | |
| %_param_constant31 : [#users=1] = get_attr[target=_param_constant31] | |
| %_param_constant32 : [#users=1] = get_attr[target=_param_constant32] | |
| %convolution_6 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_7, %_param_constant31, %_param_constant32, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_8 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant33 : [#users=1] = get_attr[target=_param_constant33] | |
| %t_4 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant33,), kwargs = {}) | |
| %_param_constant34 : [#users=1] = get_attr[target=_param_constant34] | |
| %addmm_4 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant34, %silu_8, %t_4), kwargs = {}) | |
| %slice_11 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_4, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_12 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_11, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_36 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_12, 2), kwargs = {}) | |
| %unsqueeze_37 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_36, 3), kwargs = {}) | |
| %add_15 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_6, %unsqueeze_37), kwargs = {}) | |
| %view_10 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_15, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_20 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_10, torch.float32), kwargs = {}) | |
| %var_mean_5 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_20, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_10 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_5, 0), kwargs = {}) | |
| %getitem_11 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_5, 1), kwargs = {}) | |
| %add_16 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_10, 1e-05), kwargs = {}) | |
| %rsqrt_5 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_16,), kwargs = {}) | |
| %sub_5 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_10, %getitem_11), kwargs = {}) | |
| %mul_13 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_5, %rsqrt_5), kwargs = {}) | |
| %view_11 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_13, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant35 : [#users=1] = get_attr[target=_param_constant35] | |
| %unsqueeze_38 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant35, 0), kwargs = {}) | |
| %unsqueeze_39 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_38, 2), kwargs = {}) | |
| %unsqueeze_40 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_39, 3), kwargs = {}) | |
| %_param_constant36 : [#users=1] = get_attr[target=_param_constant36] | |
| %unsqueeze_41 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant36, 0), kwargs = {}) | |
| %unsqueeze_42 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_41, 2), kwargs = {}) | |
| %unsqueeze_43 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_42, 3), kwargs = {}) | |
| %mul_14 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_11, %unsqueeze_43), kwargs = {}) | |
| %add_17 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_14, %unsqueeze_40), kwargs = {}) | |
| %convert_element_type_21 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_17, torch.float16), kwargs = {}) | |
| %convert_element_type_22 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_11, torch.float16), kwargs = {}) | |
| %convert_element_type_23 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_5, torch.float16), kwargs = {}) | |
| %squeeze_20 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_22, 3), kwargs = {}) | |
| %squeeze_21 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_20, 2), kwargs = {}) | |
| %squeeze_22 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_23, 3), kwargs = {}) | |
| %squeeze_23 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_22, 2), kwargs = {}) | |
| %detach_10 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_21,), kwargs = {}) | |
| %detach_11 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_23,), kwargs = {}) | |
| %silu_9 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_21,), kwargs = {}) | |
| %_param_constant37 : [#users=1] = get_attr[target=_param_constant37] | |
| %_param_constant38 : [#users=1] = get_attr[target=_param_constant38] | |
| %convolution_7 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_9, %_param_constant37, %_param_constant38, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant39 : [#users=1] = get_attr[target=_param_constant39] | |
| %_param_constant40 : [#users=1] = get_attr[target=_param_constant40] | |
| %convolution_8 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%convolution_5, %_param_constant39, %_param_constant40, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_18 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_8, %convolution_7), kwargs = {}) | |
| %div_3 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_18, 1.0), kwargs = {}) | |
| %view_12 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_3, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_24 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_12, torch.float32), kwargs = {}) | |
| %var_mean_6 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_24, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_12 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_6, 0), kwargs = {}) | |
| %getitem_13 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_6, 1), kwargs = {}) | |
| %add_19 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_12, 1e-06), kwargs = {}) | |
| %rsqrt_6 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_19,), kwargs = {}) | |
| %sub_6 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_12, %getitem_13), kwargs = {}) | |
| %mul_15 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_6, %rsqrt_6), kwargs = {}) | |
| %view_13 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_15, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant41 : [#users=1] = get_attr[target=_param_constant41] | |
| %unsqueeze_44 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant41, 0), kwargs = {}) | |
| %unsqueeze_45 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_44, 2), kwargs = {}) | |
| %unsqueeze_46 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_45, 3), kwargs = {}) | |
| %_param_constant42 : [#users=1] = get_attr[target=_param_constant42] | |
| %unsqueeze_47 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant42, 0), kwargs = {}) | |
| %unsqueeze_48 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_47, 2), kwargs = {}) | |
| %unsqueeze_49 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_48, 3), kwargs = {}) | |
| %mul_16 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_13, %unsqueeze_49), kwargs = {}) | |
| %add_20 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_16, %unsqueeze_46), kwargs = {}) | |
| %convert_element_type_25 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_20, torch.float16), kwargs = {}) | |
| %convert_element_type_26 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_13, torch.float16), kwargs = {}) | |
| %convert_element_type_27 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_6, torch.float16), kwargs = {}) | |
| %squeeze_24 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_26, 3), kwargs = {}) | |
| %squeeze_25 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_24, 2), kwargs = {}) | |
| %squeeze_26 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_27, 3), kwargs = {}) | |
| %squeeze_27 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_26, 2), kwargs = {}) | |
| %detach_12 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_25,), kwargs = {}) | |
| %detach_13 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_27,), kwargs = {}) | |
| %permute : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_25, [0, 2, 3, 1]), kwargs = {}) | |
| %view_14 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant43 : [#users=1] = get_attr[target=_param_constant43] | |
| %t_5 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant43,), kwargs = {}) | |
| %expand_1 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_14, [2, 4096, 512]), kwargs = {}) | |
| %view_15 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_1, [2, 4096, 512]), kwargs = {}) | |
| %expand_2 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_5, [2, 512, 512]), kwargs = {}) | |
| %view_16 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_2, [2, 512, 512]), kwargs = {}) | |
| %bmm : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_15, %view_16), kwargs = {}) | |
| %_unsafe_view : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant44 : [#users=1] = get_attr[target=_param_constant44] | |
| %add_21 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view, %_param_constant44), kwargs = {}) | |
| %convert_element_type_28 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_21, torch.float32), kwargs = {}) | |
| %var_mean_7 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_28, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_14 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_7, 0), kwargs = {}) | |
| %getitem_15 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_7, 1), kwargs = {}) | |
| %add_22 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_14, 1e-05), kwargs = {}) | |
| %rsqrt_7 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_22,), kwargs = {}) | |
| %sub_7 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_21, %getitem_15), kwargs = {}) | |
| %mul_17 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_7, %rsqrt_7), kwargs = {}) | |
| %_param_constant45 : [#users=1] = get_attr[target=_param_constant45] | |
| %mul_18 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_17, %_param_constant45), kwargs = {}) | |
| %_param_constant46 : [#users=1] = get_attr[target=_param_constant46] | |
| %add_23 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_18, %_param_constant46), kwargs = {}) | |
| %convert_element_type_29 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_23, torch.float16), kwargs = {}) | |
| %_param_constant47 : [#users=1] = get_attr[target=_param_constant47] | |
| %t_6 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant47,), kwargs = {}) | |
| %view_17 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_29, [8192, 512]), kwargs = {}) | |
| %mm : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_17, %t_6), kwargs = {}) | |
| %_unsafe_view_1 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm, [2, 4096, 512]), kwargs = {}) | |
| %view_18 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_1, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_1 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_18, [0, 2, 1, 3]), kwargs = {}) | |
| %clone : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_2 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant48 : [#users=1] = get_attr[target=_param_constant48] | |
| %t_7 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant48,), kwargs = {}) | |
| %view_19 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_1 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_19, %t_7), kwargs = {}) | |
| %_unsafe_view_3 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_1, [2, 77, 512]), kwargs = {}) | |
| %_param_constant49 : [#users=1] = get_attr[target=_param_constant49] | |
| %t_8 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant49,), kwargs = {}) | |
| %view_20 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_2 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_20, %t_8), kwargs = {}) | |
| %_unsafe_view_4 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_2, [2, 77, 512]), kwargs = {}) | |
| %view_21 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_3, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_2 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_21, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_1 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_5 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_1, [16, 77, 64]), kwargs = {}) | |
| %view_22 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_4, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_3 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_22, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_2 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_6 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_2, [16, 77, 64]), kwargs = {}) | |
| %empty : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_5, -1, -2), kwargs = {}) | |
| %baddbmm : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty, %_unsafe_view_2, %transpose), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm, -1, False), kwargs = {}) | |
| %detach_14 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax,), kwargs = {}) | |
| %bmm_1 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax, %_unsafe_view_6), kwargs = {}) | |
| %view_23 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_1, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_4 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_23, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_3 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_4,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_7 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_3, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant50 : [#users=1] = get_attr[target=_param_constant50] | |
| %t_9 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant50,), kwargs = {}) | |
| %view_24 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_7, [8192, 512]), kwargs = {}) | |
| %_param_constant51 : [#users=1] = get_attr[target=_param_constant51] | |
| %addmm_5 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant51, %view_24, %t_9), kwargs = {}) | |
| %view_25 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_5, [2, 4096, 512]), kwargs = {}) | |
| %add_24 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_25, %add_21), kwargs = {}) | |
| %convert_element_type_30 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_24, torch.float32), kwargs = {}) | |
| %var_mean_8 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_30, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_16 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_8, 0), kwargs = {}) | |
| %getitem_17 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_8, 1), kwargs = {}) | |
| %add_25 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_16, 1e-05), kwargs = {}) | |
| %rsqrt_8 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_25,), kwargs = {}) | |
| %sub_8 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_24, %getitem_17), kwargs = {}) | |
| %mul_19 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_8, %rsqrt_8), kwargs = {}) | |
| %_param_constant52 : [#users=1] = get_attr[target=_param_constant52] | |
| %mul_20 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_19, %_param_constant52), kwargs = {}) | |
| %_param_constant53 : [#users=1] = get_attr[target=_param_constant53] | |
| %add_26 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_20, %_param_constant53), kwargs = {}) | |
| %convert_element_type_31 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_26, torch.float16), kwargs = {}) | |
| %_param_constant54 : [#users=1] = get_attr[target=_param_constant54] | |
| %t_10 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant54,), kwargs = {}) | |
| %view_26 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_31, [8192, 512]), kwargs = {}) | |
| %mm_3 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_26, %t_10), kwargs = {}) | |
| %_unsafe_view_8 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_3, [2, 4096, 512]), kwargs = {}) | |
| %view_27 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_8, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_5 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_27, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_4 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_5,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_9 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_4, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant55 : [#users=1] = get_attr[target=_param_constant55] | |
| %t_11 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant55,), kwargs = {}) | |
| %view_28 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_4 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_28, %t_11), kwargs = {}) | |
| %_unsafe_view_10 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_4, [2, 77, 512]), kwargs = {}) | |
| %_param_constant56 : [#users=1] = get_attr[target=_param_constant56] | |
| %t_12 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant56,), kwargs = {}) | |
| %view_29 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_5 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_29, %t_12), kwargs = {}) | |
| %_unsafe_view_11 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_5, [2, 77, 512]), kwargs = {}) | |
| %view_30 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_10, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_6 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_30, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_5 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_6,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_12 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_5, [16, 77, 64]), kwargs = {}) | |
| %view_31 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_11, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_7 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_31, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_6 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_7,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_13 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_6, [16, 77, 64]), kwargs = {}) | |
| %empty_1 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_1 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_12, -1, -2), kwargs = {}) | |
| %baddbmm_1 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_1, %_unsafe_view_9, %transpose_1), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_1 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_1, -1, False), kwargs = {}) | |
| %detach_15 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_1,), kwargs = {}) | |
| %bmm_2 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_1, %_unsafe_view_13), kwargs = {}) | |
| %view_32 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_2, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_8 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_32, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_7 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_8,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_14 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_7, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant57 : [#users=1] = get_attr[target=_param_constant57] | |
| %t_13 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant57,), kwargs = {}) | |
| %view_33 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_14, [8192, 512]), kwargs = {}) | |
| %_param_constant58 : [#users=1] = get_attr[target=_param_constant58] | |
| %addmm_6 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant58, %view_33, %t_13), kwargs = {}) | |
| %view_34 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_6, [2, 4096, 512]), kwargs = {}) | |
| %add_27 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_34, %add_24), kwargs = {}) | |
| %convert_element_type_32 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_27, torch.float32), kwargs = {}) | |
| %var_mean_9 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_32, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_18 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_9, 0), kwargs = {}) | |
| %getitem_19 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_9, 1), kwargs = {}) | |
| %add_28 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_18, 1e-05), kwargs = {}) | |
| %rsqrt_9 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_28,), kwargs = {}) | |
| %sub_9 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_27, %getitem_19), kwargs = {}) | |
| %mul_21 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_9, %rsqrt_9), kwargs = {}) | |
| %_param_constant59 : [#users=1] = get_attr[target=_param_constant59] | |
| %mul_22 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_21, %_param_constant59), kwargs = {}) | |
| %_param_constant60 : [#users=1] = get_attr[target=_param_constant60] | |
| %add_29 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_22, %_param_constant60), kwargs = {}) | |
| %convert_element_type_33 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_29, torch.float16), kwargs = {}) | |
| %_param_constant61 : [#users=1] = get_attr[target=_param_constant61] | |
| %t_14 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant61,), kwargs = {}) | |
| %view_35 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_33, [8192, 512]), kwargs = {}) | |
| %_param_constant62 : [#users=1] = get_attr[target=_param_constant62] | |
| %addmm_7 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant62, %view_35, %t_14), kwargs = {}) | |
| %view_36 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_7, [2, 4096, 4096]), kwargs = {}) | |
| %slice_13 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_36, -1, 0, 2048), kwargs = {}) | |
| %slice_14 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_36, -1, 2048, 4096), kwargs = {}) | |
| %gelu : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_14,), kwargs = {}) | |
| %mul_23 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_13, %gelu), kwargs = {}) | |
| %_param_constant63 : [#users=1] = get_attr[target=_param_constant63] | |
| %t_15 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant63,), kwargs = {}) | |
| %view_37 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_23, [8192, 2048]), kwargs = {}) | |
| %_param_constant64 : [#users=1] = get_attr[target=_param_constant64] | |
| %addmm_8 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant64, %view_37, %t_15), kwargs = {}) | |
| %view_38 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_8, [2, 4096, 512]), kwargs = {}) | |
| %add_30 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_38, %add_27), kwargs = {}) | |
| %_param_constant65 : [#users=1] = get_attr[target=_param_constant65] | |
| %t_16 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant65,), kwargs = {}) | |
| %view_39 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_30, [8192, 512]), kwargs = {}) | |
| %_param_constant66 : [#users=1] = get_attr[target=_param_constant66] | |
| %addmm_9 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant66, %view_39, %t_16), kwargs = {}) | |
| %view_40 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_9, [2, 4096, 512]), kwargs = {}) | |
| %view_41 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_40, [2, 64, 64, 512]), kwargs = {}) | |
| %permute_9 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_41, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_8 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_9,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_31 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_8, %div_3), kwargs = {}) | |
| %view_42 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_31, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_34 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_42, torch.float32), kwargs = {}) | |
| %var_mean_10 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_34, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_20 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_10, 0), kwargs = {}) | |
| %getitem_21 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_10, 1), kwargs = {}) | |
| %add_32 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_20, 1e-05), kwargs = {}) | |
| %rsqrt_10 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_32,), kwargs = {}) | |
| %sub_10 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_42, %getitem_21), kwargs = {}) | |
| %mul_24 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_10, %rsqrt_10), kwargs = {}) | |
| %view_43 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_24, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant67 : [#users=1] = get_attr[target=_param_constant67] | |
| %unsqueeze_50 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant67, 0), kwargs = {}) | |
| %unsqueeze_51 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_50, 2), kwargs = {}) | |
| %unsqueeze_52 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_51, 3), kwargs = {}) | |
| %_param_constant68 : [#users=1] = get_attr[target=_param_constant68] | |
| %unsqueeze_53 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant68, 0), kwargs = {}) | |
| %unsqueeze_54 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_53, 2), kwargs = {}) | |
| %unsqueeze_55 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_54, 3), kwargs = {}) | |
| %mul_25 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_43, %unsqueeze_55), kwargs = {}) | |
| %add_33 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_25, %unsqueeze_52), kwargs = {}) | |
| %convert_element_type_35 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_33, torch.float16), kwargs = {}) | |
| %convert_element_type_36 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_21, torch.float16), kwargs = {}) | |
| %convert_element_type_37 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_10, torch.float16), kwargs = {}) | |
| %squeeze_28 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_36, 3), kwargs = {}) | |
| %squeeze_29 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_28, 2), kwargs = {}) | |
| %squeeze_30 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_37, 3), kwargs = {}) | |
| %squeeze_31 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_30, 2), kwargs = {}) | |
| %detach_16 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_29,), kwargs = {}) | |
| %detach_17 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_31,), kwargs = {}) | |
| %silu_10 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_35,), kwargs = {}) | |
| %_param_constant69 : [#users=1] = get_attr[target=_param_constant69] | |
| %_param_constant70 : [#users=1] = get_attr[target=_param_constant70] | |
| %convolution_9 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_10, %_param_constant69, %_param_constant70, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_11 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant71 : [#users=1] = get_attr[target=_param_constant71] | |
| %t_17 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant71,), kwargs = {}) | |
| %_param_constant72 : [#users=1] = get_attr[target=_param_constant72] | |
| %addmm_10 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant72, %silu_11, %t_17), kwargs = {}) | |
| %slice_15 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_10, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_16 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_15, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_56 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_16, 2), kwargs = {}) | |
| %unsqueeze_57 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_56, 3), kwargs = {}) | |
| %add_34 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_9, %unsqueeze_57), kwargs = {}) | |
| %view_44 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_34, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_38 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_44, torch.float32), kwargs = {}) | |
| %var_mean_11 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_38, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_22 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_11, 0), kwargs = {}) | |
| %getitem_23 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_11, 1), kwargs = {}) | |
| %add_35 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_22, 1e-05), kwargs = {}) | |
| %rsqrt_11 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_35,), kwargs = {}) | |
| %sub_11 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_44, %getitem_23), kwargs = {}) | |
| %mul_26 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_11, %rsqrt_11), kwargs = {}) | |
| %view_45 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_26, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant73 : [#users=1] = get_attr[target=_param_constant73] | |
| %unsqueeze_58 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant73, 0), kwargs = {}) | |
| %unsqueeze_59 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_58, 2), kwargs = {}) | |
| %unsqueeze_60 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_59, 3), kwargs = {}) | |
| %_param_constant74 : [#users=1] = get_attr[target=_param_constant74] | |
| %unsqueeze_61 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant74, 0), kwargs = {}) | |
| %unsqueeze_62 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_61, 2), kwargs = {}) | |
| %unsqueeze_63 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_62, 3), kwargs = {}) | |
| %mul_27 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_45, %unsqueeze_63), kwargs = {}) | |
| %add_36 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_27, %unsqueeze_60), kwargs = {}) | |
| %convert_element_type_39 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_36, torch.float16), kwargs = {}) | |
| %convert_element_type_40 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_23, torch.float16), kwargs = {}) | |
| %convert_element_type_41 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_11, torch.float16), kwargs = {}) | |
| %squeeze_32 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_40, 3), kwargs = {}) | |
| %squeeze_33 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_32, 2), kwargs = {}) | |
| %squeeze_34 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_41, 3), kwargs = {}) | |
| %squeeze_35 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_34, 2), kwargs = {}) | |
| %detach_18 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_33,), kwargs = {}) | |
| %detach_19 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_35,), kwargs = {}) | |
| %silu_12 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_39,), kwargs = {}) | |
| %_param_constant75 : [#users=1] = get_attr[target=_param_constant75] | |
| %_param_constant76 : [#users=1] = get_attr[target=_param_constant76] | |
| %convolution_10 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_12, %_param_constant75, %_param_constant76, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_37 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_31, %convolution_10), kwargs = {}) | |
| %div_4 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_37, 1.0), kwargs = {}) | |
| %view_46 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_4, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_42 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_46, torch.float32), kwargs = {}) | |
| %var_mean_12 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_42, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_24 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_12, 0), kwargs = {}) | |
| %getitem_25 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_12, 1), kwargs = {}) | |
| %add_38 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_24, 1e-06), kwargs = {}) | |
| %rsqrt_12 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_38,), kwargs = {}) | |
| %sub_12 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_46, %getitem_25), kwargs = {}) | |
| %mul_28 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_12, %rsqrt_12), kwargs = {}) | |
| %view_47 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_28, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant77 : [#users=1] = get_attr[target=_param_constant77] | |
| %unsqueeze_64 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant77, 0), kwargs = {}) | |
| %unsqueeze_65 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_64, 2), kwargs = {}) | |
| %unsqueeze_66 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_65, 3), kwargs = {}) | |
| %_param_constant78 : [#users=1] = get_attr[target=_param_constant78] | |
| %unsqueeze_67 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant78, 0), kwargs = {}) | |
| %unsqueeze_68 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_67, 2), kwargs = {}) | |
| %unsqueeze_69 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_68, 3), kwargs = {}) | |
| %mul_29 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_47, %unsqueeze_69), kwargs = {}) | |
| %add_39 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_29, %unsqueeze_66), kwargs = {}) | |
| %convert_element_type_43 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_39, torch.float16), kwargs = {}) | |
| %convert_element_type_44 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_25, torch.float16), kwargs = {}) | |
| %convert_element_type_45 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_12, torch.float16), kwargs = {}) | |
| %squeeze_36 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_44, 3), kwargs = {}) | |
| %squeeze_37 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_36, 2), kwargs = {}) | |
| %squeeze_38 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_45, 3), kwargs = {}) | |
| %squeeze_39 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_38, 2), kwargs = {}) | |
| %detach_20 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_37,), kwargs = {}) | |
| %detach_21 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_39,), kwargs = {}) | |
| %permute_10 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_43, [0, 2, 3, 1]), kwargs = {}) | |
| %view_48 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_10, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant79 : [#users=1] = get_attr[target=_param_constant79] | |
| %t_18 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant79,), kwargs = {}) | |
| %expand_3 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_48, [2, 4096, 512]), kwargs = {}) | |
| %view_49 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_3, [2, 4096, 512]), kwargs = {}) | |
| %expand_4 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_18, [2, 512, 512]), kwargs = {}) | |
| %view_50 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_4, [2, 512, 512]), kwargs = {}) | |
| %bmm_3 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_49, %view_50), kwargs = {}) | |
| %_unsafe_view_15 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_3, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant80 : [#users=1] = get_attr[target=_param_constant80] | |
| %add_40 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_15, %_param_constant80), kwargs = {}) | |
| %convert_element_type_46 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_40, torch.float32), kwargs = {}) | |
| %var_mean_13 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_46, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_26 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_13, 0), kwargs = {}) | |
| %getitem_27 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_13, 1), kwargs = {}) | |
| %add_41 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_26, 1e-05), kwargs = {}) | |
| %rsqrt_13 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_41,), kwargs = {}) | |
| %sub_13 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_40, %getitem_27), kwargs = {}) | |
| %mul_30 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_13, %rsqrt_13), kwargs = {}) | |
| %_param_constant81 : [#users=1] = get_attr[target=_param_constant81] | |
| %mul_31 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_30, %_param_constant81), kwargs = {}) | |
| %_param_constant82 : [#users=1] = get_attr[target=_param_constant82] | |
| %add_42 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_31, %_param_constant82), kwargs = {}) | |
| %convert_element_type_47 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_42, torch.float16), kwargs = {}) | |
| %_param_constant83 : [#users=1] = get_attr[target=_param_constant83] | |
| %t_19 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant83,), kwargs = {}) | |
| %view_51 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_47, [8192, 512]), kwargs = {}) | |
| %mm_6 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_51, %t_19), kwargs = {}) | |
| %_unsafe_view_16 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_6, [2, 4096, 512]), kwargs = {}) | |
| %view_52 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_16, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_11 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_52, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_9 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_11,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_17 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_9, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant84 : [#users=1] = get_attr[target=_param_constant84] | |
| %t_20 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant84,), kwargs = {}) | |
| %view_53 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_7 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_53, %t_20), kwargs = {}) | |
| %_unsafe_view_18 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_7, [2, 77, 512]), kwargs = {}) | |
| %_param_constant85 : [#users=1] = get_attr[target=_param_constant85] | |
| %t_21 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant85,), kwargs = {}) | |
| %view_54 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_8 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_54, %t_21), kwargs = {}) | |
| %_unsafe_view_19 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_8, [2, 77, 512]), kwargs = {}) | |
| %view_55 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_18, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_12 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_55, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_10 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_12,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_20 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_10, [16, 77, 64]), kwargs = {}) | |
| %view_56 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_19, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_13 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_56, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_11 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_13,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_21 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_11, [16, 77, 64]), kwargs = {}) | |
| %empty_2 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_2 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_20, -1, -2), kwargs = {}) | |
| %baddbmm_2 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_2, %_unsafe_view_17, %transpose_2), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_2 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_2, -1, False), kwargs = {}) | |
| %detach_22 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_2,), kwargs = {}) | |
| %bmm_4 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_2, %_unsafe_view_21), kwargs = {}) | |
| %view_57 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_4, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_14 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_57, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_12 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_14,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_22 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_12, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant86 : [#users=1] = get_attr[target=_param_constant86] | |
| %t_22 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant86,), kwargs = {}) | |
| %view_58 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_22, [8192, 512]), kwargs = {}) | |
| %_param_constant87 : [#users=1] = get_attr[target=_param_constant87] | |
| %addmm_11 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant87, %view_58, %t_22), kwargs = {}) | |
| %view_59 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_11, [2, 4096, 512]), kwargs = {}) | |
| %add_43 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_59, %add_40), kwargs = {}) | |
| %convert_element_type_48 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_43, torch.float32), kwargs = {}) | |
| %var_mean_14 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_48, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_28 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_14, 0), kwargs = {}) | |
| %getitem_29 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_14, 1), kwargs = {}) | |
| %add_44 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_28, 1e-05), kwargs = {}) | |
| %rsqrt_14 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_44,), kwargs = {}) | |
| %sub_14 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_43, %getitem_29), kwargs = {}) | |
| %mul_32 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_14, %rsqrt_14), kwargs = {}) | |
| %_param_constant88 : [#users=1] = get_attr[target=_param_constant88] | |
| %mul_33 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_32, %_param_constant88), kwargs = {}) | |
| %_param_constant89 : [#users=1] = get_attr[target=_param_constant89] | |
| %add_45 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_33, %_param_constant89), kwargs = {}) | |
| %convert_element_type_49 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_45, torch.float16), kwargs = {}) | |
| %_param_constant90 : [#users=1] = get_attr[target=_param_constant90] | |
| %t_23 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant90,), kwargs = {}) | |
| %view_60 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_49, [8192, 512]), kwargs = {}) | |
| %mm_9 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_60, %t_23), kwargs = {}) | |
| %_unsafe_view_23 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_9, [2, 4096, 512]), kwargs = {}) | |
| %view_61 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_23, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_15 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_61, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_13 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_15,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_24 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_13, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant91 : [#users=1] = get_attr[target=_param_constant91] | |
| %t_24 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant91,), kwargs = {}) | |
| %view_62 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_10 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_62, %t_24), kwargs = {}) | |
| %_unsafe_view_25 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_10, [2, 77, 512]), kwargs = {}) | |
| %_param_constant92 : [#users=1] = get_attr[target=_param_constant92] | |
| %t_25 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant92,), kwargs = {}) | |
| %view_63 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_11 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_63, %t_25), kwargs = {}) | |
| %_unsafe_view_26 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_11, [2, 77, 512]), kwargs = {}) | |
| %view_64 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_25, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_16 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_64, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_14 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_16,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_27 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_14, [16, 77, 64]), kwargs = {}) | |
| %view_65 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_26, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_17 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_65, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_15 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_17,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_28 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_15, [16, 77, 64]), kwargs = {}) | |
| %empty_3 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_3 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_27, -1, -2), kwargs = {}) | |
| %baddbmm_3 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_3, %_unsafe_view_24, %transpose_3), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_3 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_3, -1, False), kwargs = {}) | |
| %detach_23 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_3,), kwargs = {}) | |
| %bmm_5 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_3, %_unsafe_view_28), kwargs = {}) | |
| %view_66 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_5, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_18 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_66, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_16 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_18,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_29 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_16, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant93 : [#users=1] = get_attr[target=_param_constant93] | |
| %t_26 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant93,), kwargs = {}) | |
| %view_67 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_29, [8192, 512]), kwargs = {}) | |
| %_param_constant94 : [#users=1] = get_attr[target=_param_constant94] | |
| %addmm_12 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant94, %view_67, %t_26), kwargs = {}) | |
| %view_68 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_12, [2, 4096, 512]), kwargs = {}) | |
| %add_46 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_68, %add_43), kwargs = {}) | |
| %convert_element_type_50 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_46, torch.float32), kwargs = {}) | |
| %var_mean_15 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_50, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_30 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_15, 0), kwargs = {}) | |
| %getitem_31 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_15, 1), kwargs = {}) | |
| %add_47 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_30, 1e-05), kwargs = {}) | |
| %rsqrt_15 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_47,), kwargs = {}) | |
| %sub_15 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_46, %getitem_31), kwargs = {}) | |
| %mul_34 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_15, %rsqrt_15), kwargs = {}) | |
| %_param_constant95 : [#users=1] = get_attr[target=_param_constant95] | |
| %mul_35 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_34, %_param_constant95), kwargs = {}) | |
| %_param_constant96 : [#users=1] = get_attr[target=_param_constant96] | |
| %add_48 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_35, %_param_constant96), kwargs = {}) | |
| %convert_element_type_51 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_48, torch.float16), kwargs = {}) | |
| %_param_constant97 : [#users=1] = get_attr[target=_param_constant97] | |
| %t_27 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant97,), kwargs = {}) | |
| %view_69 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_51, [8192, 512]), kwargs = {}) | |
| %_param_constant98 : [#users=1] = get_attr[target=_param_constant98] | |
| %addmm_13 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant98, %view_69, %t_27), kwargs = {}) | |
| %view_70 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_13, [2, 4096, 4096]), kwargs = {}) | |
| %slice_17 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_70, -1, 0, 2048), kwargs = {}) | |
| %slice_18 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_70, -1, 2048, 4096), kwargs = {}) | |
| %gelu_1 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_18,), kwargs = {}) | |
| %mul_36 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_17, %gelu_1), kwargs = {}) | |
| %_param_constant99 : [#users=1] = get_attr[target=_param_constant99] | |
| %t_28 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant99,), kwargs = {}) | |
| %view_71 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_36, [8192, 2048]), kwargs = {}) | |
| %_param_constant100 : [#users=1] = get_attr[target=_param_constant100] | |
| %addmm_14 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant100, %view_71, %t_28), kwargs = {}) | |
| %view_72 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_14, [2, 4096, 512]), kwargs = {}) | |
| %add_49 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_72, %add_46), kwargs = {}) | |
| %_param_constant101 : [#users=1] = get_attr[target=_param_constant101] | |
| %t_29 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant101,), kwargs = {}) | |
| %view_73 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_49, [8192, 512]), kwargs = {}) | |
| %_param_constant102 : [#users=1] = get_attr[target=_param_constant102] | |
| %addmm_15 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant102, %view_73, %t_29), kwargs = {}) | |
| %view_74 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_15, [2, 4096, 512]), kwargs = {}) | |
| %view_75 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_74, [2, 64, 64, 512]), kwargs = {}) | |
| %permute_19 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_75, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_17 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_19,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_50 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_17, %div_4), kwargs = {}) | |
| %_param_constant103 : [#users=1] = get_attr[target=_param_constant103] | |
| %_param_constant104 : [#users=1] = get_attr[target=_param_constant104] | |
| %convolution_11 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%add_50, %_param_constant103, %_param_constant104, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %view_76 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_11, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_52 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_76, torch.float32), kwargs = {}) | |
| %var_mean_16 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_52, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_32 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_16, 0), kwargs = {}) | |
| %getitem_33 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_16, 1), kwargs = {}) | |
| %add_51 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_32, 1e-05), kwargs = {}) | |
| %rsqrt_16 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_51,), kwargs = {}) | |
| %sub_16 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_76, %getitem_33), kwargs = {}) | |
| %mul_37 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_16, %rsqrt_16), kwargs = {}) | |
| %view_77 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_37, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant105 : [#users=1] = get_attr[target=_param_constant105] | |
| %unsqueeze_70 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant105, 0), kwargs = {}) | |
| %unsqueeze_71 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_70, 2), kwargs = {}) | |
| %unsqueeze_72 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_71, 3), kwargs = {}) | |
| %_param_constant106 : [#users=1] = get_attr[target=_param_constant106] | |
| %unsqueeze_73 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant106, 0), kwargs = {}) | |
| %unsqueeze_74 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_73, 2), kwargs = {}) | |
| %unsqueeze_75 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_74, 3), kwargs = {}) | |
| %mul_38 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_77, %unsqueeze_75), kwargs = {}) | |
| %add_52 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_38, %unsqueeze_72), kwargs = {}) | |
| %convert_element_type_53 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_52, torch.float16), kwargs = {}) | |
| %convert_element_type_54 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_33, torch.float16), kwargs = {}) | |
| %convert_element_type_55 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_16, torch.float16), kwargs = {}) | |
| %squeeze_40 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_54, 3), kwargs = {}) | |
| %squeeze_41 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_40, 2), kwargs = {}) | |
| %squeeze_42 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_55, 3), kwargs = {}) | |
| %squeeze_43 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_42, 2), kwargs = {}) | |
| %detach_24 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_41,), kwargs = {}) | |
| %detach_25 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_43,), kwargs = {}) | |
| %silu_13 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_53,), kwargs = {}) | |
| %_param_constant107 : [#users=1] = get_attr[target=_param_constant107] | |
| %_param_constant108 : [#users=1] = get_attr[target=_param_constant108] | |
| %convolution_12 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_13, %_param_constant107, %_param_constant108, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_14 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant109 : [#users=1] = get_attr[target=_param_constant109] | |
| %t_30 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant109,), kwargs = {}) | |
| %_param_constant110 : [#users=1] = get_attr[target=_param_constant110] | |
| %addmm_16 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant110, %silu_14, %t_30), kwargs = {}) | |
| %slice_19 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_16, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_20 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_19, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_76 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_20, 2), kwargs = {}) | |
| %unsqueeze_77 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_76, 3), kwargs = {}) | |
| %add_53 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_12, %unsqueeze_77), kwargs = {}) | |
| %view_78 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_53, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_56 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_78, torch.float32), kwargs = {}) | |
| %var_mean_17 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_56, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_34 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_17, 0), kwargs = {}) | |
| %getitem_35 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_17, 1), kwargs = {}) | |
| %add_54 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_34, 1e-05), kwargs = {}) | |
| %rsqrt_17 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_54,), kwargs = {}) | |
| %sub_17 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_78, %getitem_35), kwargs = {}) | |
| %mul_39 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_17, %rsqrt_17), kwargs = {}) | |
| %view_79 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_39, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant111 : [#users=1] = get_attr[target=_param_constant111] | |
| %unsqueeze_78 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant111, 0), kwargs = {}) | |
| %unsqueeze_79 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_78, 2), kwargs = {}) | |
| %unsqueeze_80 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_79, 3), kwargs = {}) | |
| %_param_constant112 : [#users=1] = get_attr[target=_param_constant112] | |
| %unsqueeze_81 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant112, 0), kwargs = {}) | |
| %unsqueeze_82 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_81, 2), kwargs = {}) | |
| %unsqueeze_83 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_82, 3), kwargs = {}) | |
| %mul_40 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_79, %unsqueeze_83), kwargs = {}) | |
| %add_55 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_40, %unsqueeze_80), kwargs = {}) | |
| %convert_element_type_57 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_55, torch.float16), kwargs = {}) | |
| %convert_element_type_58 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_35, torch.float16), kwargs = {}) | |
| %convert_element_type_59 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_17, torch.float16), kwargs = {}) | |
| %squeeze_44 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_58, 3), kwargs = {}) | |
| %squeeze_45 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_44, 2), kwargs = {}) | |
| %squeeze_46 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_59, 3), kwargs = {}) | |
| %squeeze_47 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_46, 2), kwargs = {}) | |
| %detach_26 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_45,), kwargs = {}) | |
| %detach_27 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_47,), kwargs = {}) | |
| %silu_15 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_57,), kwargs = {}) | |
| %_param_constant113 : [#users=1] = get_attr[target=_param_constant113] | |
| %_param_constant114 : [#users=1] = get_attr[target=_param_constant114] | |
| %convolution_13 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_15, %_param_constant113, %_param_constant114, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_56 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_11, %convolution_13), kwargs = {}) | |
| %div_5 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_56, 1.0), kwargs = {}) | |
| %view_80 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_5, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_60 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_80, torch.float32), kwargs = {}) | |
| %var_mean_18 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_60, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_36 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_18, 0), kwargs = {}) | |
| %getitem_37 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_18, 1), kwargs = {}) | |
| %add_57 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_36, 1e-06), kwargs = {}) | |
| %rsqrt_18 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_57,), kwargs = {}) | |
| %sub_18 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_80, %getitem_37), kwargs = {}) | |
| %mul_41 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_18, %rsqrt_18), kwargs = {}) | |
| %view_81 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_41, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant115 : [#users=1] = get_attr[target=_param_constant115] | |
| %unsqueeze_84 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant115, 0), kwargs = {}) | |
| %unsqueeze_85 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_84, 2), kwargs = {}) | |
| %unsqueeze_86 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_85, 3), kwargs = {}) | |
| %_param_constant116 : [#users=1] = get_attr[target=_param_constant116] | |
| %unsqueeze_87 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant116, 0), kwargs = {}) | |
| %unsqueeze_88 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_87, 2), kwargs = {}) | |
| %unsqueeze_89 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_88, 3), kwargs = {}) | |
| %mul_42 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_81, %unsqueeze_89), kwargs = {}) | |
| %add_58 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_42, %unsqueeze_86), kwargs = {}) | |
| %convert_element_type_61 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_58, torch.float16), kwargs = {}) | |
| %convert_element_type_62 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_37, torch.float16), kwargs = {}) | |
| %convert_element_type_63 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_18, torch.float16), kwargs = {}) | |
| %squeeze_48 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_62, 3), kwargs = {}) | |
| %squeeze_49 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_48, 2), kwargs = {}) | |
| %squeeze_50 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_63, 3), kwargs = {}) | |
| %squeeze_51 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_50, 2), kwargs = {}) | |
| %detach_28 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_49,), kwargs = {}) | |
| %detach_29 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_51,), kwargs = {}) | |
| %permute_20 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_61, [0, 2, 3, 1]), kwargs = {}) | |
| %view_82 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_20, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant117 : [#users=1] = get_attr[target=_param_constant117] | |
| %t_31 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant117,), kwargs = {}) | |
| %expand_5 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_82, [2, 1024, 512]), kwargs = {}) | |
| %view_83 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_5, [2, 1024, 512]), kwargs = {}) | |
| %expand_6 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_31, [2, 512, 512]), kwargs = {}) | |
| %view_84 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_6, [2, 512, 512]), kwargs = {}) | |
| %bmm_6 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_83, %view_84), kwargs = {}) | |
| %_unsafe_view_30 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_6, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant118 : [#users=1] = get_attr[target=_param_constant118] | |
| %add_59 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_30, %_param_constant118), kwargs = {}) | |
| %convert_element_type_64 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_59, torch.float32), kwargs = {}) | |
| %var_mean_19 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_64, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_38 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_19, 0), kwargs = {}) | |
| %getitem_39 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_19, 1), kwargs = {}) | |
| %add_60 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_38, 1e-05), kwargs = {}) | |
| %rsqrt_19 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_60,), kwargs = {}) | |
| %sub_19 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_59, %getitem_39), kwargs = {}) | |
| %mul_43 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_19, %rsqrt_19), kwargs = {}) | |
| %_param_constant119 : [#users=1] = get_attr[target=_param_constant119] | |
| %mul_44 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_43, %_param_constant119), kwargs = {}) | |
| %_param_constant120 : [#users=1] = get_attr[target=_param_constant120] | |
| %add_61 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_44, %_param_constant120), kwargs = {}) | |
| %convert_element_type_65 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_61, torch.float16), kwargs = {}) | |
| %_param_constant121 : [#users=1] = get_attr[target=_param_constant121] | |
| %t_32 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant121,), kwargs = {}) | |
| %view_85 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_65, [2048, 512]), kwargs = {}) | |
| %mm_12 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_85, %t_32), kwargs = {}) | |
| %_unsafe_view_31 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_12, [2, 1024, 512]), kwargs = {}) | |
| %view_86 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_31, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_21 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_86, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_18 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_21,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_32 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_18, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant122 : [#users=1] = get_attr[target=_param_constant122] | |
| %t_33 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant122,), kwargs = {}) | |
| %view_87 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_13 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_87, %t_33), kwargs = {}) | |
| %_unsafe_view_33 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_13, [2, 77, 512]), kwargs = {}) | |
| %_param_constant123 : [#users=1] = get_attr[target=_param_constant123] | |
| %t_34 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant123,), kwargs = {}) | |
| %view_88 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_14 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_88, %t_34), kwargs = {}) | |
| %_unsafe_view_34 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_14, [2, 77, 512]), kwargs = {}) | |
| %view_89 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_33, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_22 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_89, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_19 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_22,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_35 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_19, [16, 77, 64]), kwargs = {}) | |
| %view_90 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_34, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_23 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_90, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_20 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_23,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_36 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_20, [16, 77, 64]), kwargs = {}) | |
| %empty_4 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_4 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_35, -1, -2), kwargs = {}) | |
| %baddbmm_4 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_4, %_unsafe_view_32, %transpose_4), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_4 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_4, -1, False), kwargs = {}) | |
| %detach_30 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_4,), kwargs = {}) | |
| %bmm_7 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_4, %_unsafe_view_36), kwargs = {}) | |
| %view_91 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_7, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_24 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_91, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_21 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_24,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_37 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_21, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant124 : [#users=1] = get_attr[target=_param_constant124] | |
| %t_35 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant124,), kwargs = {}) | |
| %view_92 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_37, [2048, 512]), kwargs = {}) | |
| %_param_constant125 : [#users=1] = get_attr[target=_param_constant125] | |
| %addmm_17 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant125, %view_92, %t_35), kwargs = {}) | |
| %view_93 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_17, [2, 1024, 512]), kwargs = {}) | |
| %add_62 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_93, %add_59), kwargs = {}) | |
| %convert_element_type_66 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_62, torch.float32), kwargs = {}) | |
| %var_mean_20 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_66, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_40 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_20, 0), kwargs = {}) | |
| %getitem_41 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_20, 1), kwargs = {}) | |
| %add_63 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_40, 1e-05), kwargs = {}) | |
| %rsqrt_20 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_63,), kwargs = {}) | |
| %sub_20 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_62, %getitem_41), kwargs = {}) | |
| %mul_45 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_20, %rsqrt_20), kwargs = {}) | |
| %_param_constant126 : [#users=1] = get_attr[target=_param_constant126] | |
| %mul_46 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_45, %_param_constant126), kwargs = {}) | |
| %_param_constant127 : [#users=1] = get_attr[target=_param_constant127] | |
| %add_64 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_46, %_param_constant127), kwargs = {}) | |
| %convert_element_type_67 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_64, torch.float16), kwargs = {}) | |
| %_param_constant128 : [#users=1] = get_attr[target=_param_constant128] | |
| %t_36 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant128,), kwargs = {}) | |
| %view_94 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_67, [2048, 512]), kwargs = {}) | |
| %mm_15 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_94, %t_36), kwargs = {}) | |
| %_unsafe_view_38 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_15, [2, 1024, 512]), kwargs = {}) | |
| %view_95 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_38, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_25 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_95, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_22 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_25,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_39 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_22, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant129 : [#users=1] = get_attr[target=_param_constant129] | |
| %t_37 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant129,), kwargs = {}) | |
| %view_96 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_16 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_96, %t_37), kwargs = {}) | |
| %_unsafe_view_40 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_16, [2, 77, 512]), kwargs = {}) | |
| %_param_constant130 : [#users=1] = get_attr[target=_param_constant130] | |
| %t_38 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant130,), kwargs = {}) | |
| %view_97 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_17 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_97, %t_38), kwargs = {}) | |
| %_unsafe_view_41 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_17, [2, 77, 512]), kwargs = {}) | |
| %view_98 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_40, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_26 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_98, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_23 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_26,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_42 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_23, [16, 77, 64]), kwargs = {}) | |
| %view_99 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_41, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_27 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_99, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_24 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_27,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_43 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_24, [16, 77, 64]), kwargs = {}) | |
| %empty_5 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_5 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_42, -1, -2), kwargs = {}) | |
| %baddbmm_5 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_5, %_unsafe_view_39, %transpose_5), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_5 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_5, -1, False), kwargs = {}) | |
| %detach_31 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_5,), kwargs = {}) | |
| %bmm_8 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_5, %_unsafe_view_43), kwargs = {}) | |
| %view_100 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_8, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_28 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_100, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_25 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_28,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_44 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_25, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant131 : [#users=1] = get_attr[target=_param_constant131] | |
| %t_39 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant131,), kwargs = {}) | |
| %view_101 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_44, [2048, 512]), kwargs = {}) | |
| %_param_constant132 : [#users=1] = get_attr[target=_param_constant132] | |
| %addmm_18 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant132, %view_101, %t_39), kwargs = {}) | |
| %view_102 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_18, [2, 1024, 512]), kwargs = {}) | |
| %add_65 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_102, %add_62), kwargs = {}) | |
| %convert_element_type_68 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_65, torch.float32), kwargs = {}) | |
| %var_mean_21 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_68, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_42 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_21, 0), kwargs = {}) | |
| %getitem_43 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_21, 1), kwargs = {}) | |
| %add_66 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_42, 1e-05), kwargs = {}) | |
| %rsqrt_21 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_66,), kwargs = {}) | |
| %sub_21 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_65, %getitem_43), kwargs = {}) | |
| %mul_47 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_21, %rsqrt_21), kwargs = {}) | |
| %_param_constant133 : [#users=1] = get_attr[target=_param_constant133] | |
| %mul_48 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_47, %_param_constant133), kwargs = {}) | |
| %_param_constant134 : [#users=1] = get_attr[target=_param_constant134] | |
| %add_67 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_48, %_param_constant134), kwargs = {}) | |
| %convert_element_type_69 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_67, torch.float16), kwargs = {}) | |
| %_param_constant135 : [#users=1] = get_attr[target=_param_constant135] | |
| %t_40 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant135,), kwargs = {}) | |
| %view_103 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_69, [2048, 512]), kwargs = {}) | |
| %_param_constant136 : [#users=1] = get_attr[target=_param_constant136] | |
| %addmm_19 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant136, %view_103, %t_40), kwargs = {}) | |
| %view_104 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_19, [2, 1024, 4096]), kwargs = {}) | |
| %slice_21 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_104, -1, 0, 2048), kwargs = {}) | |
| %slice_22 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_104, -1, 2048, 4096), kwargs = {}) | |
| %gelu_2 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_22,), kwargs = {}) | |
| %mul_49 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_21, %gelu_2), kwargs = {}) | |
| %_param_constant137 : [#users=1] = get_attr[target=_param_constant137] | |
| %t_41 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant137,), kwargs = {}) | |
| %view_105 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_49, [2048, 2048]), kwargs = {}) | |
| %_param_constant138 : [#users=1] = get_attr[target=_param_constant138] | |
| %addmm_20 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant138, %view_105, %t_41), kwargs = {}) | |
| %view_106 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_20, [2, 1024, 512]), kwargs = {}) | |
| %add_68 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_106, %add_65), kwargs = {}) | |
| %_param_constant139 : [#users=1] = get_attr[target=_param_constant139] | |
| %t_42 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant139,), kwargs = {}) | |
| %view_107 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_68, [2048, 512]), kwargs = {}) | |
| %_param_constant140 : [#users=1] = get_attr[target=_param_constant140] | |
| %addmm_21 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant140, %view_107, %t_42), kwargs = {}) | |
| %view_108 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_21, [2, 1024, 512]), kwargs = {}) | |
| %view_109 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_108, [2, 32, 32, 512]), kwargs = {}) | |
| %permute_29 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_109, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_26 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_29,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_69 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_26, %div_5), kwargs = {}) | |
| %view_110 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_69, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_70 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_110, torch.float32), kwargs = {}) | |
| %var_mean_22 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_70, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_44 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_22, 0), kwargs = {}) | |
| %getitem_45 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_22, 1), kwargs = {}) | |
| %add_70 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_44, 1e-05), kwargs = {}) | |
| %rsqrt_22 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_70,), kwargs = {}) | |
| %sub_22 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_110, %getitem_45), kwargs = {}) | |
| %mul_50 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_22, %rsqrt_22), kwargs = {}) | |
| %view_111 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_50, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant141 : [#users=1] = get_attr[target=_param_constant141] | |
| %unsqueeze_90 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant141, 0), kwargs = {}) | |
| %unsqueeze_91 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_90, 2), kwargs = {}) | |
| %unsqueeze_92 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_91, 3), kwargs = {}) | |
| %_param_constant142 : [#users=1] = get_attr[target=_param_constant142] | |
| %unsqueeze_93 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant142, 0), kwargs = {}) | |
| %unsqueeze_94 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_93, 2), kwargs = {}) | |
| %unsqueeze_95 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_94, 3), kwargs = {}) | |
| %mul_51 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_111, %unsqueeze_95), kwargs = {}) | |
| %add_71 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_51, %unsqueeze_92), kwargs = {}) | |
| %convert_element_type_71 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_71, torch.float16), kwargs = {}) | |
| %convert_element_type_72 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_45, torch.float16), kwargs = {}) | |
| %convert_element_type_73 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_22, torch.float16), kwargs = {}) | |
| %squeeze_52 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_72, 3), kwargs = {}) | |
| %squeeze_53 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_52, 2), kwargs = {}) | |
| %squeeze_54 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_73, 3), kwargs = {}) | |
| %squeeze_55 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_54, 2), kwargs = {}) | |
| %detach_32 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_53,), kwargs = {}) | |
| %detach_33 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_55,), kwargs = {}) | |
| %silu_16 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_71,), kwargs = {}) | |
| %_param_constant143 : [#users=1] = get_attr[target=_param_constant143] | |
| %_param_constant144 : [#users=1] = get_attr[target=_param_constant144] | |
| %convolution_14 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_16, %_param_constant143, %_param_constant144, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_17 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant145 : [#users=1] = get_attr[target=_param_constant145] | |
| %t_43 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant145,), kwargs = {}) | |
| %_param_constant146 : [#users=1] = get_attr[target=_param_constant146] | |
| %addmm_22 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant146, %silu_17, %t_43), kwargs = {}) | |
| %slice_23 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_22, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_24 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_23, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_96 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_24, 2), kwargs = {}) | |
| %unsqueeze_97 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_96, 3), kwargs = {}) | |
| %add_72 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_14, %unsqueeze_97), kwargs = {}) | |
| %view_112 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_72, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_74 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_112, torch.float32), kwargs = {}) | |
| %var_mean_23 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_74, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_46 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_23, 0), kwargs = {}) | |
| %getitem_47 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_23, 1), kwargs = {}) | |
| %add_73 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_46, 1e-05), kwargs = {}) | |
| %rsqrt_23 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_73,), kwargs = {}) | |
| %sub_23 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_112, %getitem_47), kwargs = {}) | |
| %mul_52 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_23, %rsqrt_23), kwargs = {}) | |
| %view_113 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_52, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant147 : [#users=1] = get_attr[target=_param_constant147] | |
| %unsqueeze_98 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant147, 0), kwargs = {}) | |
| %unsqueeze_99 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_98, 2), kwargs = {}) | |
| %unsqueeze_100 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_99, 3), kwargs = {}) | |
| %_param_constant148 : [#users=1] = get_attr[target=_param_constant148] | |
| %unsqueeze_101 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant148, 0), kwargs = {}) | |
| %unsqueeze_102 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_101, 2), kwargs = {}) | |
| %unsqueeze_103 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_102, 3), kwargs = {}) | |
| %mul_53 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_113, %unsqueeze_103), kwargs = {}) | |
| %add_74 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_53, %unsqueeze_100), kwargs = {}) | |
| %convert_element_type_75 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_74, torch.float16), kwargs = {}) | |
| %convert_element_type_76 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_47, torch.float16), kwargs = {}) | |
| %convert_element_type_77 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_23, torch.float16), kwargs = {}) | |
| %squeeze_56 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_76, 3), kwargs = {}) | |
| %squeeze_57 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_56, 2), kwargs = {}) | |
| %squeeze_58 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_77, 3), kwargs = {}) | |
| %squeeze_59 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_58, 2), kwargs = {}) | |
| %detach_34 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_57,), kwargs = {}) | |
| %detach_35 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_59,), kwargs = {}) | |
| %silu_18 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_75,), kwargs = {}) | |
| %_param_constant149 : [#users=1] = get_attr[target=_param_constant149] | |
| %_param_constant150 : [#users=1] = get_attr[target=_param_constant150] | |
| %convolution_15 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_18, %_param_constant149, %_param_constant150, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_75 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_69, %convolution_15), kwargs = {}) | |
| %div_6 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_75, 1.0), kwargs = {}) | |
| %view_114 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_6, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_78 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_114, torch.float32), kwargs = {}) | |
| %var_mean_24 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_78, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_48 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_24, 0), kwargs = {}) | |
| %getitem_49 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_24, 1), kwargs = {}) | |
| %add_76 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_48, 1e-06), kwargs = {}) | |
| %rsqrt_24 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_76,), kwargs = {}) | |
| %sub_24 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_114, %getitem_49), kwargs = {}) | |
| %mul_54 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_24, %rsqrt_24), kwargs = {}) | |
| %view_115 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_54, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant151 : [#users=1] = get_attr[target=_param_constant151] | |
| %unsqueeze_104 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant151, 0), kwargs = {}) | |
| %unsqueeze_105 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_104, 2), kwargs = {}) | |
| %unsqueeze_106 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_105, 3), kwargs = {}) | |
| %_param_constant152 : [#users=1] = get_attr[target=_param_constant152] | |
| %unsqueeze_107 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant152, 0), kwargs = {}) | |
| %unsqueeze_108 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_107, 2), kwargs = {}) | |
| %unsqueeze_109 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_108, 3), kwargs = {}) | |
| %mul_55 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_115, %unsqueeze_109), kwargs = {}) | |
| %add_77 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_55, %unsqueeze_106), kwargs = {}) | |
| %convert_element_type_79 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_77, torch.float16), kwargs = {}) | |
| %convert_element_type_80 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_49, torch.float16), kwargs = {}) | |
| %convert_element_type_81 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_24, torch.float16), kwargs = {}) | |
| %squeeze_60 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_80, 3), kwargs = {}) | |
| %squeeze_61 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_60, 2), kwargs = {}) | |
| %squeeze_62 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_81, 3), kwargs = {}) | |
| %squeeze_63 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_62, 2), kwargs = {}) | |
| %detach_36 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_61,), kwargs = {}) | |
| %detach_37 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_63,), kwargs = {}) | |
| %permute_30 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_79, [0, 2, 3, 1]), kwargs = {}) | |
| %view_116 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_30, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant153 : [#users=1] = get_attr[target=_param_constant153] | |
| %t_44 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant153,), kwargs = {}) | |
| %expand_7 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_116, [2, 1024, 512]), kwargs = {}) | |
| %view_117 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_7, [2, 1024, 512]), kwargs = {}) | |
| %expand_8 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_44, [2, 512, 512]), kwargs = {}) | |
| %view_118 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_8, [2, 512, 512]), kwargs = {}) | |
| %bmm_9 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_117, %view_118), kwargs = {}) | |
| %_unsafe_view_45 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_9, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant154 : [#users=1] = get_attr[target=_param_constant154] | |
| %add_78 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_45, %_param_constant154), kwargs = {}) | |
| %convert_element_type_82 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_78, torch.float32), kwargs = {}) | |
| %var_mean_25 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_82, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_50 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_25, 0), kwargs = {}) | |
| %getitem_51 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_25, 1), kwargs = {}) | |
| %add_79 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_50, 1e-05), kwargs = {}) | |
| %rsqrt_25 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_79,), kwargs = {}) | |
| %sub_25 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_78, %getitem_51), kwargs = {}) | |
| %mul_56 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_25, %rsqrt_25), kwargs = {}) | |
| %_param_constant155 : [#users=1] = get_attr[target=_param_constant155] | |
| %mul_57 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_56, %_param_constant155), kwargs = {}) | |
| %_param_constant156 : [#users=1] = get_attr[target=_param_constant156] | |
| %add_80 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_57, %_param_constant156), kwargs = {}) | |
| %convert_element_type_83 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_80, torch.float16), kwargs = {}) | |
| %_param_constant157 : [#users=1] = get_attr[target=_param_constant157] | |
| %t_45 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant157,), kwargs = {}) | |
| %view_119 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_83, [2048, 512]), kwargs = {}) | |
| %mm_18 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_119, %t_45), kwargs = {}) | |
| %_unsafe_view_46 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_18, [2, 1024, 512]), kwargs = {}) | |
| %view_120 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_46, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_31 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_120, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_27 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_31,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_47 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_27, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant158 : [#users=1] = get_attr[target=_param_constant158] | |
| %t_46 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant158,), kwargs = {}) | |
| %view_121 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_19 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_121, %t_46), kwargs = {}) | |
| %_unsafe_view_48 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_19, [2, 77, 512]), kwargs = {}) | |
| %_param_constant159 : [#users=1] = get_attr[target=_param_constant159] | |
| %t_47 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant159,), kwargs = {}) | |
| %view_122 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_20 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_122, %t_47), kwargs = {}) | |
| %_unsafe_view_49 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_20, [2, 77, 512]), kwargs = {}) | |
| %view_123 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_48, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_32 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_123, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_28 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_32,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_50 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_28, [16, 77, 64]), kwargs = {}) | |
| %view_124 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_49, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_33 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_124, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_29 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_33,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_51 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_29, [16, 77, 64]), kwargs = {}) | |
| %empty_6 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_6 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_50, -1, -2), kwargs = {}) | |
| %baddbmm_6 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_6, %_unsafe_view_47, %transpose_6), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_6 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_6, -1, False), kwargs = {}) | |
| %detach_38 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_6,), kwargs = {}) | |
| %bmm_10 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_6, %_unsafe_view_51), kwargs = {}) | |
| %view_125 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_10, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_34 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_125, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_30 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_34,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_52 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_30, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant160 : [#users=1] = get_attr[target=_param_constant160] | |
| %t_48 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant160,), kwargs = {}) | |
| %view_126 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_52, [2048, 512]), kwargs = {}) | |
| %_param_constant161 : [#users=1] = get_attr[target=_param_constant161] | |
| %addmm_23 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant161, %view_126, %t_48), kwargs = {}) | |
| %view_127 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_23, [2, 1024, 512]), kwargs = {}) | |
| %add_81 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_127, %add_78), kwargs = {}) | |
| %convert_element_type_84 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_81, torch.float32), kwargs = {}) | |
| %var_mean_26 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_84, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_52 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_26, 0), kwargs = {}) | |
| %getitem_53 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_26, 1), kwargs = {}) | |
| %add_82 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_52, 1e-05), kwargs = {}) | |
| %rsqrt_26 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_82,), kwargs = {}) | |
| %sub_26 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_81, %getitem_53), kwargs = {}) | |
| %mul_58 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_26, %rsqrt_26), kwargs = {}) | |
| %_param_constant162 : [#users=1] = get_attr[target=_param_constant162] | |
| %mul_59 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_58, %_param_constant162), kwargs = {}) | |
| %_param_constant163 : [#users=1] = get_attr[target=_param_constant163] | |
| %add_83 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_59, %_param_constant163), kwargs = {}) | |
| %convert_element_type_85 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_83, torch.float16), kwargs = {}) | |
| %_param_constant164 : [#users=1] = get_attr[target=_param_constant164] | |
| %t_49 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant164,), kwargs = {}) | |
| %view_128 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_85, [2048, 512]), kwargs = {}) | |
| %mm_21 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_128, %t_49), kwargs = {}) | |
| %_unsafe_view_53 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_21, [2, 1024, 512]), kwargs = {}) | |
| %view_129 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_53, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_35 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_129, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_31 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_35,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_54 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_31, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant165 : [#users=1] = get_attr[target=_param_constant165] | |
| %t_50 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant165,), kwargs = {}) | |
| %view_130 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_22 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_130, %t_50), kwargs = {}) | |
| %_unsafe_view_55 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_22, [2, 77, 512]), kwargs = {}) | |
| %_param_constant166 : [#users=1] = get_attr[target=_param_constant166] | |
| %t_51 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant166,), kwargs = {}) | |
| %view_131 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_23 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_131, %t_51), kwargs = {}) | |
| %_unsafe_view_56 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_23, [2, 77, 512]), kwargs = {}) | |
| %view_132 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_55, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_36 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_132, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_32 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_36,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_57 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_32, [16, 77, 64]), kwargs = {}) | |
| %view_133 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_56, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_37 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_133, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_33 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_37,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_58 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_33, [16, 77, 64]), kwargs = {}) | |
| %empty_7 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_7 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_57, -1, -2), kwargs = {}) | |
| %baddbmm_7 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_7, %_unsafe_view_54, %transpose_7), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_7 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_7, -1, False), kwargs = {}) | |
| %detach_39 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_7,), kwargs = {}) | |
| %bmm_11 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_7, %_unsafe_view_58), kwargs = {}) | |
| %view_134 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_11, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_38 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_134, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_34 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_38,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_59 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_34, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant167 : [#users=1] = get_attr[target=_param_constant167] | |
| %t_52 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant167,), kwargs = {}) | |
| %view_135 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_59, [2048, 512]), kwargs = {}) | |
| %_param_constant168 : [#users=1] = get_attr[target=_param_constant168] | |
| %addmm_24 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant168, %view_135, %t_52), kwargs = {}) | |
| %view_136 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_24, [2, 1024, 512]), kwargs = {}) | |
| %add_84 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_136, %add_81), kwargs = {}) | |
| %convert_element_type_86 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_84, torch.float32), kwargs = {}) | |
| %var_mean_27 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_86, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_54 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_27, 0), kwargs = {}) | |
| %getitem_55 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_27, 1), kwargs = {}) | |
| %add_85 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_54, 1e-05), kwargs = {}) | |
| %rsqrt_27 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_85,), kwargs = {}) | |
| %sub_27 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_84, %getitem_55), kwargs = {}) | |
| %mul_60 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_27, %rsqrt_27), kwargs = {}) | |
| %_param_constant169 : [#users=1] = get_attr[target=_param_constant169] | |
| %mul_61 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_60, %_param_constant169), kwargs = {}) | |
| %_param_constant170 : [#users=1] = get_attr[target=_param_constant170] | |
| %add_86 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_61, %_param_constant170), kwargs = {}) | |
| %convert_element_type_87 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_86, torch.float16), kwargs = {}) | |
| %_param_constant171 : [#users=1] = get_attr[target=_param_constant171] | |
| %t_53 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant171,), kwargs = {}) | |
| %view_137 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_87, [2048, 512]), kwargs = {}) | |
| %_param_constant172 : [#users=1] = get_attr[target=_param_constant172] | |
| %addmm_25 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant172, %view_137, %t_53), kwargs = {}) | |
| %view_138 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_25, [2, 1024, 4096]), kwargs = {}) | |
| %slice_25 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_138, -1, 0, 2048), kwargs = {}) | |
| %slice_26 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_138, -1, 2048, 4096), kwargs = {}) | |
| %gelu_3 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_26,), kwargs = {}) | |
| %mul_62 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_25, %gelu_3), kwargs = {}) | |
| %_param_constant173 : [#users=1] = get_attr[target=_param_constant173] | |
| %t_54 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant173,), kwargs = {}) | |
| %view_139 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_62, [2048, 2048]), kwargs = {}) | |
| %_param_constant174 : [#users=1] = get_attr[target=_param_constant174] | |
| %addmm_26 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant174, %view_139, %t_54), kwargs = {}) | |
| %view_140 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_26, [2, 1024, 512]), kwargs = {}) | |
| %add_87 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_140, %add_84), kwargs = {}) | |
| %_param_constant175 : [#users=1] = get_attr[target=_param_constant175] | |
| %t_55 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant175,), kwargs = {}) | |
| %view_141 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_87, [2048, 512]), kwargs = {}) | |
| %_param_constant176 : [#users=1] = get_attr[target=_param_constant176] | |
| %addmm_27 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant176, %view_141, %t_55), kwargs = {}) | |
| %view_142 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_27, [2, 1024, 512]), kwargs = {}) | |
| %view_143 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_142, [2, 32, 32, 512]), kwargs = {}) | |
| %permute_39 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_143, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_35 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_39,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_88 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_35, %div_6), kwargs = {}) | |
| %_param_constant177 : [#users=1] = get_attr[target=_param_constant177] | |
| %_param_constant178 : [#users=1] = get_attr[target=_param_constant178] | |
| %convolution_16 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%add_88, %_param_constant177, %_param_constant178, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %view_144 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_16, [2, 32, 16, 256]), kwargs = {}) | |
| %convert_element_type_88 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_144, torch.float32), kwargs = {}) | |
| %var_mean_28 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_88, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_56 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_28, 0), kwargs = {}) | |
| %getitem_57 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_28, 1), kwargs = {}) | |
| %add_89 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_56, 1e-05), kwargs = {}) | |
| %rsqrt_28 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_89,), kwargs = {}) | |
| %sub_28 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_144, %getitem_57), kwargs = {}) | |
| %mul_63 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_28, %rsqrt_28), kwargs = {}) | |
| %view_145 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_63, [2, 512, 16, 16]), kwargs = {}) | |
| %_param_constant179 : [#users=1] = get_attr[target=_param_constant179] | |
| %unsqueeze_110 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant179, 0), kwargs = {}) | |
| %unsqueeze_111 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_110, 2), kwargs = {}) | |
| %unsqueeze_112 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_111, 3), kwargs = {}) | |
| %_param_constant180 : [#users=1] = get_attr[target=_param_constant180] | |
| %unsqueeze_113 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant180, 0), kwargs = {}) | |
| %unsqueeze_114 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_113, 2), kwargs = {}) | |
| %unsqueeze_115 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_114, 3), kwargs = {}) | |
| %mul_64 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_145, %unsqueeze_115), kwargs = {}) | |
| %add_90 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_64, %unsqueeze_112), kwargs = {}) | |
| %convert_element_type_89 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_90, torch.float16), kwargs = {}) | |
| %convert_element_type_90 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_57, torch.float16), kwargs = {}) | |
| %convert_element_type_91 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_28, torch.float16), kwargs = {}) | |
| %squeeze_64 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_90, 3), kwargs = {}) | |
| %squeeze_65 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_64, 2), kwargs = {}) | |
| %squeeze_66 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_91, 3), kwargs = {}) | |
| %squeeze_67 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_66, 2), kwargs = {}) | |
| %detach_40 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_65,), kwargs = {}) | |
| %detach_41 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_67,), kwargs = {}) | |
| %silu_19 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_89,), kwargs = {}) | |
| %_param_constant181 : [#users=1] = get_attr[target=_param_constant181] | |
| %_param_constant182 : [#users=1] = get_attr[target=_param_constant182] | |
| %convolution_17 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_19, %_param_constant181, %_param_constant182, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_20 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant183 : [#users=1] = get_attr[target=_param_constant183] | |
| %t_56 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant183,), kwargs = {}) | |
| %_param_constant184 : [#users=1] = get_attr[target=_param_constant184] | |
| %addmm_28 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant184, %silu_20, %t_56), kwargs = {}) | |
| %slice_27 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_28, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_28 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_27, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_116 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_28, 2), kwargs = {}) | |
| %unsqueeze_117 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_116, 3), kwargs = {}) | |
| %add_91 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_17, %unsqueeze_117), kwargs = {}) | |
| %view_146 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_91, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_92 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_146, torch.float32), kwargs = {}) | |
| %var_mean_29 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_92, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_58 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_29, 0), kwargs = {}) | |
| %getitem_59 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_29, 1), kwargs = {}) | |
| %add_92 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_58, 1e-05), kwargs = {}) | |
| %rsqrt_29 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_92,), kwargs = {}) | |
| %sub_29 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_146, %getitem_59), kwargs = {}) | |
| %mul_65 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_29, %rsqrt_29), kwargs = {}) | |
| %view_147 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_65, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant185 : [#users=1] = get_attr[target=_param_constant185] | |
| %unsqueeze_118 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant185, 0), kwargs = {}) | |
| %unsqueeze_119 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_118, 2), kwargs = {}) | |
| %unsqueeze_120 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_119, 3), kwargs = {}) | |
| %_param_constant186 : [#users=1] = get_attr[target=_param_constant186] | |
| %unsqueeze_121 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant186, 0), kwargs = {}) | |
| %unsqueeze_122 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_121, 2), kwargs = {}) | |
| %unsqueeze_123 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_122, 3), kwargs = {}) | |
| %mul_66 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_147, %unsqueeze_123), kwargs = {}) | |
| %add_93 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_66, %unsqueeze_120), kwargs = {}) | |
| %convert_element_type_93 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_93, torch.float16), kwargs = {}) | |
| %convert_element_type_94 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_59, torch.float16), kwargs = {}) | |
| %convert_element_type_95 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_29, torch.float16), kwargs = {}) | |
| %squeeze_68 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_94, 3), kwargs = {}) | |
| %squeeze_69 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_68, 2), kwargs = {}) | |
| %squeeze_70 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_95, 3), kwargs = {}) | |
| %squeeze_71 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_70, 2), kwargs = {}) | |
| %detach_42 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_69,), kwargs = {}) | |
| %detach_43 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_71,), kwargs = {}) | |
| %silu_21 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_93,), kwargs = {}) | |
| %_param_constant187 : [#users=1] = get_attr[target=_param_constant187] | |
| %_param_constant188 : [#users=1] = get_attr[target=_param_constant188] | |
| %convolution_18 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_21, %_param_constant187, %_param_constant188, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant189 : [#users=1] = get_attr[target=_param_constant189] | |
| %_param_constant190 : [#users=1] = get_attr[target=_param_constant190] | |
| %convolution_19 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%convolution_16, %_param_constant189, %_param_constant190, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_94 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_19, %convolution_18), kwargs = {}) | |
| %div_7 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_94, 1.0), kwargs = {}) | |
| %view_148 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_7, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_96 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_148, torch.float32), kwargs = {}) | |
| %var_mean_30 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_96, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_60 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_30, 0), kwargs = {}) | |
| %getitem_61 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_30, 1), kwargs = {}) | |
| %add_95 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_60, 1e-06), kwargs = {}) | |
| %rsqrt_30 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_95,), kwargs = {}) | |
| %sub_30 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_148, %getitem_61), kwargs = {}) | |
| %mul_67 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_30, %rsqrt_30), kwargs = {}) | |
| %view_149 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_67, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant191 : [#users=1] = get_attr[target=_param_constant191] | |
| %unsqueeze_124 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant191, 0), kwargs = {}) | |
| %unsqueeze_125 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_124, 2), kwargs = {}) | |
| %unsqueeze_126 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_125, 3), kwargs = {}) | |
| %_param_constant192 : [#users=1] = get_attr[target=_param_constant192] | |
| %unsqueeze_127 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant192, 0), kwargs = {}) | |
| %unsqueeze_128 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_127, 2), kwargs = {}) | |
| %unsqueeze_129 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_128, 3), kwargs = {}) | |
| %mul_68 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_149, %unsqueeze_129), kwargs = {}) | |
| %add_96 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_68, %unsqueeze_126), kwargs = {}) | |
| %convert_element_type_97 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_96, torch.float16), kwargs = {}) | |
| %convert_element_type_98 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_61, torch.float16), kwargs = {}) | |
| %convert_element_type_99 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_30, torch.float16), kwargs = {}) | |
| %squeeze_72 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_98, 3), kwargs = {}) | |
| %squeeze_73 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_72, 2), kwargs = {}) | |
| %squeeze_74 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_99, 3), kwargs = {}) | |
| %squeeze_75 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_74, 2), kwargs = {}) | |
| %detach_44 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_73,), kwargs = {}) | |
| %detach_45 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_75,), kwargs = {}) | |
| %permute_40 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_97, [0, 2, 3, 1]), kwargs = {}) | |
| %view_150 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_40, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant193 : [#users=1] = get_attr[target=_param_constant193] | |
| %t_57 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant193,), kwargs = {}) | |
| %expand_9 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_150, [2, 256, 1024]), kwargs = {}) | |
| %view_151 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_9, [2, 256, 1024]), kwargs = {}) | |
| %expand_10 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_57, [2, 1024, 1024]), kwargs = {}) | |
| %view_152 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_10, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_12 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_151, %view_152), kwargs = {}) | |
| %_unsafe_view_60 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_12, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant194 : [#users=1] = get_attr[target=_param_constant194] | |
| %add_97 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_60, %_param_constant194), kwargs = {}) | |
| %convert_element_type_100 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_97, torch.float32), kwargs = {}) | |
| %var_mean_31 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_100, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_62 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_31, 0), kwargs = {}) | |
| %getitem_63 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_31, 1), kwargs = {}) | |
| %add_98 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_62, 1e-05), kwargs = {}) | |
| %rsqrt_31 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_98,), kwargs = {}) | |
| %sub_31 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_97, %getitem_63), kwargs = {}) | |
| %mul_69 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_31, %rsqrt_31), kwargs = {}) | |
| %_param_constant195 : [#users=1] = get_attr[target=_param_constant195] | |
| %mul_70 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_69, %_param_constant195), kwargs = {}) | |
| %_param_constant196 : [#users=1] = get_attr[target=_param_constant196] | |
| %add_99 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_70, %_param_constant196), kwargs = {}) | |
| %convert_element_type_101 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_99, torch.float16), kwargs = {}) | |
| %_param_constant197 : [#users=1] = get_attr[target=_param_constant197] | |
| %t_58 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant197,), kwargs = {}) | |
| %view_153 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_101, [512, 1024]), kwargs = {}) | |
| %mm_24 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_153, %t_58), kwargs = {}) | |
| %_unsafe_view_61 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_24, [2, 256, 1024]), kwargs = {}) | |
| %view_154 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_61, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_41 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_154, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_36 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_41,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_62 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_36, [16, 256, 128]), kwargs = {}) | |
| %_param_constant198 : [#users=1] = get_attr[target=_param_constant198] | |
| %t_59 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant198,), kwargs = {}) | |
| %view_155 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_101, [512, 1024]), kwargs = {}) | |
| %mm_25 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_155, %t_59), kwargs = {}) | |
| %_unsafe_view_63 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_25, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant199 : [#users=1] = get_attr[target=_param_constant199] | |
| %t_60 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant199,), kwargs = {}) | |
| %view_156 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_101, [512, 1024]), kwargs = {}) | |
| %mm_26 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_156, %t_60), kwargs = {}) | |
| %_unsafe_view_64 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_26, [2, 256, 1024]), kwargs = {}) | |
| %view_157 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_63, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_42 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_157, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_37 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_42,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_65 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_37, [16, 256, 128]), kwargs = {}) | |
| %view_158 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_64, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_43 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_158, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_38 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_43,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_66 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_38, [16, 256, 128]), kwargs = {}) | |
| %empty_8 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_8 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_65, -1, -2), kwargs = {}) | |
| %baddbmm_8 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_8, %_unsafe_view_62, %transpose_8), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_8 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_8, -1, False), kwargs = {}) | |
| %detach_46 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_8,), kwargs = {}) | |
| %bmm_13 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_8, %_unsafe_view_66), kwargs = {}) | |
| %view_159 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_13, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_44 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_159, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_39 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_44,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_67 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_39, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant200 : [#users=1] = get_attr[target=_param_constant200] | |
| %t_61 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant200,), kwargs = {}) | |
| %view_160 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_67, [512, 1024]), kwargs = {}) | |
| %_param_constant201 : [#users=1] = get_attr[target=_param_constant201] | |
| %addmm_29 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant201, %view_160, %t_61), kwargs = {}) | |
| %view_161 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_29, [2, 256, 1024]), kwargs = {}) | |
| %add_100 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_161, %add_97), kwargs = {}) | |
| %convert_element_type_102 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_100, torch.float32), kwargs = {}) | |
| %var_mean_32 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_102, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_64 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_32, 0), kwargs = {}) | |
| %getitem_65 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_32, 1), kwargs = {}) | |
| %add_101 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_64, 1e-05), kwargs = {}) | |
| %rsqrt_32 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_101,), kwargs = {}) | |
| %sub_32 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_100, %getitem_65), kwargs = {}) | |
| %mul_71 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_32, %rsqrt_32), kwargs = {}) | |
| %_param_constant202 : [#users=1] = get_attr[target=_param_constant202] | |
| %mul_72 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_71, %_param_constant202), kwargs = {}) | |
| %_param_constant203 : [#users=1] = get_attr[target=_param_constant203] | |
| %add_102 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_72, %_param_constant203), kwargs = {}) | |
| %convert_element_type_103 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_102, torch.float16), kwargs = {}) | |
| %_param_constant204 : [#users=1] = get_attr[target=_param_constant204] | |
| %t_62 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant204,), kwargs = {}) | |
| %view_162 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_103, [512, 1024]), kwargs = {}) | |
| %mm_27 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_162, %t_62), kwargs = {}) | |
| %_unsafe_view_68 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_27, [2, 256, 1024]), kwargs = {}) | |
| %view_163 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_68, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_45 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_163, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_40 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_45,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_69 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_40, [16, 256, 128]), kwargs = {}) | |
| %_param_constant205 : [#users=1] = get_attr[target=_param_constant205] | |
| %t_63 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant205,), kwargs = {}) | |
| %view_164 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_28 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_164, %t_63), kwargs = {}) | |
| %_unsafe_view_70 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_28, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant206 : [#users=1] = get_attr[target=_param_constant206] | |
| %t_64 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant206,), kwargs = {}) | |
| %view_165 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_29 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_165, %t_64), kwargs = {}) | |
| %_unsafe_view_71 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_29, [2, 77, 1024]), kwargs = {}) | |
| %view_166 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_70, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_46 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_166, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_41 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_46,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_72 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_41, [16, 77, 128]), kwargs = {}) | |
| %view_167 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_71, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_47 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_167, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_42 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_47,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_73 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_42, [16, 77, 128]), kwargs = {}) | |
| %empty_9 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_9 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_72, -1, -2), kwargs = {}) | |
| %baddbmm_9 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_9, %_unsafe_view_69, %transpose_9), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_9 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_9, -1, False), kwargs = {}) | |
| %detach_47 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_9,), kwargs = {}) | |
| %bmm_14 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_9, %_unsafe_view_73), kwargs = {}) | |
| %view_168 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_14, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_48 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_168, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_43 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_48,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_74 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_43, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant207 : [#users=1] = get_attr[target=_param_constant207] | |
| %t_65 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant207,), kwargs = {}) | |
| %view_169 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_74, [512, 1024]), kwargs = {}) | |
| %_param_constant208 : [#users=1] = get_attr[target=_param_constant208] | |
| %addmm_30 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant208, %view_169, %t_65), kwargs = {}) | |
| %view_170 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_30, [2, 256, 1024]), kwargs = {}) | |
| %add_103 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_170, %add_100), kwargs = {}) | |
| %convert_element_type_104 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_103, torch.float32), kwargs = {}) | |
| %var_mean_33 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_104, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_66 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_33, 0), kwargs = {}) | |
| %getitem_67 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_33, 1), kwargs = {}) | |
| %add_104 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_66, 1e-05), kwargs = {}) | |
| %rsqrt_33 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_104,), kwargs = {}) | |
| %sub_33 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_103, %getitem_67), kwargs = {}) | |
| %mul_73 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_33, %rsqrt_33), kwargs = {}) | |
| %_param_constant209 : [#users=1] = get_attr[target=_param_constant209] | |
| %mul_74 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_73, %_param_constant209), kwargs = {}) | |
| %_param_constant210 : [#users=1] = get_attr[target=_param_constant210] | |
| %add_105 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_74, %_param_constant210), kwargs = {}) | |
| %convert_element_type_105 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_105, torch.float16), kwargs = {}) | |
| %_param_constant211 : [#users=1] = get_attr[target=_param_constant211] | |
| %t_66 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant211,), kwargs = {}) | |
| %view_171 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_105, [512, 1024]), kwargs = {}) | |
| %_param_constant212 : [#users=1] = get_attr[target=_param_constant212] | |
| %addmm_31 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant212, %view_171, %t_66), kwargs = {}) | |
| %view_172 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_31, [2, 256, 8192]), kwargs = {}) | |
| %slice_29 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_172, -1, 0, 4096), kwargs = {}) | |
| %slice_30 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_172, -1, 4096, 8192), kwargs = {}) | |
| %gelu_4 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_30,), kwargs = {}) | |
| %mul_75 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_29, %gelu_4), kwargs = {}) | |
| %_param_constant213 : [#users=1] = get_attr[target=_param_constant213] | |
| %t_67 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant213,), kwargs = {}) | |
| %view_173 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_75, [512, 4096]), kwargs = {}) | |
| %_param_constant214 : [#users=1] = get_attr[target=_param_constant214] | |
| %addmm_32 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant214, %view_173, %t_67), kwargs = {}) | |
| %view_174 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_32, [2, 256, 1024]), kwargs = {}) | |
| %add_106 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_174, %add_103), kwargs = {}) | |
| %_param_constant215 : [#users=1] = get_attr[target=_param_constant215] | |
| %t_68 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant215,), kwargs = {}) | |
| %view_175 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_106, [512, 1024]), kwargs = {}) | |
| %_param_constant216 : [#users=1] = get_attr[target=_param_constant216] | |
| %addmm_33 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant216, %view_175, %t_68), kwargs = {}) | |
| %view_176 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_33, [2, 256, 1024]), kwargs = {}) | |
| %view_177 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_176, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_49 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_177, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_44 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_49,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_107 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_44, %div_7), kwargs = {}) | |
| %view_178 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_107, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_106 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_178, torch.float32), kwargs = {}) | |
| %var_mean_34 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_106, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_68 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_34, 0), kwargs = {}) | |
| %getitem_69 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_34, 1), kwargs = {}) | |
| %add_108 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_68, 1e-05), kwargs = {}) | |
| %rsqrt_34 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_108,), kwargs = {}) | |
| %sub_34 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_178, %getitem_69), kwargs = {}) | |
| %mul_76 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_34, %rsqrt_34), kwargs = {}) | |
| %view_179 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_76, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant217 : [#users=1] = get_attr[target=_param_constant217] | |
| %unsqueeze_130 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant217, 0), kwargs = {}) | |
| %unsqueeze_131 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_130, 2), kwargs = {}) | |
| %unsqueeze_132 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_131, 3), kwargs = {}) | |
| %_param_constant218 : [#users=1] = get_attr[target=_param_constant218] | |
| %unsqueeze_133 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant218, 0), kwargs = {}) | |
| %unsqueeze_134 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_133, 2), kwargs = {}) | |
| %unsqueeze_135 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_134, 3), kwargs = {}) | |
| %mul_77 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_179, %unsqueeze_135), kwargs = {}) | |
| %add_109 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_77, %unsqueeze_132), kwargs = {}) | |
| %convert_element_type_107 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_109, torch.float16), kwargs = {}) | |
| %convert_element_type_108 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_69, torch.float16), kwargs = {}) | |
| %convert_element_type_109 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_34, torch.float16), kwargs = {}) | |
| %squeeze_76 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_108, 3), kwargs = {}) | |
| %squeeze_77 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_76, 2), kwargs = {}) | |
| %squeeze_78 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_109, 3), kwargs = {}) | |
| %squeeze_79 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_78, 2), kwargs = {}) | |
| %detach_48 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_77,), kwargs = {}) | |
| %detach_49 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_79,), kwargs = {}) | |
| %silu_22 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_107,), kwargs = {}) | |
| %_param_constant219 : [#users=1] = get_attr[target=_param_constant219] | |
| %_param_constant220 : [#users=1] = get_attr[target=_param_constant220] | |
| %convolution_20 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_22, %_param_constant219, %_param_constant220, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_23 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant221 : [#users=1] = get_attr[target=_param_constant221] | |
| %t_69 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant221,), kwargs = {}) | |
| %_param_constant222 : [#users=1] = get_attr[target=_param_constant222] | |
| %addmm_34 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant222, %silu_23, %t_69), kwargs = {}) | |
| %slice_31 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_34, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_32 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_31, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_136 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_32, 2), kwargs = {}) | |
| %unsqueeze_137 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_136, 3), kwargs = {}) | |
| %add_110 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_20, %unsqueeze_137), kwargs = {}) | |
| %view_180 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_110, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_110 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_180, torch.float32), kwargs = {}) | |
| %var_mean_35 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_110, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_70 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_35, 0), kwargs = {}) | |
| %getitem_71 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_35, 1), kwargs = {}) | |
| %add_111 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_70, 1e-05), kwargs = {}) | |
| %rsqrt_35 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_111,), kwargs = {}) | |
| %sub_35 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_180, %getitem_71), kwargs = {}) | |
| %mul_78 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_35, %rsqrt_35), kwargs = {}) | |
| %view_181 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_78, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant223 : [#users=1] = get_attr[target=_param_constant223] | |
| %unsqueeze_138 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant223, 0), kwargs = {}) | |
| %unsqueeze_139 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_138, 2), kwargs = {}) | |
| %unsqueeze_140 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_139, 3), kwargs = {}) | |
| %_param_constant224 : [#users=1] = get_attr[target=_param_constant224] | |
| %unsqueeze_141 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant224, 0), kwargs = {}) | |
| %unsqueeze_142 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_141, 2), kwargs = {}) | |
| %unsqueeze_143 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_142, 3), kwargs = {}) | |
| %mul_79 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_181, %unsqueeze_143), kwargs = {}) | |
| %add_112 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_79, %unsqueeze_140), kwargs = {}) | |
| %convert_element_type_111 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_112, torch.float16), kwargs = {}) | |
| %convert_element_type_112 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_71, torch.float16), kwargs = {}) | |
| %convert_element_type_113 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_35, torch.float16), kwargs = {}) | |
| %squeeze_80 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_112, 3), kwargs = {}) | |
| %squeeze_81 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_80, 2), kwargs = {}) | |
| %squeeze_82 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_113, 3), kwargs = {}) | |
| %squeeze_83 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_82, 2), kwargs = {}) | |
| %detach_50 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_81,), kwargs = {}) | |
| %detach_51 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_83,), kwargs = {}) | |
| %silu_24 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_111,), kwargs = {}) | |
| %_param_constant225 : [#users=1] = get_attr[target=_param_constant225] | |
| %_param_constant226 : [#users=1] = get_attr[target=_param_constant226] | |
| %convolution_21 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_24, %_param_constant225, %_param_constant226, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_113 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_107, %convolution_21), kwargs = {}) | |
| %div_8 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_113, 1.0), kwargs = {}) | |
| %view_182 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_8, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_114 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_182, torch.float32), kwargs = {}) | |
| %var_mean_36 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_114, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_72 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_36, 0), kwargs = {}) | |
| %getitem_73 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_36, 1), kwargs = {}) | |
| %add_114 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_72, 1e-06), kwargs = {}) | |
| %rsqrt_36 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_114,), kwargs = {}) | |
| %sub_36 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_182, %getitem_73), kwargs = {}) | |
| %mul_80 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_36, %rsqrt_36), kwargs = {}) | |
| %view_183 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_80, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant227 : [#users=1] = get_attr[target=_param_constant227] | |
| %unsqueeze_144 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant227, 0), kwargs = {}) | |
| %unsqueeze_145 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_144, 2), kwargs = {}) | |
| %unsqueeze_146 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_145, 3), kwargs = {}) | |
| %_param_constant228 : [#users=1] = get_attr[target=_param_constant228] | |
| %unsqueeze_147 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant228, 0), kwargs = {}) | |
| %unsqueeze_148 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_147, 2), kwargs = {}) | |
| %unsqueeze_149 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_148, 3), kwargs = {}) | |
| %mul_81 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_183, %unsqueeze_149), kwargs = {}) | |
| %add_115 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_81, %unsqueeze_146), kwargs = {}) | |
| %convert_element_type_115 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_115, torch.float16), kwargs = {}) | |
| %convert_element_type_116 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_73, torch.float16), kwargs = {}) | |
| %convert_element_type_117 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_36, torch.float16), kwargs = {}) | |
| %squeeze_84 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_116, 3), kwargs = {}) | |
| %squeeze_85 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_84, 2), kwargs = {}) | |
| %squeeze_86 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_117, 3), kwargs = {}) | |
| %squeeze_87 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_86, 2), kwargs = {}) | |
| %detach_52 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_85,), kwargs = {}) | |
| %detach_53 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_87,), kwargs = {}) | |
| %permute_50 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_115, [0, 2, 3, 1]), kwargs = {}) | |
| %view_184 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_50, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant229 : [#users=1] = get_attr[target=_param_constant229] | |
| %t_70 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant229,), kwargs = {}) | |
| %expand_11 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_184, [2, 256, 1024]), kwargs = {}) | |
| %view_185 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_11, [2, 256, 1024]), kwargs = {}) | |
| %expand_12 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_70, [2, 1024, 1024]), kwargs = {}) | |
| %view_186 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_12, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_15 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_185, %view_186), kwargs = {}) | |
| %_unsafe_view_75 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_15, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant230 : [#users=1] = get_attr[target=_param_constant230] | |
| %add_116 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_75, %_param_constant230), kwargs = {}) | |
| %convert_element_type_118 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_116, torch.float32), kwargs = {}) | |
| %var_mean_37 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_118, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_74 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_37, 0), kwargs = {}) | |
| %getitem_75 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_37, 1), kwargs = {}) | |
| %add_117 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_74, 1e-05), kwargs = {}) | |
| %rsqrt_37 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_117,), kwargs = {}) | |
| %sub_37 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_116, %getitem_75), kwargs = {}) | |
| %mul_82 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_37, %rsqrt_37), kwargs = {}) | |
| %_param_constant231 : [#users=1] = get_attr[target=_param_constant231] | |
| %mul_83 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_82, %_param_constant231), kwargs = {}) | |
| %_param_constant232 : [#users=1] = get_attr[target=_param_constant232] | |
| %add_118 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_83, %_param_constant232), kwargs = {}) | |
| %convert_element_type_119 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_118, torch.float16), kwargs = {}) | |
| %_param_constant233 : [#users=1] = get_attr[target=_param_constant233] | |
| %t_71 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant233,), kwargs = {}) | |
| %view_187 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_119, [512, 1024]), kwargs = {}) | |
| %mm_30 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_187, %t_71), kwargs = {}) | |
| %_unsafe_view_76 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_30, [2, 256, 1024]), kwargs = {}) | |
| %view_188 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_76, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_51 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_188, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_45 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_51,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_77 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_45, [16, 256, 128]), kwargs = {}) | |
| %_param_constant234 : [#users=1] = get_attr[target=_param_constant234] | |
| %t_72 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant234,), kwargs = {}) | |
| %view_189 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_119, [512, 1024]), kwargs = {}) | |
| %mm_31 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_189, %t_72), kwargs = {}) | |
| %_unsafe_view_78 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_31, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant235 : [#users=1] = get_attr[target=_param_constant235] | |
| %t_73 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant235,), kwargs = {}) | |
| %view_190 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_119, [512, 1024]), kwargs = {}) | |
| %mm_32 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_190, %t_73), kwargs = {}) | |
| %_unsafe_view_79 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_32, [2, 256, 1024]), kwargs = {}) | |
| %view_191 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_78, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_52 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_191, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_46 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_52,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_80 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_46, [16, 256, 128]), kwargs = {}) | |
| %view_192 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_79, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_53 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_192, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_47 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_53,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_81 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_47, [16, 256, 128]), kwargs = {}) | |
| %empty_10 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_10 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_80, -1, -2), kwargs = {}) | |
| %baddbmm_10 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_10, %_unsafe_view_77, %transpose_10), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_10 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_10, -1, False), kwargs = {}) | |
| %detach_54 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_10,), kwargs = {}) | |
| %bmm_16 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_10, %_unsafe_view_81), kwargs = {}) | |
| %view_193 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_16, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_54 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_193, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_48 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_54,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_82 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_48, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant236 : [#users=1] = get_attr[target=_param_constant236] | |
| %t_74 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant236,), kwargs = {}) | |
| %view_194 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_82, [512, 1024]), kwargs = {}) | |
| %_param_constant237 : [#users=1] = get_attr[target=_param_constant237] | |
| %addmm_35 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant237, %view_194, %t_74), kwargs = {}) | |
| %view_195 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_35, [2, 256, 1024]), kwargs = {}) | |
| %add_119 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_195, %add_116), kwargs = {}) | |
| %convert_element_type_120 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_119, torch.float32), kwargs = {}) | |
| %var_mean_38 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_120, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_76 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_38, 0), kwargs = {}) | |
| %getitem_77 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_38, 1), kwargs = {}) | |
| %add_120 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_76, 1e-05), kwargs = {}) | |
| %rsqrt_38 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_120,), kwargs = {}) | |
| %sub_38 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_119, %getitem_77), kwargs = {}) | |
| %mul_84 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_38, %rsqrt_38), kwargs = {}) | |
| %_param_constant238 : [#users=1] = get_attr[target=_param_constant238] | |
| %mul_85 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_84, %_param_constant238), kwargs = {}) | |
| %_param_constant239 : [#users=1] = get_attr[target=_param_constant239] | |
| %add_121 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_85, %_param_constant239), kwargs = {}) | |
| %convert_element_type_121 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_121, torch.float16), kwargs = {}) | |
| %_param_constant240 : [#users=1] = get_attr[target=_param_constant240] | |
| %t_75 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant240,), kwargs = {}) | |
| %view_196 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_121, [512, 1024]), kwargs = {}) | |
| %mm_33 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_196, %t_75), kwargs = {}) | |
| %_unsafe_view_83 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_33, [2, 256, 1024]), kwargs = {}) | |
| %view_197 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_83, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_55 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_197, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_49 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_55,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_84 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_49, [16, 256, 128]), kwargs = {}) | |
| %_param_constant241 : [#users=1] = get_attr[target=_param_constant241] | |
| %t_76 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant241,), kwargs = {}) | |
| %view_198 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_34 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_198, %t_76), kwargs = {}) | |
| %_unsafe_view_85 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_34, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant242 : [#users=1] = get_attr[target=_param_constant242] | |
| %t_77 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant242,), kwargs = {}) | |
| %view_199 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_35 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_199, %t_77), kwargs = {}) | |
| %_unsafe_view_86 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_35, [2, 77, 1024]), kwargs = {}) | |
| %view_200 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_85, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_56 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_200, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_50 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_56,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_87 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_50, [16, 77, 128]), kwargs = {}) | |
| %view_201 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_86, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_57 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_201, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_51 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_57,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_88 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_51, [16, 77, 128]), kwargs = {}) | |
| %empty_11 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_11 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_87, -1, -2), kwargs = {}) | |
| %baddbmm_11 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_11, %_unsafe_view_84, %transpose_11), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_11 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_11, -1, False), kwargs = {}) | |
| %detach_55 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_11,), kwargs = {}) | |
| %bmm_17 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_11, %_unsafe_view_88), kwargs = {}) | |
| %view_202 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_17, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_58 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_202, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_52 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_58,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_89 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_52, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant243 : [#users=1] = get_attr[target=_param_constant243] | |
| %t_78 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant243,), kwargs = {}) | |
| %view_203 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_89, [512, 1024]), kwargs = {}) | |
| %_param_constant244 : [#users=1] = get_attr[target=_param_constant244] | |
| %addmm_36 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant244, %view_203, %t_78), kwargs = {}) | |
| %view_204 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_36, [2, 256, 1024]), kwargs = {}) | |
| %add_122 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_204, %add_119), kwargs = {}) | |
| %convert_element_type_122 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_122, torch.float32), kwargs = {}) | |
| %var_mean_39 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_122, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_78 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_39, 0), kwargs = {}) | |
| %getitem_79 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_39, 1), kwargs = {}) | |
| %add_123 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_78, 1e-05), kwargs = {}) | |
| %rsqrt_39 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_123,), kwargs = {}) | |
| %sub_39 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_122, %getitem_79), kwargs = {}) | |
| %mul_86 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_39, %rsqrt_39), kwargs = {}) | |
| %_param_constant245 : [#users=1] = get_attr[target=_param_constant245] | |
| %mul_87 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_86, %_param_constant245), kwargs = {}) | |
| %_param_constant246 : [#users=1] = get_attr[target=_param_constant246] | |
| %add_124 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_87, %_param_constant246), kwargs = {}) | |
| %convert_element_type_123 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_124, torch.float16), kwargs = {}) | |
| %_param_constant247 : [#users=1] = get_attr[target=_param_constant247] | |
| %t_79 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant247,), kwargs = {}) | |
| %view_205 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_123, [512, 1024]), kwargs = {}) | |
| %_param_constant248 : [#users=1] = get_attr[target=_param_constant248] | |
| %addmm_37 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant248, %view_205, %t_79), kwargs = {}) | |
| %view_206 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_37, [2, 256, 8192]), kwargs = {}) | |
| %slice_33 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_206, -1, 0, 4096), kwargs = {}) | |
| %slice_34 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_206, -1, 4096, 8192), kwargs = {}) | |
| %gelu_5 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_34,), kwargs = {}) | |
| %mul_88 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_33, %gelu_5), kwargs = {}) | |
| %_param_constant249 : [#users=1] = get_attr[target=_param_constant249] | |
| %t_80 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant249,), kwargs = {}) | |
| %view_207 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_88, [512, 4096]), kwargs = {}) | |
| %_param_constant250 : [#users=1] = get_attr[target=_param_constant250] | |
| %addmm_38 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant250, %view_207, %t_80), kwargs = {}) | |
| %view_208 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_38, [2, 256, 1024]), kwargs = {}) | |
| %add_125 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_208, %add_122), kwargs = {}) | |
| %_param_constant251 : [#users=1] = get_attr[target=_param_constant251] | |
| %t_81 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant251,), kwargs = {}) | |
| %view_209 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_125, [512, 1024]), kwargs = {}) | |
| %_param_constant252 : [#users=1] = get_attr[target=_param_constant252] | |
| %addmm_39 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant252, %view_209, %t_81), kwargs = {}) | |
| %view_210 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_39, [2, 256, 1024]), kwargs = {}) | |
| %view_211 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_210, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_59 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_211, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_53 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_59,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_126 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_53, %div_8), kwargs = {}) | |
| %view_212 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_126, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_124 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_212, torch.float32), kwargs = {}) | |
| %var_mean_40 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_124, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_80 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_40, 0), kwargs = {}) | |
| %getitem_81 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_40, 1), kwargs = {}) | |
| %add_127 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_80, 1e-05), kwargs = {}) | |
| %rsqrt_40 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_127,), kwargs = {}) | |
| %sub_40 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_212, %getitem_81), kwargs = {}) | |
| %mul_89 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_40, %rsqrt_40), kwargs = {}) | |
| %view_213 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_89, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant253 : [#users=1] = get_attr[target=_param_constant253] | |
| %unsqueeze_150 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant253, 0), kwargs = {}) | |
| %unsqueeze_151 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_150, 2), kwargs = {}) | |
| %unsqueeze_152 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_151, 3), kwargs = {}) | |
| %_param_constant254 : [#users=1] = get_attr[target=_param_constant254] | |
| %unsqueeze_153 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant254, 0), kwargs = {}) | |
| %unsqueeze_154 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_153, 2), kwargs = {}) | |
| %unsqueeze_155 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_154, 3), kwargs = {}) | |
| %mul_90 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_213, %unsqueeze_155), kwargs = {}) | |
| %add_128 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_90, %unsqueeze_152), kwargs = {}) | |
| %convert_element_type_125 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_128, torch.float16), kwargs = {}) | |
| %convert_element_type_126 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_81, torch.float16), kwargs = {}) | |
| %convert_element_type_127 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_40, torch.float16), kwargs = {}) | |
| %squeeze_88 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_126, 3), kwargs = {}) | |
| %squeeze_89 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_88, 2), kwargs = {}) | |
| %squeeze_90 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_127, 3), kwargs = {}) | |
| %squeeze_91 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_90, 2), kwargs = {}) | |
| %detach_56 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_89,), kwargs = {}) | |
| %detach_57 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_91,), kwargs = {}) | |
| %silu_25 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_125,), kwargs = {}) | |
| %_param_constant255 : [#users=1] = get_attr[target=_param_constant255] | |
| %_param_constant256 : [#users=1] = get_attr[target=_param_constant256] | |
| %convolution_22 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_25, %_param_constant255, %_param_constant256, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_26 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant257 : [#users=1] = get_attr[target=_param_constant257] | |
| %t_82 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant257,), kwargs = {}) | |
| %_param_constant258 : [#users=1] = get_attr[target=_param_constant258] | |
| %addmm_40 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant258, %silu_26, %t_82), kwargs = {}) | |
| %slice_35 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_40, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_36 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_35, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_156 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_36, 2), kwargs = {}) | |
| %unsqueeze_157 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_156, 3), kwargs = {}) | |
| %add_129 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_22, %unsqueeze_157), kwargs = {}) | |
| %view_214 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_129, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_128 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_214, torch.float32), kwargs = {}) | |
| %var_mean_41 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_128, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_82 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_41, 0), kwargs = {}) | |
| %getitem_83 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_41, 1), kwargs = {}) | |
| %add_130 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_82, 1e-05), kwargs = {}) | |
| %rsqrt_41 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_130,), kwargs = {}) | |
| %sub_41 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_214, %getitem_83), kwargs = {}) | |
| %mul_91 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_41, %rsqrt_41), kwargs = {}) | |
| %view_215 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_91, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant259 : [#users=1] = get_attr[target=_param_constant259] | |
| %unsqueeze_158 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant259, 0), kwargs = {}) | |
| %unsqueeze_159 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_158, 2), kwargs = {}) | |
| %unsqueeze_160 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_159, 3), kwargs = {}) | |
| %_param_constant260 : [#users=1] = get_attr[target=_param_constant260] | |
| %unsqueeze_161 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant260, 0), kwargs = {}) | |
| %unsqueeze_162 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_161, 2), kwargs = {}) | |
| %unsqueeze_163 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_162, 3), kwargs = {}) | |
| %mul_92 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_215, %unsqueeze_163), kwargs = {}) | |
| %add_131 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_92, %unsqueeze_160), kwargs = {}) | |
| %convert_element_type_129 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_131, torch.float16), kwargs = {}) | |
| %convert_element_type_130 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_83, torch.float16), kwargs = {}) | |
| %convert_element_type_131 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_41, torch.float16), kwargs = {}) | |
| %squeeze_92 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_130, 3), kwargs = {}) | |
| %squeeze_93 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_92, 2), kwargs = {}) | |
| %squeeze_94 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_131, 3), kwargs = {}) | |
| %squeeze_95 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_94, 2), kwargs = {}) | |
| %detach_58 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_93,), kwargs = {}) | |
| %detach_59 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_95,), kwargs = {}) | |
| %silu_27 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_129,), kwargs = {}) | |
| %_param_constant261 : [#users=1] = get_attr[target=_param_constant261] | |
| %_param_constant262 : [#users=1] = get_attr[target=_param_constant262] | |
| %convolution_23 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_27, %_param_constant261, %_param_constant262, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_132 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_126, %convolution_23), kwargs = {}) | |
| %div_9 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_132, 1), kwargs = {}) | |
| %view_216 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_9, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_132 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_216, torch.float32), kwargs = {}) | |
| %var_mean_42 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_132, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_84 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_42, 0), kwargs = {}) | |
| %getitem_85 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_42, 1), kwargs = {}) | |
| %add_133 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_84, 1e-06), kwargs = {}) | |
| %rsqrt_42 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_133,), kwargs = {}) | |
| %sub_42 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_216, %getitem_85), kwargs = {}) | |
| %mul_93 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_42, %rsqrt_42), kwargs = {}) | |
| %view_217 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_93, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant263 : [#users=1] = get_attr[target=_param_constant263] | |
| %unsqueeze_164 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant263, 0), kwargs = {}) | |
| %unsqueeze_165 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_164, 2), kwargs = {}) | |
| %unsqueeze_166 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_165, 3), kwargs = {}) | |
| %_param_constant264 : [#users=1] = get_attr[target=_param_constant264] | |
| %unsqueeze_167 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant264, 0), kwargs = {}) | |
| %unsqueeze_168 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_167, 2), kwargs = {}) | |
| %unsqueeze_169 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_168, 3), kwargs = {}) | |
| %mul_94 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_217, %unsqueeze_169), kwargs = {}) | |
| %add_134 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_94, %unsqueeze_166), kwargs = {}) | |
| %convert_element_type_133 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_134, torch.float16), kwargs = {}) | |
| %convert_element_type_134 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_85, torch.float16), kwargs = {}) | |
| %convert_element_type_135 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_42, torch.float16), kwargs = {}) | |
| %squeeze_96 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_134, 3), kwargs = {}) | |
| %squeeze_97 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_96, 2), kwargs = {}) | |
| %squeeze_98 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_135, 3), kwargs = {}) | |
| %squeeze_99 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_98, 2), kwargs = {}) | |
| %detach_60 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_97,), kwargs = {}) | |
| %detach_61 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_99,), kwargs = {}) | |
| %permute_60 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_133, [0, 2, 3, 1]), kwargs = {}) | |
| %view_218 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_60, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant265 : [#users=1] = get_attr[target=_param_constant265] | |
| %t_83 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant265,), kwargs = {}) | |
| %expand_13 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_218, [2, 256, 1024]), kwargs = {}) | |
| %view_219 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_13, [2, 256, 1024]), kwargs = {}) | |
| %expand_14 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_83, [2, 1024, 1024]), kwargs = {}) | |
| %view_220 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_14, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_18 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_219, %view_220), kwargs = {}) | |
| %_unsafe_view_90 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_18, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant266 : [#users=1] = get_attr[target=_param_constant266] | |
| %add_135 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_90, %_param_constant266), kwargs = {}) | |
| %convert_element_type_136 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_135, torch.float32), kwargs = {}) | |
| %var_mean_43 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_136, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_86 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_43, 0), kwargs = {}) | |
| %getitem_87 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_43, 1), kwargs = {}) | |
| %add_136 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_86, 1e-05), kwargs = {}) | |
| %rsqrt_43 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_136,), kwargs = {}) | |
| %sub_43 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_135, %getitem_87), kwargs = {}) | |
| %mul_95 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_43, %rsqrt_43), kwargs = {}) | |
| %_param_constant267 : [#users=1] = get_attr[target=_param_constant267] | |
| %mul_96 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_95, %_param_constant267), kwargs = {}) | |
| %_param_constant268 : [#users=1] = get_attr[target=_param_constant268] | |
| %add_137 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_96, %_param_constant268), kwargs = {}) | |
| %convert_element_type_137 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_137, torch.float16), kwargs = {}) | |
| %_param_constant269 : [#users=1] = get_attr[target=_param_constant269] | |
| %t_84 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant269,), kwargs = {}) | |
| %view_221 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_137, [512, 1024]), kwargs = {}) | |
| %mm_36 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_221, %t_84), kwargs = {}) | |
| %_unsafe_view_91 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_36, [2, 256, 1024]), kwargs = {}) | |
| %view_222 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_91, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_61 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_222, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_54 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_61,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_92 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_54, [16, 256, 128]), kwargs = {}) | |
| %_param_constant270 : [#users=1] = get_attr[target=_param_constant270] | |
| %t_85 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant270,), kwargs = {}) | |
| %view_223 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_137, [512, 1024]), kwargs = {}) | |
| %mm_37 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_223, %t_85), kwargs = {}) | |
| %_unsafe_view_93 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_37, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant271 : [#users=1] = get_attr[target=_param_constant271] | |
| %t_86 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant271,), kwargs = {}) | |
| %view_224 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_137, [512, 1024]), kwargs = {}) | |
| %mm_38 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_224, %t_86), kwargs = {}) | |
| %_unsafe_view_94 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_38, [2, 256, 1024]), kwargs = {}) | |
| %view_225 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_93, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_62 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_225, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_55 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_62,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_95 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_55, [16, 256, 128]), kwargs = {}) | |
| %view_226 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_94, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_63 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_226, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_56 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_63,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_96 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_56, [16, 256, 128]), kwargs = {}) | |
| %empty_12 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_12 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_95, -1, -2), kwargs = {}) | |
| %baddbmm_12 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_12, %_unsafe_view_92, %transpose_12), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_12 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_12, -1, False), kwargs = {}) | |
| %detach_62 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_12,), kwargs = {}) | |
| %bmm_19 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_12, %_unsafe_view_96), kwargs = {}) | |
| %view_227 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_19, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_64 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_227, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_57 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_64,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_97 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_57, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant272 : [#users=1] = get_attr[target=_param_constant272] | |
| %t_87 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant272,), kwargs = {}) | |
| %view_228 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_97, [512, 1024]), kwargs = {}) | |
| %_param_constant273 : [#users=1] = get_attr[target=_param_constant273] | |
| %addmm_41 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant273, %view_228, %t_87), kwargs = {}) | |
| %view_229 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_41, [2, 256, 1024]), kwargs = {}) | |
| %add_138 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_229, %add_135), kwargs = {}) | |
| %convert_element_type_138 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_138, torch.float32), kwargs = {}) | |
| %var_mean_44 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_138, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_88 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_44, 0), kwargs = {}) | |
| %getitem_89 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_44, 1), kwargs = {}) | |
| %add_139 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_88, 1e-05), kwargs = {}) | |
| %rsqrt_44 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_139,), kwargs = {}) | |
| %sub_44 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_138, %getitem_89), kwargs = {}) | |
| %mul_97 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_44, %rsqrt_44), kwargs = {}) | |
| %_param_constant274 : [#users=1] = get_attr[target=_param_constant274] | |
| %mul_98 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_97, %_param_constant274), kwargs = {}) | |
| %_param_constant275 : [#users=1] = get_attr[target=_param_constant275] | |
| %add_140 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_98, %_param_constant275), kwargs = {}) | |
| %convert_element_type_139 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_140, torch.float16), kwargs = {}) | |
| %_param_constant276 : [#users=1] = get_attr[target=_param_constant276] | |
| %t_88 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant276,), kwargs = {}) | |
| %view_230 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_139, [512, 1024]), kwargs = {}) | |
| %mm_39 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_230, %t_88), kwargs = {}) | |
| %_unsafe_view_98 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_39, [2, 256, 1024]), kwargs = {}) | |
| %view_231 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_98, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_65 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_231, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_58 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_65,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_99 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_58, [16, 256, 128]), kwargs = {}) | |
| %_param_constant277 : [#users=1] = get_attr[target=_param_constant277] | |
| %t_89 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant277,), kwargs = {}) | |
| %view_232 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_40 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_232, %t_89), kwargs = {}) | |
| %_unsafe_view_100 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_40, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant278 : [#users=1] = get_attr[target=_param_constant278] | |
| %t_90 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant278,), kwargs = {}) | |
| %view_233 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_41 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_233, %t_90), kwargs = {}) | |
| %_unsafe_view_101 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_41, [2, 77, 1024]), kwargs = {}) | |
| %view_234 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_100, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_66 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_234, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_59 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_66,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_102 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_59, [16, 77, 128]), kwargs = {}) | |
| %view_235 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_101, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_67 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_235, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_60 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_67,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_103 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_60, [16, 77, 128]), kwargs = {}) | |
| %empty_13 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_13 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_102, -1, -2), kwargs = {}) | |
| %baddbmm_13 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_13, %_unsafe_view_99, %transpose_13), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_13 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_13, -1, False), kwargs = {}) | |
| %detach_63 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_13,), kwargs = {}) | |
| %bmm_20 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_13, %_unsafe_view_103), kwargs = {}) | |
| %view_236 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_20, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_68 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_236, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_61 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_68,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_104 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_61, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant279 : [#users=1] = get_attr[target=_param_constant279] | |
| %t_91 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant279,), kwargs = {}) | |
| %view_237 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_104, [512, 1024]), kwargs = {}) | |
| %_param_constant280 : [#users=1] = get_attr[target=_param_constant280] | |
| %addmm_42 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant280, %view_237, %t_91), kwargs = {}) | |
| %view_238 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_42, [2, 256, 1024]), kwargs = {}) | |
| %add_141 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_238, %add_138), kwargs = {}) | |
| %convert_element_type_140 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_141, torch.float32), kwargs = {}) | |
| %var_mean_45 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_140, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_90 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_45, 0), kwargs = {}) | |
| %getitem_91 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_45, 1), kwargs = {}) | |
| %add_142 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_90, 1e-05), kwargs = {}) | |
| %rsqrt_45 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_142,), kwargs = {}) | |
| %sub_45 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_141, %getitem_91), kwargs = {}) | |
| %mul_99 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_45, %rsqrt_45), kwargs = {}) | |
| %_param_constant281 : [#users=1] = get_attr[target=_param_constant281] | |
| %mul_100 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_99, %_param_constant281), kwargs = {}) | |
| %_param_constant282 : [#users=1] = get_attr[target=_param_constant282] | |
| %add_143 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_100, %_param_constant282), kwargs = {}) | |
| %convert_element_type_141 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_143, torch.float16), kwargs = {}) | |
| %_param_constant283 : [#users=1] = get_attr[target=_param_constant283] | |
| %t_92 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant283,), kwargs = {}) | |
| %view_239 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_141, [512, 1024]), kwargs = {}) | |
| %_param_constant284 : [#users=1] = get_attr[target=_param_constant284] | |
| %addmm_43 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant284, %view_239, %t_92), kwargs = {}) | |
| %view_240 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_43, [2, 256, 8192]), kwargs = {}) | |
| %slice_37 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_240, -1, 0, 4096), kwargs = {}) | |
| %slice_38 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_240, -1, 4096, 8192), kwargs = {}) | |
| %gelu_6 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_38,), kwargs = {}) | |
| %mul_101 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_37, %gelu_6), kwargs = {}) | |
| %_param_constant285 : [#users=1] = get_attr[target=_param_constant285] | |
| %t_93 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant285,), kwargs = {}) | |
| %view_241 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_101, [512, 4096]), kwargs = {}) | |
| %_param_constant286 : [#users=1] = get_attr[target=_param_constant286] | |
| %addmm_44 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant286, %view_241, %t_93), kwargs = {}) | |
| %view_242 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_44, [2, 256, 1024]), kwargs = {}) | |
| %add_144 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_242, %add_141), kwargs = {}) | |
| %_param_constant287 : [#users=1] = get_attr[target=_param_constant287] | |
| %t_94 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant287,), kwargs = {}) | |
| %view_243 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_144, [512, 1024]), kwargs = {}) | |
| %_param_constant288 : [#users=1] = get_attr[target=_param_constant288] | |
| %addmm_45 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant288, %view_243, %t_94), kwargs = {}) | |
| %view_244 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_45, [2, 256, 1024]), kwargs = {}) | |
| %view_245 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_244, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_69 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_245, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_62 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_69,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_145 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_62, %div_9), kwargs = {}) | |
| %view_246 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_145, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_142 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_246, torch.float32), kwargs = {}) | |
| %var_mean_46 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_142, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_92 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_46, 0), kwargs = {}) | |
| %getitem_93 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_46, 1), kwargs = {}) | |
| %add_146 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_92, 1e-05), kwargs = {}) | |
| %rsqrt_46 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_146,), kwargs = {}) | |
| %sub_46 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_246, %getitem_93), kwargs = {}) | |
| %mul_102 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_46, %rsqrt_46), kwargs = {}) | |
| %view_247 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_102, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant289 : [#users=1] = get_attr[target=_param_constant289] | |
| %unsqueeze_170 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant289, 0), kwargs = {}) | |
| %unsqueeze_171 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_170, 2), kwargs = {}) | |
| %unsqueeze_172 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_171, 3), kwargs = {}) | |
| %_param_constant290 : [#users=1] = get_attr[target=_param_constant290] | |
| %unsqueeze_173 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant290, 0), kwargs = {}) | |
| %unsqueeze_174 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_173, 2), kwargs = {}) | |
| %unsqueeze_175 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_174, 3), kwargs = {}) | |
| %mul_103 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_247, %unsqueeze_175), kwargs = {}) | |
| %add_147 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_103, %unsqueeze_172), kwargs = {}) | |
| %convert_element_type_143 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_147, torch.float16), kwargs = {}) | |
| %convert_element_type_144 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_93, torch.float16), kwargs = {}) | |
| %convert_element_type_145 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_46, torch.float16), kwargs = {}) | |
| %squeeze_100 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_144, 3), kwargs = {}) | |
| %squeeze_101 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_100, 2), kwargs = {}) | |
| %squeeze_102 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_145, 3), kwargs = {}) | |
| %squeeze_103 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_102, 2), kwargs = {}) | |
| %detach_64 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_101,), kwargs = {}) | |
| %detach_65 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_103,), kwargs = {}) | |
| %silu_28 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_143,), kwargs = {}) | |
| %_param_constant291 : [#users=1] = get_attr[target=_param_constant291] | |
| %_param_constant292 : [#users=1] = get_attr[target=_param_constant292] | |
| %convolution_24 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_28, %_param_constant291, %_param_constant292, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_29 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant293 : [#users=1] = get_attr[target=_param_constant293] | |
| %t_95 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant293,), kwargs = {}) | |
| %_param_constant294 : [#users=1] = get_attr[target=_param_constant294] | |
| %addmm_46 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant294, %silu_29, %t_95), kwargs = {}) | |
| %slice_39 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_46, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_40 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_39, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_176 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_40, 2), kwargs = {}) | |
| %unsqueeze_177 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_176, 3), kwargs = {}) | |
| %add_148 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_24, %unsqueeze_177), kwargs = {}) | |
| %view_248 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_148, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_146 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_248, torch.float32), kwargs = {}) | |
| %var_mean_47 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_146, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_94 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_47, 0), kwargs = {}) | |
| %getitem_95 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_47, 1), kwargs = {}) | |
| %add_149 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_94, 1e-05), kwargs = {}) | |
| %rsqrt_47 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_149,), kwargs = {}) | |
| %sub_47 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_248, %getitem_95), kwargs = {}) | |
| %mul_104 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_47, %rsqrt_47), kwargs = {}) | |
| %view_249 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_104, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant295 : [#users=1] = get_attr[target=_param_constant295] | |
| %unsqueeze_178 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant295, 0), kwargs = {}) | |
| %unsqueeze_179 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_178, 2), kwargs = {}) | |
| %unsqueeze_180 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_179, 3), kwargs = {}) | |
| %_param_constant296 : [#users=1] = get_attr[target=_param_constant296] | |
| %unsqueeze_181 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant296, 0), kwargs = {}) | |
| %unsqueeze_182 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_181, 2), kwargs = {}) | |
| %unsqueeze_183 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_182, 3), kwargs = {}) | |
| %mul_105 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_249, %unsqueeze_183), kwargs = {}) | |
| %add_150 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_105, %unsqueeze_180), kwargs = {}) | |
| %convert_element_type_147 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_150, torch.float16), kwargs = {}) | |
| %convert_element_type_148 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_95, torch.float16), kwargs = {}) | |
| %convert_element_type_149 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_47, torch.float16), kwargs = {}) | |
| %squeeze_104 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_148, 3), kwargs = {}) | |
| %squeeze_105 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_104, 2), kwargs = {}) | |
| %squeeze_106 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_149, 3), kwargs = {}) | |
| %squeeze_107 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_106, 2), kwargs = {}) | |
| %detach_66 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_105,), kwargs = {}) | |
| %detach_67 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_107,), kwargs = {}) | |
| %silu_30 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_147,), kwargs = {}) | |
| %_param_constant297 : [#users=1] = get_attr[target=_param_constant297] | |
| %_param_constant298 : [#users=1] = get_attr[target=_param_constant298] | |
| %convolution_25 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_30, %_param_constant297, %_param_constant298, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_151 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_145, %convolution_25), kwargs = {}) | |
| %div_10 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_151, 1), kwargs = {}) | |
| %cat_2 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_10, %add_126], 1), kwargs = {}) | |
| %view_250 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_2, [2, 32, 64, 256]), kwargs = {}) | |
| %convert_element_type_150 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_250, torch.float32), kwargs = {}) | |
| %var_mean_48 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_150, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_96 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_48, 0), kwargs = {}) | |
| %getitem_97 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_48, 1), kwargs = {}) | |
| %add_152 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_96, 1e-05), kwargs = {}) | |
| %rsqrt_48 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_152,), kwargs = {}) | |
| %sub_48 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_250, %getitem_97), kwargs = {}) | |
| %mul_106 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_48, %rsqrt_48), kwargs = {}) | |
| %view_251 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_106, [2, 2048, 16, 16]), kwargs = {}) | |
| %_param_constant299 : [#users=1] = get_attr[target=_param_constant299] | |
| %unsqueeze_184 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant299, 0), kwargs = {}) | |
| %unsqueeze_185 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_184, 2), kwargs = {}) | |
| %unsqueeze_186 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_185, 3), kwargs = {}) | |
| %_param_constant300 : [#users=1] = get_attr[target=_param_constant300] | |
| %unsqueeze_187 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant300, 0), kwargs = {}) | |
| %unsqueeze_188 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_187, 2), kwargs = {}) | |
| %unsqueeze_189 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_188, 3), kwargs = {}) | |
| %mul_107 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_251, %unsqueeze_189), kwargs = {}) | |
| %add_153 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_107, %unsqueeze_186), kwargs = {}) | |
| %convert_element_type_151 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_153, torch.float16), kwargs = {}) | |
| %convert_element_type_152 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_97, torch.float16), kwargs = {}) | |
| %convert_element_type_153 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_48, torch.float16), kwargs = {}) | |
| %squeeze_108 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_152, 3), kwargs = {}) | |
| %squeeze_109 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_108, 2), kwargs = {}) | |
| %squeeze_110 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_153, 3), kwargs = {}) | |
| %squeeze_111 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_110, 2), kwargs = {}) | |
| %detach_68 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_109,), kwargs = {}) | |
| %detach_69 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_111,), kwargs = {}) | |
| %silu_31 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_151,), kwargs = {}) | |
| %_param_constant301 : [#users=1] = get_attr[target=_param_constant301] | |
| %_param_constant302 : [#users=1] = get_attr[target=_param_constant302] | |
| %convolution_26 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_31, %_param_constant301, %_param_constant302, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_32 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant303 : [#users=1] = get_attr[target=_param_constant303] | |
| %t_96 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant303,), kwargs = {}) | |
| %_param_constant304 : [#users=1] = get_attr[target=_param_constant304] | |
| %addmm_47 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant304, %silu_32, %t_96), kwargs = {}) | |
| %slice_41 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_47, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_42 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_41, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_190 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_42, 2), kwargs = {}) | |
| %unsqueeze_191 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_190, 3), kwargs = {}) | |
| %add_154 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_26, %unsqueeze_191), kwargs = {}) | |
| %view_252 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_154, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_154 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_252, torch.float32), kwargs = {}) | |
| %var_mean_49 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_154, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_98 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_49, 0), kwargs = {}) | |
| %getitem_99 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_49, 1), kwargs = {}) | |
| %add_155 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_98, 1e-05), kwargs = {}) | |
| %rsqrt_49 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_155,), kwargs = {}) | |
| %sub_49 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_252, %getitem_99), kwargs = {}) | |
| %mul_108 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_49, %rsqrt_49), kwargs = {}) | |
| %view_253 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_108, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant305 : [#users=1] = get_attr[target=_param_constant305] | |
| %unsqueeze_192 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant305, 0), kwargs = {}) | |
| %unsqueeze_193 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_192, 2), kwargs = {}) | |
| %unsqueeze_194 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_193, 3), kwargs = {}) | |
| %_param_constant306 : [#users=1] = get_attr[target=_param_constant306] | |
| %unsqueeze_195 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant306, 0), kwargs = {}) | |
| %unsqueeze_196 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_195, 2), kwargs = {}) | |
| %unsqueeze_197 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_196, 3), kwargs = {}) | |
| %mul_109 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_253, %unsqueeze_197), kwargs = {}) | |
| %add_156 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_109, %unsqueeze_194), kwargs = {}) | |
| %convert_element_type_155 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_156, torch.float16), kwargs = {}) | |
| %convert_element_type_156 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_99, torch.float16), kwargs = {}) | |
| %convert_element_type_157 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_49, torch.float16), kwargs = {}) | |
| %squeeze_112 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_156, 3), kwargs = {}) | |
| %squeeze_113 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_112, 2), kwargs = {}) | |
| %squeeze_114 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_157, 3), kwargs = {}) | |
| %squeeze_115 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_114, 2), kwargs = {}) | |
| %detach_70 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_113,), kwargs = {}) | |
| %detach_71 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_115,), kwargs = {}) | |
| %silu_33 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_155,), kwargs = {}) | |
| %_param_constant307 : [#users=1] = get_attr[target=_param_constant307] | |
| %_param_constant308 : [#users=1] = get_attr[target=_param_constant308] | |
| %convolution_27 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_33, %_param_constant307, %_param_constant308, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant309 : [#users=1] = get_attr[target=_param_constant309] | |
| %_param_constant310 : [#users=1] = get_attr[target=_param_constant310] | |
| %convolution_28 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_2, %_param_constant309, %_param_constant310, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_157 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_28, %convolution_27), kwargs = {}) | |
| %div_11 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_157, 1.0), kwargs = {}) | |
| %view_254 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_11, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_158 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_254, torch.float32), kwargs = {}) | |
| %var_mean_50 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_158, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_100 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_50, 0), kwargs = {}) | |
| %getitem_101 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_50, 1), kwargs = {}) | |
| %add_158 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_100, 1e-06), kwargs = {}) | |
| %rsqrt_50 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_158,), kwargs = {}) | |
| %sub_50 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_254, %getitem_101), kwargs = {}) | |
| %mul_110 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_50, %rsqrt_50), kwargs = {}) | |
| %view_255 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_110, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant311 : [#users=1] = get_attr[target=_param_constant311] | |
| %unsqueeze_198 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant311, 0), kwargs = {}) | |
| %unsqueeze_199 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_198, 2), kwargs = {}) | |
| %unsqueeze_200 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_199, 3), kwargs = {}) | |
| %_param_constant312 : [#users=1] = get_attr[target=_param_constant312] | |
| %unsqueeze_201 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant312, 0), kwargs = {}) | |
| %unsqueeze_202 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_201, 2), kwargs = {}) | |
| %unsqueeze_203 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_202, 3), kwargs = {}) | |
| %mul_111 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_255, %unsqueeze_203), kwargs = {}) | |
| %add_159 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_111, %unsqueeze_200), kwargs = {}) | |
| %convert_element_type_159 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_159, torch.float16), kwargs = {}) | |
| %convert_element_type_160 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_101, torch.float16), kwargs = {}) | |
| %convert_element_type_161 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_50, torch.float16), kwargs = {}) | |
| %squeeze_116 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_160, 3), kwargs = {}) | |
| %squeeze_117 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_116, 2), kwargs = {}) | |
| %squeeze_118 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_161, 3), kwargs = {}) | |
| %squeeze_119 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_118, 2), kwargs = {}) | |
| %detach_72 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_117,), kwargs = {}) | |
| %detach_73 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_119,), kwargs = {}) | |
| %permute_70 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_159, [0, 2, 3, 1]), kwargs = {}) | |
| %view_256 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_70, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant313 : [#users=1] = get_attr[target=_param_constant313] | |
| %t_97 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant313,), kwargs = {}) | |
| %expand_15 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_256, [2, 256, 1024]), kwargs = {}) | |
| %view_257 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_15, [2, 256, 1024]), kwargs = {}) | |
| %expand_16 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_97, [2, 1024, 1024]), kwargs = {}) | |
| %view_258 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_16, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_21 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_257, %view_258), kwargs = {}) | |
| %_unsafe_view_105 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_21, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant314 : [#users=1] = get_attr[target=_param_constant314] | |
| %add_160 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_105, %_param_constant314), kwargs = {}) | |
| %convert_element_type_162 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_160, torch.float32), kwargs = {}) | |
| %var_mean_51 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_162, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_102 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_51, 0), kwargs = {}) | |
| %getitem_103 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_51, 1), kwargs = {}) | |
| %add_161 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_102, 1e-05), kwargs = {}) | |
| %rsqrt_51 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_161,), kwargs = {}) | |
| %sub_51 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_160, %getitem_103), kwargs = {}) | |
| %mul_112 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_51, %rsqrt_51), kwargs = {}) | |
| %_param_constant315 : [#users=1] = get_attr[target=_param_constant315] | |
| %mul_113 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_112, %_param_constant315), kwargs = {}) | |
| %_param_constant316 : [#users=1] = get_attr[target=_param_constant316] | |
| %add_162 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_113, %_param_constant316), kwargs = {}) | |
| %convert_element_type_163 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_162, torch.float16), kwargs = {}) | |
| %_param_constant317 : [#users=1] = get_attr[target=_param_constant317] | |
| %t_98 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant317,), kwargs = {}) | |
| %view_259 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_163, [512, 1024]), kwargs = {}) | |
| %mm_42 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_259, %t_98), kwargs = {}) | |
| %_unsafe_view_106 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_42, [2, 256, 1024]), kwargs = {}) | |
| %view_260 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_106, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_71 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_260, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_63 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_71,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_107 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_63, [16, 256, 128]), kwargs = {}) | |
| %_param_constant318 : [#users=1] = get_attr[target=_param_constant318] | |
| %t_99 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant318,), kwargs = {}) | |
| %view_261 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_163, [512, 1024]), kwargs = {}) | |
| %mm_43 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_261, %t_99), kwargs = {}) | |
| %_unsafe_view_108 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_43, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant319 : [#users=1] = get_attr[target=_param_constant319] | |
| %t_100 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant319,), kwargs = {}) | |
| %view_262 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_163, [512, 1024]), kwargs = {}) | |
| %mm_44 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_262, %t_100), kwargs = {}) | |
| %_unsafe_view_109 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_44, [2, 256, 1024]), kwargs = {}) | |
| %view_263 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_108, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_72 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_263, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_64 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_72,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_110 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_64, [16, 256, 128]), kwargs = {}) | |
| %view_264 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_109, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_73 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_264, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_65 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_73,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_111 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_65, [16, 256, 128]), kwargs = {}) | |
| %empty_14 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_14 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_110, -1, -2), kwargs = {}) | |
| %baddbmm_14 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_14, %_unsafe_view_107, %transpose_14), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_14 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_14, -1, False), kwargs = {}) | |
| %detach_74 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_14,), kwargs = {}) | |
| %bmm_22 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_14, %_unsafe_view_111), kwargs = {}) | |
| %view_265 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_22, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_74 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_265, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_66 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_74,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_112 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_66, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant320 : [#users=1] = get_attr[target=_param_constant320] | |
| %t_101 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant320,), kwargs = {}) | |
| %view_266 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_112, [512, 1024]), kwargs = {}) | |
| %_param_constant321 : [#users=1] = get_attr[target=_param_constant321] | |
| %addmm_48 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant321, %view_266, %t_101), kwargs = {}) | |
| %view_267 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_48, [2, 256, 1024]), kwargs = {}) | |
| %add_163 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_267, %add_160), kwargs = {}) | |
| %convert_element_type_164 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_163, torch.float32), kwargs = {}) | |
| %var_mean_52 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_164, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_104 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_52, 0), kwargs = {}) | |
| %getitem_105 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_52, 1), kwargs = {}) | |
| %add_164 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_104, 1e-05), kwargs = {}) | |
| %rsqrt_52 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_164,), kwargs = {}) | |
| %sub_52 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_163, %getitem_105), kwargs = {}) | |
| %mul_114 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_52, %rsqrt_52), kwargs = {}) | |
| %_param_constant322 : [#users=1] = get_attr[target=_param_constant322] | |
| %mul_115 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_114, %_param_constant322), kwargs = {}) | |
| %_param_constant323 : [#users=1] = get_attr[target=_param_constant323] | |
| %add_165 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_115, %_param_constant323), kwargs = {}) | |
| %convert_element_type_165 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_165, torch.float16), kwargs = {}) | |
| %_param_constant324 : [#users=1] = get_attr[target=_param_constant324] | |
| %t_102 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant324,), kwargs = {}) | |
| %view_268 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_165, [512, 1024]), kwargs = {}) | |
| %mm_45 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_268, %t_102), kwargs = {}) | |
| %_unsafe_view_113 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_45, [2, 256, 1024]), kwargs = {}) | |
| %view_269 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_113, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_75 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_269, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_67 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_75,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_114 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_67, [16, 256, 128]), kwargs = {}) | |
| %_param_constant325 : [#users=1] = get_attr[target=_param_constant325] | |
| %t_103 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant325,), kwargs = {}) | |
| %view_270 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_46 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_270, %t_103), kwargs = {}) | |
| %_unsafe_view_115 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_46, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant326 : [#users=1] = get_attr[target=_param_constant326] | |
| %t_104 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant326,), kwargs = {}) | |
| %view_271 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_47 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_271, %t_104), kwargs = {}) | |
| %_unsafe_view_116 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_47, [2, 77, 1024]), kwargs = {}) | |
| %view_272 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_115, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_76 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_272, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_68 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_76,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_117 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_68, [16, 77, 128]), kwargs = {}) | |
| %view_273 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_116, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_77 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_273, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_69 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_77,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_118 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_69, [16, 77, 128]), kwargs = {}) | |
| %empty_15 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_15 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_117, -1, -2), kwargs = {}) | |
| %baddbmm_15 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_15, %_unsafe_view_114, %transpose_15), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_15 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_15, -1, False), kwargs = {}) | |
| %detach_75 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_15,), kwargs = {}) | |
| %bmm_23 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_15, %_unsafe_view_118), kwargs = {}) | |
| %view_274 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_23, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_78 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_274, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_70 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_78,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_119 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_70, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant327 : [#users=1] = get_attr[target=_param_constant327] | |
| %t_105 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant327,), kwargs = {}) | |
| %view_275 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_119, [512, 1024]), kwargs = {}) | |
| %_param_constant328 : [#users=1] = get_attr[target=_param_constant328] | |
| %addmm_49 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant328, %view_275, %t_105), kwargs = {}) | |
| %view_276 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_49, [2, 256, 1024]), kwargs = {}) | |
| %add_166 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_276, %add_163), kwargs = {}) | |
| %convert_element_type_166 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_166, torch.float32), kwargs = {}) | |
| %var_mean_53 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_166, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_106 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_53, 0), kwargs = {}) | |
| %getitem_107 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_53, 1), kwargs = {}) | |
| %add_167 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_106, 1e-05), kwargs = {}) | |
| %rsqrt_53 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_167,), kwargs = {}) | |
| %sub_53 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_166, %getitem_107), kwargs = {}) | |
| %mul_116 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_53, %rsqrt_53), kwargs = {}) | |
| %_param_constant329 : [#users=1] = get_attr[target=_param_constant329] | |
| %mul_117 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_116, %_param_constant329), kwargs = {}) | |
| %_param_constant330 : [#users=1] = get_attr[target=_param_constant330] | |
| %add_168 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_117, %_param_constant330), kwargs = {}) | |
| %convert_element_type_167 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_168, torch.float16), kwargs = {}) | |
| %_param_constant331 : [#users=1] = get_attr[target=_param_constant331] | |
| %t_106 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant331,), kwargs = {}) | |
| %view_277 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_167, [512, 1024]), kwargs = {}) | |
| %_param_constant332 : [#users=1] = get_attr[target=_param_constant332] | |
| %addmm_50 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant332, %view_277, %t_106), kwargs = {}) | |
| %view_278 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_50, [2, 256, 8192]), kwargs = {}) | |
| %slice_43 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_278, -1, 0, 4096), kwargs = {}) | |
| %slice_44 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_278, -1, 4096, 8192), kwargs = {}) | |
| %gelu_7 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_44,), kwargs = {}) | |
| %mul_118 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_43, %gelu_7), kwargs = {}) | |
| %_param_constant333 : [#users=1] = get_attr[target=_param_constant333] | |
| %t_107 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant333,), kwargs = {}) | |
| %view_279 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_118, [512, 4096]), kwargs = {}) | |
| %_param_constant334 : [#users=1] = get_attr[target=_param_constant334] | |
| %addmm_51 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant334, %view_279, %t_107), kwargs = {}) | |
| %view_280 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_51, [2, 256, 1024]), kwargs = {}) | |
| %add_169 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_280, %add_166), kwargs = {}) | |
| %_param_constant335 : [#users=1] = get_attr[target=_param_constant335] | |
| %t_108 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant335,), kwargs = {}) | |
| %view_281 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_169, [512, 1024]), kwargs = {}) | |
| %_param_constant336 : [#users=1] = get_attr[target=_param_constant336] | |
| %addmm_52 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant336, %view_281, %t_108), kwargs = {}) | |
| %view_282 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_52, [2, 256, 1024]), kwargs = {}) | |
| %view_283 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_282, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_79 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_283, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_71 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_79,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_170 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_71, %div_11), kwargs = {}) | |
| %cat_3 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_170, %add_107], 1), kwargs = {}) | |
| %view_284 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_3, [2, 32, 64, 256]), kwargs = {}) | |
| %convert_element_type_168 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_284, torch.float32), kwargs = {}) | |
| %var_mean_54 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_168, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_108 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_54, 0), kwargs = {}) | |
| %getitem_109 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_54, 1), kwargs = {}) | |
| %add_171 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_108, 1e-05), kwargs = {}) | |
| %rsqrt_54 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_171,), kwargs = {}) | |
| %sub_54 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_284, %getitem_109), kwargs = {}) | |
| %mul_119 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_54, %rsqrt_54), kwargs = {}) | |
| %view_285 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_119, [2, 2048, 16, 16]), kwargs = {}) | |
| %_param_constant337 : [#users=1] = get_attr[target=_param_constant337] | |
| %unsqueeze_204 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant337, 0), kwargs = {}) | |
| %unsqueeze_205 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_204, 2), kwargs = {}) | |
| %unsqueeze_206 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_205, 3), kwargs = {}) | |
| %_param_constant338 : [#users=1] = get_attr[target=_param_constant338] | |
| %unsqueeze_207 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant338, 0), kwargs = {}) | |
| %unsqueeze_208 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_207, 2), kwargs = {}) | |
| %unsqueeze_209 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_208, 3), kwargs = {}) | |
| %mul_120 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_285, %unsqueeze_209), kwargs = {}) | |
| %add_172 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_120, %unsqueeze_206), kwargs = {}) | |
| %convert_element_type_169 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_172, torch.float16), kwargs = {}) | |
| %convert_element_type_170 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_109, torch.float16), kwargs = {}) | |
| %convert_element_type_171 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_54, torch.float16), kwargs = {}) | |
| %squeeze_120 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_170, 3), kwargs = {}) | |
| %squeeze_121 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_120, 2), kwargs = {}) | |
| %squeeze_122 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_171, 3), kwargs = {}) | |
| %squeeze_123 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_122, 2), kwargs = {}) | |
| %detach_76 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_121,), kwargs = {}) | |
| %detach_77 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_123,), kwargs = {}) | |
| %silu_34 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_169,), kwargs = {}) | |
| %_param_constant339 : [#users=1] = get_attr[target=_param_constant339] | |
| %_param_constant340 : [#users=1] = get_attr[target=_param_constant340] | |
| %convolution_29 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_34, %_param_constant339, %_param_constant340, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_35 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant341 : [#users=1] = get_attr[target=_param_constant341] | |
| %t_109 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant341,), kwargs = {}) | |
| %_param_constant342 : [#users=1] = get_attr[target=_param_constant342] | |
| %addmm_53 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant342, %silu_35, %t_109), kwargs = {}) | |
| %slice_45 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_53, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_46 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_45, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_210 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_46, 2), kwargs = {}) | |
| %unsqueeze_211 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_210, 3), kwargs = {}) | |
| %add_173 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_29, %unsqueeze_211), kwargs = {}) | |
| %view_286 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_173, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_172 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_286, torch.float32), kwargs = {}) | |
| %var_mean_55 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_172, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_110 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_55, 0), kwargs = {}) | |
| %getitem_111 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_55, 1), kwargs = {}) | |
| %add_174 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_110, 1e-05), kwargs = {}) | |
| %rsqrt_55 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_174,), kwargs = {}) | |
| %sub_55 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_286, %getitem_111), kwargs = {}) | |
| %mul_121 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_55, %rsqrt_55), kwargs = {}) | |
| %view_287 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_121, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant343 : [#users=1] = get_attr[target=_param_constant343] | |
| %unsqueeze_212 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant343, 0), kwargs = {}) | |
| %unsqueeze_213 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_212, 2), kwargs = {}) | |
| %unsqueeze_214 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_213, 3), kwargs = {}) | |
| %_param_constant344 : [#users=1] = get_attr[target=_param_constant344] | |
| %unsqueeze_215 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant344, 0), kwargs = {}) | |
| %unsqueeze_216 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_215, 2), kwargs = {}) | |
| %unsqueeze_217 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_216, 3), kwargs = {}) | |
| %mul_122 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_287, %unsqueeze_217), kwargs = {}) | |
| %add_175 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_122, %unsqueeze_214), kwargs = {}) | |
| %convert_element_type_173 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_175, torch.float16), kwargs = {}) | |
| %convert_element_type_174 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_111, torch.float16), kwargs = {}) | |
| %convert_element_type_175 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_55, torch.float16), kwargs = {}) | |
| %squeeze_124 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_174, 3), kwargs = {}) | |
| %squeeze_125 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_124, 2), kwargs = {}) | |
| %squeeze_126 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_175, 3), kwargs = {}) | |
| %squeeze_127 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_126, 2), kwargs = {}) | |
| %detach_78 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_125,), kwargs = {}) | |
| %detach_79 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_127,), kwargs = {}) | |
| %silu_36 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_173,), kwargs = {}) | |
| %_param_constant345 : [#users=1] = get_attr[target=_param_constant345] | |
| %_param_constant346 : [#users=1] = get_attr[target=_param_constant346] | |
| %convolution_30 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_36, %_param_constant345, %_param_constant346, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant347 : [#users=1] = get_attr[target=_param_constant347] | |
| %_param_constant348 : [#users=1] = get_attr[target=_param_constant348] | |
| %convolution_31 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_3, %_param_constant347, %_param_constant348, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_176 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_31, %convolution_30), kwargs = {}) | |
| %div_12 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_176, 1.0), kwargs = {}) | |
| %view_288 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_12, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_176 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_288, torch.float32), kwargs = {}) | |
| %var_mean_56 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_176, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_112 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_56, 0), kwargs = {}) | |
| %getitem_113 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_56, 1), kwargs = {}) | |
| %add_177 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_112, 1e-06), kwargs = {}) | |
| %rsqrt_56 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_177,), kwargs = {}) | |
| %sub_56 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_288, %getitem_113), kwargs = {}) | |
| %mul_123 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_56, %rsqrt_56), kwargs = {}) | |
| %view_289 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_123, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant349 : [#users=1] = get_attr[target=_param_constant349] | |
| %unsqueeze_218 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant349, 0), kwargs = {}) | |
| %unsqueeze_219 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_218, 2), kwargs = {}) | |
| %unsqueeze_220 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_219, 3), kwargs = {}) | |
| %_param_constant350 : [#users=1] = get_attr[target=_param_constant350] | |
| %unsqueeze_221 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant350, 0), kwargs = {}) | |
| %unsqueeze_222 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_221, 2), kwargs = {}) | |
| %unsqueeze_223 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_222, 3), kwargs = {}) | |
| %mul_124 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_289, %unsqueeze_223), kwargs = {}) | |
| %add_178 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_124, %unsqueeze_220), kwargs = {}) | |
| %convert_element_type_177 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_178, torch.float16), kwargs = {}) | |
| %convert_element_type_178 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_113, torch.float16), kwargs = {}) | |
| %convert_element_type_179 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_56, torch.float16), kwargs = {}) | |
| %squeeze_128 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_178, 3), kwargs = {}) | |
| %squeeze_129 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_128, 2), kwargs = {}) | |
| %squeeze_130 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_179, 3), kwargs = {}) | |
| %squeeze_131 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_130, 2), kwargs = {}) | |
| %detach_80 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_129,), kwargs = {}) | |
| %detach_81 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_131,), kwargs = {}) | |
| %permute_80 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_177, [0, 2, 3, 1]), kwargs = {}) | |
| %view_290 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_80, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant351 : [#users=1] = get_attr[target=_param_constant351] | |
| %t_110 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant351,), kwargs = {}) | |
| %expand_17 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_290, [2, 256, 1024]), kwargs = {}) | |
| %view_291 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_17, [2, 256, 1024]), kwargs = {}) | |
| %expand_18 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_110, [2, 1024, 1024]), kwargs = {}) | |
| %view_292 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_18, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_24 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_291, %view_292), kwargs = {}) | |
| %_unsafe_view_120 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_24, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant352 : [#users=1] = get_attr[target=_param_constant352] | |
| %add_179 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_120, %_param_constant352), kwargs = {}) | |
| %convert_element_type_180 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_179, torch.float32), kwargs = {}) | |
| %var_mean_57 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_180, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_114 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_57, 0), kwargs = {}) | |
| %getitem_115 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_57, 1), kwargs = {}) | |
| %add_180 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_114, 1e-05), kwargs = {}) | |
| %rsqrt_57 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_180,), kwargs = {}) | |
| %sub_57 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_179, %getitem_115), kwargs = {}) | |
| %mul_125 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_57, %rsqrt_57), kwargs = {}) | |
| %_param_constant353 : [#users=1] = get_attr[target=_param_constant353] | |
| %mul_126 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_125, %_param_constant353), kwargs = {}) | |
| %_param_constant354 : [#users=1] = get_attr[target=_param_constant354] | |
| %add_181 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_126, %_param_constant354), kwargs = {}) | |
| %convert_element_type_181 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_181, torch.float16), kwargs = {}) | |
| %_param_constant355 : [#users=1] = get_attr[target=_param_constant355] | |
| %t_111 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant355,), kwargs = {}) | |
| %view_293 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_181, [512, 1024]), kwargs = {}) | |
| %mm_48 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_293, %t_111), kwargs = {}) | |
| %_unsafe_view_121 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_48, [2, 256, 1024]), kwargs = {}) | |
| %view_294 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_121, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_81 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_294, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_72 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_81,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_122 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_72, [16, 256, 128]), kwargs = {}) | |
| %_param_constant356 : [#users=1] = get_attr[target=_param_constant356] | |
| %t_112 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant356,), kwargs = {}) | |
| %view_295 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_181, [512, 1024]), kwargs = {}) | |
| %mm_49 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_295, %t_112), kwargs = {}) | |
| %_unsafe_view_123 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_49, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant357 : [#users=1] = get_attr[target=_param_constant357] | |
| %t_113 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant357,), kwargs = {}) | |
| %view_296 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_181, [512, 1024]), kwargs = {}) | |
| %mm_50 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_296, %t_113), kwargs = {}) | |
| %_unsafe_view_124 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_50, [2, 256, 1024]), kwargs = {}) | |
| %view_297 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_123, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_82 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_297, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_73 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_82,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_125 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_73, [16, 256, 128]), kwargs = {}) | |
| %view_298 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_124, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_83 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_298, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_74 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_83,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_126 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_74, [16, 256, 128]), kwargs = {}) | |
| %empty_16 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_16 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_125, -1, -2), kwargs = {}) | |
| %baddbmm_16 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_16, %_unsafe_view_122, %transpose_16), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_16 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_16, -1, False), kwargs = {}) | |
| %detach_82 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_16,), kwargs = {}) | |
| %bmm_25 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_16, %_unsafe_view_126), kwargs = {}) | |
| %view_299 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_25, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_84 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_299, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_75 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_84,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_127 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_75, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant358 : [#users=1] = get_attr[target=_param_constant358] | |
| %t_114 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant358,), kwargs = {}) | |
| %view_300 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_127, [512, 1024]), kwargs = {}) | |
| %_param_constant359 : [#users=1] = get_attr[target=_param_constant359] | |
| %addmm_54 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant359, %view_300, %t_114), kwargs = {}) | |
| %view_301 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_54, [2, 256, 1024]), kwargs = {}) | |
| %add_182 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_301, %add_179), kwargs = {}) | |
| %convert_element_type_182 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_182, torch.float32), kwargs = {}) | |
| %var_mean_58 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_182, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_116 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_58, 0), kwargs = {}) | |
| %getitem_117 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_58, 1), kwargs = {}) | |
| %add_183 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_116, 1e-05), kwargs = {}) | |
| %rsqrt_58 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_183,), kwargs = {}) | |
| %sub_58 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_182, %getitem_117), kwargs = {}) | |
| %mul_127 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_58, %rsqrt_58), kwargs = {}) | |
| %_param_constant360 : [#users=1] = get_attr[target=_param_constant360] | |
| %mul_128 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_127, %_param_constant360), kwargs = {}) | |
| %_param_constant361 : [#users=1] = get_attr[target=_param_constant361] | |
| %add_184 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_128, %_param_constant361), kwargs = {}) | |
| %convert_element_type_183 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_184, torch.float16), kwargs = {}) | |
| %_param_constant362 : [#users=1] = get_attr[target=_param_constant362] | |
| %t_115 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant362,), kwargs = {}) | |
| %view_302 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_183, [512, 1024]), kwargs = {}) | |
| %mm_51 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_302, %t_115), kwargs = {}) | |
| %_unsafe_view_128 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_51, [2, 256, 1024]), kwargs = {}) | |
| %view_303 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_128, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_85 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_303, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_76 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_85,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_129 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_76, [16, 256, 128]), kwargs = {}) | |
| %_param_constant363 : [#users=1] = get_attr[target=_param_constant363] | |
| %t_116 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant363,), kwargs = {}) | |
| %view_304 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_52 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_304, %t_116), kwargs = {}) | |
| %_unsafe_view_130 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_52, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant364 : [#users=1] = get_attr[target=_param_constant364] | |
| %t_117 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant364,), kwargs = {}) | |
| %view_305 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_53 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_305, %t_117), kwargs = {}) | |
| %_unsafe_view_131 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_53, [2, 77, 1024]), kwargs = {}) | |
| %view_306 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_130, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_86 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_306, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_77 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_86,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_132 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_77, [16, 77, 128]), kwargs = {}) | |
| %view_307 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_131, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_87 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_307, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_78 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_87,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_133 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_78, [16, 77, 128]), kwargs = {}) | |
| %empty_17 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_17 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_132, -1, -2), kwargs = {}) | |
| %baddbmm_17 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_17, %_unsafe_view_129, %transpose_17), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_17 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_17, -1, False), kwargs = {}) | |
| %detach_83 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_17,), kwargs = {}) | |
| %bmm_26 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_17, %_unsafe_view_133), kwargs = {}) | |
| %view_308 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_26, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_88 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_308, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_79 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_88,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_134 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_79, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant365 : [#users=1] = get_attr[target=_param_constant365] | |
| %t_118 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant365,), kwargs = {}) | |
| %view_309 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_134, [512, 1024]), kwargs = {}) | |
| %_param_constant366 : [#users=1] = get_attr[target=_param_constant366] | |
| %addmm_55 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant366, %view_309, %t_118), kwargs = {}) | |
| %view_310 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_55, [2, 256, 1024]), kwargs = {}) | |
| %add_185 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_310, %add_182), kwargs = {}) | |
| %convert_element_type_184 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_185, torch.float32), kwargs = {}) | |
| %var_mean_59 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_184, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_118 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_59, 0), kwargs = {}) | |
| %getitem_119 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_59, 1), kwargs = {}) | |
| %add_186 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_118, 1e-05), kwargs = {}) | |
| %rsqrt_59 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_186,), kwargs = {}) | |
| %sub_59 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_185, %getitem_119), kwargs = {}) | |
| %mul_129 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_59, %rsqrt_59), kwargs = {}) | |
| %_param_constant367 : [#users=1] = get_attr[target=_param_constant367] | |
| %mul_130 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_129, %_param_constant367), kwargs = {}) | |
| %_param_constant368 : [#users=1] = get_attr[target=_param_constant368] | |
| %add_187 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_130, %_param_constant368), kwargs = {}) | |
| %convert_element_type_185 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_187, torch.float16), kwargs = {}) | |
| %_param_constant369 : [#users=1] = get_attr[target=_param_constant369] | |
| %t_119 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant369,), kwargs = {}) | |
| %view_311 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_185, [512, 1024]), kwargs = {}) | |
| %_param_constant370 : [#users=1] = get_attr[target=_param_constant370] | |
| %addmm_56 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant370, %view_311, %t_119), kwargs = {}) | |
| %view_312 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_56, [2, 256, 8192]), kwargs = {}) | |
| %slice_47 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_312, -1, 0, 4096), kwargs = {}) | |
| %slice_48 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_312, -1, 4096, 8192), kwargs = {}) | |
| %gelu_8 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_48,), kwargs = {}) | |
| %mul_131 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_47, %gelu_8), kwargs = {}) | |
| %_param_constant371 : [#users=1] = get_attr[target=_param_constant371] | |
| %t_120 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant371,), kwargs = {}) | |
| %view_313 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_131, [512, 4096]), kwargs = {}) | |
| %_param_constant372 : [#users=1] = get_attr[target=_param_constant372] | |
| %addmm_57 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant372, %view_313, %t_120), kwargs = {}) | |
| %view_314 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_57, [2, 256, 1024]), kwargs = {}) | |
| %add_188 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_314, %add_185), kwargs = {}) | |
| %_param_constant373 : [#users=1] = get_attr[target=_param_constant373] | |
| %t_121 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant373,), kwargs = {}) | |
| %view_315 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_188, [512, 1024]), kwargs = {}) | |
| %_param_constant374 : [#users=1] = get_attr[target=_param_constant374] | |
| %addmm_58 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant374, %view_315, %t_121), kwargs = {}) | |
| %view_316 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_58, [2, 256, 1024]), kwargs = {}) | |
| %view_317 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_316, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_89 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_317, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_80 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_89,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_189 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_80, %div_12), kwargs = {}) | |
| %cat_4 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_189, %convolution_16], 1), kwargs = {}) | |
| %view_318 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_4, [2, 32, 48, 256]), kwargs = {}) | |
| %convert_element_type_186 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_318, torch.float32), kwargs = {}) | |
| %var_mean_60 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_186, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_120 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_60, 0), kwargs = {}) | |
| %getitem_121 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_60, 1), kwargs = {}) | |
| %add_190 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_120, 1e-05), kwargs = {}) | |
| %rsqrt_60 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_190,), kwargs = {}) | |
| %sub_60 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_318, %getitem_121), kwargs = {}) | |
| %mul_132 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_60, %rsqrt_60), kwargs = {}) | |
| %view_319 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_132, [2, 1536, 16, 16]), kwargs = {}) | |
| %_param_constant375 : [#users=1] = get_attr[target=_param_constant375] | |
| %unsqueeze_224 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant375, 0), kwargs = {}) | |
| %unsqueeze_225 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_224, 2), kwargs = {}) | |
| %unsqueeze_226 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_225, 3), kwargs = {}) | |
| %_param_constant376 : [#users=1] = get_attr[target=_param_constant376] | |
| %unsqueeze_227 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant376, 0), kwargs = {}) | |
| %unsqueeze_228 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_227, 2), kwargs = {}) | |
| %unsqueeze_229 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_228, 3), kwargs = {}) | |
| %mul_133 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_319, %unsqueeze_229), kwargs = {}) | |
| %add_191 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_133, %unsqueeze_226), kwargs = {}) | |
| %convert_element_type_187 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_191, torch.float16), kwargs = {}) | |
| %convert_element_type_188 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_121, torch.float16), kwargs = {}) | |
| %convert_element_type_189 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_60, torch.float16), kwargs = {}) | |
| %squeeze_132 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_188, 3), kwargs = {}) | |
| %squeeze_133 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_132, 2), kwargs = {}) | |
| %squeeze_134 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_189, 3), kwargs = {}) | |
| %squeeze_135 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_134, 2), kwargs = {}) | |
| %detach_84 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_133,), kwargs = {}) | |
| %detach_85 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_135,), kwargs = {}) | |
| %silu_37 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_187,), kwargs = {}) | |
| %_param_constant377 : [#users=1] = get_attr[target=_param_constant377] | |
| %_param_constant378 : [#users=1] = get_attr[target=_param_constant378] | |
| %convolution_32 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_37, %_param_constant377, %_param_constant378, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_38 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant379 : [#users=1] = get_attr[target=_param_constant379] | |
| %t_122 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant379,), kwargs = {}) | |
| %_param_constant380 : [#users=1] = get_attr[target=_param_constant380] | |
| %addmm_59 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant380, %silu_38, %t_122), kwargs = {}) | |
| %slice_49 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_59, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_50 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_49, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_230 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_50, 2), kwargs = {}) | |
| %unsqueeze_231 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_230, 3), kwargs = {}) | |
| %add_192 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_32, %unsqueeze_231), kwargs = {}) | |
| %view_320 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_192, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_190 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_320, torch.float32), kwargs = {}) | |
| %var_mean_61 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_190, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_122 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_61, 0), kwargs = {}) | |
| %getitem_123 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_61, 1), kwargs = {}) | |
| %add_193 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_122, 1e-05), kwargs = {}) | |
| %rsqrt_61 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_193,), kwargs = {}) | |
| %sub_61 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_320, %getitem_123), kwargs = {}) | |
| %mul_134 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_61, %rsqrt_61), kwargs = {}) | |
| %view_321 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_134, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant381 : [#users=1] = get_attr[target=_param_constant381] | |
| %unsqueeze_232 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant381, 0), kwargs = {}) | |
| %unsqueeze_233 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_232, 2), kwargs = {}) | |
| %unsqueeze_234 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_233, 3), kwargs = {}) | |
| %_param_constant382 : [#users=1] = get_attr[target=_param_constant382] | |
| %unsqueeze_235 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant382, 0), kwargs = {}) | |
| %unsqueeze_236 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_235, 2), kwargs = {}) | |
| %unsqueeze_237 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_236, 3), kwargs = {}) | |
| %mul_135 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_321, %unsqueeze_237), kwargs = {}) | |
| %add_194 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_135, %unsqueeze_234), kwargs = {}) | |
| %convert_element_type_191 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_194, torch.float16), kwargs = {}) | |
| %convert_element_type_192 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_123, torch.float16), kwargs = {}) | |
| %convert_element_type_193 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_61, torch.float16), kwargs = {}) | |
| %squeeze_136 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_192, 3), kwargs = {}) | |
| %squeeze_137 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_136, 2), kwargs = {}) | |
| %squeeze_138 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_193, 3), kwargs = {}) | |
| %squeeze_139 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_138, 2), kwargs = {}) | |
| %detach_86 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_137,), kwargs = {}) | |
| %detach_87 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_139,), kwargs = {}) | |
| %silu_39 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_191,), kwargs = {}) | |
| %_param_constant383 : [#users=1] = get_attr[target=_param_constant383] | |
| %_param_constant384 : [#users=1] = get_attr[target=_param_constant384] | |
| %convolution_33 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_39, %_param_constant383, %_param_constant384, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant385 : [#users=1] = get_attr[target=_param_constant385] | |
| %_param_constant386 : [#users=1] = get_attr[target=_param_constant386] | |
| %convolution_34 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_4, %_param_constant385, %_param_constant386, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_195 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_34, %convolution_33), kwargs = {}) | |
| %div_13 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_195, 1.0), kwargs = {}) | |
| %view_322 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_13, [2, 32, 32, 256]), kwargs = {}) | |
| %convert_element_type_194 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_322, torch.float32), kwargs = {}) | |
| %var_mean_62 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_194, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_124 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_62, 0), kwargs = {}) | |
| %getitem_125 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_62, 1), kwargs = {}) | |
| %add_196 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_124, 1e-06), kwargs = {}) | |
| %rsqrt_62 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_196,), kwargs = {}) | |
| %sub_62 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_322, %getitem_125), kwargs = {}) | |
| %mul_136 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_62, %rsqrt_62), kwargs = {}) | |
| %view_323 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_136, [2, 1024, 16, 16]), kwargs = {}) | |
| %_param_constant387 : [#users=1] = get_attr[target=_param_constant387] | |
| %unsqueeze_238 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant387, 0), kwargs = {}) | |
| %unsqueeze_239 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_238, 2), kwargs = {}) | |
| %unsqueeze_240 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_239, 3), kwargs = {}) | |
| %_param_constant388 : [#users=1] = get_attr[target=_param_constant388] | |
| %unsqueeze_241 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant388, 0), kwargs = {}) | |
| %unsqueeze_242 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_241, 2), kwargs = {}) | |
| %unsqueeze_243 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_242, 3), kwargs = {}) | |
| %mul_137 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_323, %unsqueeze_243), kwargs = {}) | |
| %add_197 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_137, %unsqueeze_240), kwargs = {}) | |
| %convert_element_type_195 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_197, torch.float16), kwargs = {}) | |
| %convert_element_type_196 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_125, torch.float16), kwargs = {}) | |
| %convert_element_type_197 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_62, torch.float16), kwargs = {}) | |
| %squeeze_140 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_196, 3), kwargs = {}) | |
| %squeeze_141 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_140, 2), kwargs = {}) | |
| %squeeze_142 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_197, 3), kwargs = {}) | |
| %squeeze_143 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_142, 2), kwargs = {}) | |
| %detach_88 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_141,), kwargs = {}) | |
| %detach_89 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_143,), kwargs = {}) | |
| %permute_90 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_195, [0, 2, 3, 1]), kwargs = {}) | |
| %view_324 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_90, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant389 : [#users=1] = get_attr[target=_param_constant389] | |
| %t_123 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant389,), kwargs = {}) | |
| %expand_19 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_324, [2, 256, 1024]), kwargs = {}) | |
| %view_325 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_19, [2, 256, 1024]), kwargs = {}) | |
| %expand_20 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_123, [2, 1024, 1024]), kwargs = {}) | |
| %view_326 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_20, [2, 1024, 1024]), kwargs = {}) | |
| %bmm_27 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_325, %view_326), kwargs = {}) | |
| %_unsafe_view_135 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_27, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant390 : [#users=1] = get_attr[target=_param_constant390] | |
| %add_198 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_135, %_param_constant390), kwargs = {}) | |
| %convert_element_type_198 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_198, torch.float32), kwargs = {}) | |
| %var_mean_63 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_198, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_126 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_63, 0), kwargs = {}) | |
| %getitem_127 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_63, 1), kwargs = {}) | |
| %add_199 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_126, 1e-05), kwargs = {}) | |
| %rsqrt_63 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_199,), kwargs = {}) | |
| %sub_63 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_198, %getitem_127), kwargs = {}) | |
| %mul_138 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_63, %rsqrt_63), kwargs = {}) | |
| %_param_constant391 : [#users=1] = get_attr[target=_param_constant391] | |
| %mul_139 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_138, %_param_constant391), kwargs = {}) | |
| %_param_constant392 : [#users=1] = get_attr[target=_param_constant392] | |
| %add_200 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_139, %_param_constant392), kwargs = {}) | |
| %convert_element_type_199 : [#users=3] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_200, torch.float16), kwargs = {}) | |
| %_param_constant393 : [#users=1] = get_attr[target=_param_constant393] | |
| %t_124 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant393,), kwargs = {}) | |
| %view_327 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_199, [512, 1024]), kwargs = {}) | |
| %mm_54 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_327, %t_124), kwargs = {}) | |
| %_unsafe_view_136 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_54, [2, 256, 1024]), kwargs = {}) | |
| %view_328 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_136, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_91 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_328, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_81 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_91,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_137 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_81, [16, 256, 128]), kwargs = {}) | |
| %_param_constant394 : [#users=1] = get_attr[target=_param_constant394] | |
| %t_125 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant394,), kwargs = {}) | |
| %view_329 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_199, [512, 1024]), kwargs = {}) | |
| %mm_55 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_329, %t_125), kwargs = {}) | |
| %_unsafe_view_138 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_55, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant395 : [#users=1] = get_attr[target=_param_constant395] | |
| %t_126 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant395,), kwargs = {}) | |
| %view_330 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_199, [512, 1024]), kwargs = {}) | |
| %mm_56 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_330, %t_126), kwargs = {}) | |
| %_unsafe_view_139 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_56, [2, 256, 1024]), kwargs = {}) | |
| %view_331 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_138, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_92 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_331, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_82 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_92,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_140 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_82, [16, 256, 128]), kwargs = {}) | |
| %view_332 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_139, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_93 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_332, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_83 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_93,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_141 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_83, [16, 256, 128]), kwargs = {}) | |
| %empty_18 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 256],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_18 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_140, -1, -2), kwargs = {}) | |
| %baddbmm_18 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_18, %_unsafe_view_137, %transpose_18), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_18 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_18, -1, False), kwargs = {}) | |
| %detach_90 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_18,), kwargs = {}) | |
| %bmm_28 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_18, %_unsafe_view_141), kwargs = {}) | |
| %view_333 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_28, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_94 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_333, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_84 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_94,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_142 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_84, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant396 : [#users=1] = get_attr[target=_param_constant396] | |
| %t_127 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant396,), kwargs = {}) | |
| %view_334 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_142, [512, 1024]), kwargs = {}) | |
| %_param_constant397 : [#users=1] = get_attr[target=_param_constant397] | |
| %addmm_60 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant397, %view_334, %t_127), kwargs = {}) | |
| %view_335 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_60, [2, 256, 1024]), kwargs = {}) | |
| %add_201 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_335, %add_198), kwargs = {}) | |
| %convert_element_type_200 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_201, torch.float32), kwargs = {}) | |
| %var_mean_64 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_200, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_128 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_64, 0), kwargs = {}) | |
| %getitem_129 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_64, 1), kwargs = {}) | |
| %add_202 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_128, 1e-05), kwargs = {}) | |
| %rsqrt_64 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_202,), kwargs = {}) | |
| %sub_64 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_201, %getitem_129), kwargs = {}) | |
| %mul_140 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_64, %rsqrt_64), kwargs = {}) | |
| %_param_constant398 : [#users=1] = get_attr[target=_param_constant398] | |
| %mul_141 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_140, %_param_constant398), kwargs = {}) | |
| %_param_constant399 : [#users=1] = get_attr[target=_param_constant399] | |
| %add_203 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_141, %_param_constant399), kwargs = {}) | |
| %convert_element_type_201 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_203, torch.float16), kwargs = {}) | |
| %_param_constant400 : [#users=1] = get_attr[target=_param_constant400] | |
| %t_128 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant400,), kwargs = {}) | |
| %view_336 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_201, [512, 1024]), kwargs = {}) | |
| %mm_57 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_336, %t_128), kwargs = {}) | |
| %_unsafe_view_143 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_57, [2, 256, 1024]), kwargs = {}) | |
| %view_337 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_143, [2, 256, 8, 128]), kwargs = {}) | |
| %permute_95 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_337, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_85 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_95,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_144 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_85, [16, 256, 128]), kwargs = {}) | |
| %_param_constant401 : [#users=1] = get_attr[target=_param_constant401] | |
| %t_129 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant401,), kwargs = {}) | |
| %view_338 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_58 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_338, %t_129), kwargs = {}) | |
| %_unsafe_view_145 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_58, [2, 77, 1024]), kwargs = {}) | |
| %_param_constant402 : [#users=1] = get_attr[target=_param_constant402] | |
| %t_130 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant402,), kwargs = {}) | |
| %view_339 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_59 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_339, %t_130), kwargs = {}) | |
| %_unsafe_view_146 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_59, [2, 77, 1024]), kwargs = {}) | |
| %view_340 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_145, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_96 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_340, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_86 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_96,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_147 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_86, [16, 77, 128]), kwargs = {}) | |
| %view_341 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_146, [2, 77, 8, 128]), kwargs = {}) | |
| %permute_97 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_341, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_87 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_97,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_148 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_87, [16, 77, 128]), kwargs = {}) | |
| %empty_19 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 256, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_19 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_147, -1, -2), kwargs = {}) | |
| %baddbmm_19 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_19, %_unsafe_view_144, %transpose_19), kwargs = {beta: 0, alpha: 0.08838834764831845}) | |
| %_softmax_19 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_19, -1, False), kwargs = {}) | |
| %detach_91 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_19,), kwargs = {}) | |
| %bmm_29 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_19, %_unsafe_view_148), kwargs = {}) | |
| %view_342 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_29, [2, 8, 256, 128]), kwargs = {}) | |
| %permute_98 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_342, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_88 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_98,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_149 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_88, [2, 256, 1024]), kwargs = {}) | |
| %_param_constant403 : [#users=1] = get_attr[target=_param_constant403] | |
| %t_131 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant403,), kwargs = {}) | |
| %view_343 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_149, [512, 1024]), kwargs = {}) | |
| %_param_constant404 : [#users=1] = get_attr[target=_param_constant404] | |
| %addmm_61 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant404, %view_343, %t_131), kwargs = {}) | |
| %view_344 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_61, [2, 256, 1024]), kwargs = {}) | |
| %add_204 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_344, %add_201), kwargs = {}) | |
| %convert_element_type_202 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_204, torch.float32), kwargs = {}) | |
| %var_mean_65 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_202, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_130 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_65, 0), kwargs = {}) | |
| %getitem_131 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_65, 1), kwargs = {}) | |
| %add_205 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_130, 1e-05), kwargs = {}) | |
| %rsqrt_65 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_205,), kwargs = {}) | |
| %sub_65 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_204, %getitem_131), kwargs = {}) | |
| %mul_142 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_65, %rsqrt_65), kwargs = {}) | |
| %_param_constant405 : [#users=1] = get_attr[target=_param_constant405] | |
| %mul_143 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_142, %_param_constant405), kwargs = {}) | |
| %_param_constant406 : [#users=1] = get_attr[target=_param_constant406] | |
| %add_206 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_143, %_param_constant406), kwargs = {}) | |
| %convert_element_type_203 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_206, torch.float16), kwargs = {}) | |
| %_param_constant407 : [#users=1] = get_attr[target=_param_constant407] | |
| %t_132 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant407,), kwargs = {}) | |
| %view_345 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_203, [512, 1024]), kwargs = {}) | |
| %_param_constant408 : [#users=1] = get_attr[target=_param_constant408] | |
| %addmm_62 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant408, %view_345, %t_132), kwargs = {}) | |
| %view_346 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_62, [2, 256, 8192]), kwargs = {}) | |
| %slice_51 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_346, -1, 0, 4096), kwargs = {}) | |
| %slice_52 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_346, -1, 4096, 8192), kwargs = {}) | |
| %gelu_9 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_52,), kwargs = {}) | |
| %mul_144 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_51, %gelu_9), kwargs = {}) | |
| %_param_constant409 : [#users=1] = get_attr[target=_param_constant409] | |
| %t_133 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant409,), kwargs = {}) | |
| %view_347 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_144, [512, 4096]), kwargs = {}) | |
| %_param_constant410 : [#users=1] = get_attr[target=_param_constant410] | |
| %addmm_63 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant410, %view_347, %t_133), kwargs = {}) | |
| %view_348 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_63, [2, 256, 1024]), kwargs = {}) | |
| %add_207 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_348, %add_204), kwargs = {}) | |
| %_param_constant411 : [#users=1] = get_attr[target=_param_constant411] | |
| %t_134 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant411,), kwargs = {}) | |
| %view_349 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_207, [512, 1024]), kwargs = {}) | |
| %_param_constant412 : [#users=1] = get_attr[target=_param_constant412] | |
| %addmm_64 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant412, %view_349, %t_134), kwargs = {}) | |
| %view_350 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_64, [2, 256, 1024]), kwargs = {}) | |
| %view_351 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_350, [2, 16, 16, 1024]), kwargs = {}) | |
| %permute_99 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_351, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_89 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_99,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_208 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_89, %div_13), kwargs = {}) | |
| %upsample_nearest2d : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%add_208, [32, 32], 2.0, 2.0), kwargs = {}) | |
| %_param_constant413 : [#users=1] = get_attr[target=_param_constant413] | |
| %_param_constant414 : [#users=1] = get_attr[target=_param_constant414] | |
| %convolution_35 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d, %_param_constant413, %_param_constant414, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %cat_5 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_35, %add_88], 1), kwargs = {}) | |
| %view_352 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_5, [2, 32, 48, 1024]), kwargs = {}) | |
| %convert_element_type_204 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_352, torch.float32), kwargs = {}) | |
| %var_mean_66 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_204, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_132 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_66, 0), kwargs = {}) | |
| %getitem_133 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_66, 1), kwargs = {}) | |
| %add_209 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_132, 1e-05), kwargs = {}) | |
| %rsqrt_66 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_209,), kwargs = {}) | |
| %sub_66 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_352, %getitem_133), kwargs = {}) | |
| %mul_145 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_66, %rsqrt_66), kwargs = {}) | |
| %view_353 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_145, [2, 1536, 32, 32]), kwargs = {}) | |
| %_param_constant415 : [#users=1] = get_attr[target=_param_constant415] | |
| %unsqueeze_244 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant415, 0), kwargs = {}) | |
| %unsqueeze_245 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_244, 2), kwargs = {}) | |
| %unsqueeze_246 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_245, 3), kwargs = {}) | |
| %_param_constant416 : [#users=1] = get_attr[target=_param_constant416] | |
| %unsqueeze_247 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant416, 0), kwargs = {}) | |
| %unsqueeze_248 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_247, 2), kwargs = {}) | |
| %unsqueeze_249 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_248, 3), kwargs = {}) | |
| %mul_146 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_353, %unsqueeze_249), kwargs = {}) | |
| %add_210 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_146, %unsqueeze_246), kwargs = {}) | |
| %convert_element_type_205 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_210, torch.float16), kwargs = {}) | |
| %convert_element_type_206 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_133, torch.float16), kwargs = {}) | |
| %convert_element_type_207 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_66, torch.float16), kwargs = {}) | |
| %squeeze_144 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_206, 3), kwargs = {}) | |
| %squeeze_145 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_144, 2), kwargs = {}) | |
| %squeeze_146 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_207, 3), kwargs = {}) | |
| %squeeze_147 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_146, 2), kwargs = {}) | |
| %detach_92 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_145,), kwargs = {}) | |
| %detach_93 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_147,), kwargs = {}) | |
| %silu_40 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_205,), kwargs = {}) | |
| %_param_constant417 : [#users=1] = get_attr[target=_param_constant417] | |
| %_param_constant418 : [#users=1] = get_attr[target=_param_constant418] | |
| %convolution_36 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_40, %_param_constant417, %_param_constant418, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_41 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant419 : [#users=1] = get_attr[target=_param_constant419] | |
| %t_135 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant419,), kwargs = {}) | |
| %_param_constant420 : [#users=1] = get_attr[target=_param_constant420] | |
| %addmm_65 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant420, %silu_41, %t_135), kwargs = {}) | |
| %slice_53 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_65, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_54 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_53, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_250 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_54, 2), kwargs = {}) | |
| %unsqueeze_251 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_250, 3), kwargs = {}) | |
| %add_211 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_36, %unsqueeze_251), kwargs = {}) | |
| %view_354 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_211, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_208 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_354, torch.float32), kwargs = {}) | |
| %var_mean_67 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_208, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_134 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_67, 0), kwargs = {}) | |
| %getitem_135 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_67, 1), kwargs = {}) | |
| %add_212 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_134, 1e-05), kwargs = {}) | |
| %rsqrt_67 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_212,), kwargs = {}) | |
| %sub_67 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_354, %getitem_135), kwargs = {}) | |
| %mul_147 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_67, %rsqrt_67), kwargs = {}) | |
| %view_355 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_147, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant421 : [#users=1] = get_attr[target=_param_constant421] | |
| %unsqueeze_252 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant421, 0), kwargs = {}) | |
| %unsqueeze_253 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_252, 2), kwargs = {}) | |
| %unsqueeze_254 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_253, 3), kwargs = {}) | |
| %_param_constant422 : [#users=1] = get_attr[target=_param_constant422] | |
| %unsqueeze_255 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant422, 0), kwargs = {}) | |
| %unsqueeze_256 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_255, 2), kwargs = {}) | |
| %unsqueeze_257 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_256, 3), kwargs = {}) | |
| %mul_148 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_355, %unsqueeze_257), kwargs = {}) | |
| %add_213 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_148, %unsqueeze_254), kwargs = {}) | |
| %convert_element_type_209 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_213, torch.float16), kwargs = {}) | |
| %convert_element_type_210 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_135, torch.float16), kwargs = {}) | |
| %convert_element_type_211 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_67, torch.float16), kwargs = {}) | |
| %squeeze_148 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_210, 3), kwargs = {}) | |
| %squeeze_149 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_148, 2), kwargs = {}) | |
| %squeeze_150 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_211, 3), kwargs = {}) | |
| %squeeze_151 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_150, 2), kwargs = {}) | |
| %detach_94 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_149,), kwargs = {}) | |
| %detach_95 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_151,), kwargs = {}) | |
| %silu_42 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_209,), kwargs = {}) | |
| %_param_constant423 : [#users=1] = get_attr[target=_param_constant423] | |
| %_param_constant424 : [#users=1] = get_attr[target=_param_constant424] | |
| %convolution_37 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_42, %_param_constant423, %_param_constant424, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant425 : [#users=1] = get_attr[target=_param_constant425] | |
| %_param_constant426 : [#users=1] = get_attr[target=_param_constant426] | |
| %convolution_38 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_5, %_param_constant425, %_param_constant426, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_214 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_38, %convolution_37), kwargs = {}) | |
| %div_14 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_214, 1.0), kwargs = {}) | |
| %view_356 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_14, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_212 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_356, torch.float32), kwargs = {}) | |
| %var_mean_68 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_212, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_136 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_68, 0), kwargs = {}) | |
| %getitem_137 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_68, 1), kwargs = {}) | |
| %add_215 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_136, 1e-06), kwargs = {}) | |
| %rsqrt_68 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_215,), kwargs = {}) | |
| %sub_68 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_356, %getitem_137), kwargs = {}) | |
| %mul_149 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_68, %rsqrt_68), kwargs = {}) | |
| %view_357 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_149, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant427 : [#users=1] = get_attr[target=_param_constant427] | |
| %unsqueeze_258 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant427, 0), kwargs = {}) | |
| %unsqueeze_259 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_258, 2), kwargs = {}) | |
| %unsqueeze_260 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_259, 3), kwargs = {}) | |
| %_param_constant428 : [#users=1] = get_attr[target=_param_constant428] | |
| %unsqueeze_261 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant428, 0), kwargs = {}) | |
| %unsqueeze_262 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_261, 2), kwargs = {}) | |
| %unsqueeze_263 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_262, 3), kwargs = {}) | |
| %mul_150 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_357, %unsqueeze_263), kwargs = {}) | |
| %add_216 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_150, %unsqueeze_260), kwargs = {}) | |
| %convert_element_type_213 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_216, torch.float16), kwargs = {}) | |
| %convert_element_type_214 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_137, torch.float16), kwargs = {}) | |
| %convert_element_type_215 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_68, torch.float16), kwargs = {}) | |
| %squeeze_152 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_214, 3), kwargs = {}) | |
| %squeeze_153 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_152, 2), kwargs = {}) | |
| %squeeze_154 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_215, 3), kwargs = {}) | |
| %squeeze_155 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_154, 2), kwargs = {}) | |
| %detach_96 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_153,), kwargs = {}) | |
| %detach_97 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_155,), kwargs = {}) | |
| %permute_100 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_213, [0, 2, 3, 1]), kwargs = {}) | |
| %view_358 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_100, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant429 : [#users=1] = get_attr[target=_param_constant429] | |
| %t_136 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant429,), kwargs = {}) | |
| %expand_21 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_358, [2, 1024, 512]), kwargs = {}) | |
| %view_359 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_21, [2, 1024, 512]), kwargs = {}) | |
| %expand_22 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_136, [2, 512, 512]), kwargs = {}) | |
| %view_360 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_22, [2, 512, 512]), kwargs = {}) | |
| %bmm_30 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_359, %view_360), kwargs = {}) | |
| %_unsafe_view_150 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_30, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant430 : [#users=1] = get_attr[target=_param_constant430] | |
| %add_217 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_150, %_param_constant430), kwargs = {}) | |
| %convert_element_type_216 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_217, torch.float32), kwargs = {}) | |
| %var_mean_69 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_216, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_138 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_69, 0), kwargs = {}) | |
| %getitem_139 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_69, 1), kwargs = {}) | |
| %add_218 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_138, 1e-05), kwargs = {}) | |
| %rsqrt_69 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_218,), kwargs = {}) | |
| %sub_69 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_217, %getitem_139), kwargs = {}) | |
| %mul_151 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_69, %rsqrt_69), kwargs = {}) | |
| %_param_constant431 : [#users=1] = get_attr[target=_param_constant431] | |
| %mul_152 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_151, %_param_constant431), kwargs = {}) | |
| %_param_constant432 : [#users=1] = get_attr[target=_param_constant432] | |
| %add_219 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_152, %_param_constant432), kwargs = {}) | |
| %convert_element_type_217 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_219, torch.float16), kwargs = {}) | |
| %_param_constant433 : [#users=1] = get_attr[target=_param_constant433] | |
| %t_137 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant433,), kwargs = {}) | |
| %view_361 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_217, [2048, 512]), kwargs = {}) | |
| %mm_60 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_361, %t_137), kwargs = {}) | |
| %_unsafe_view_151 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_60, [2, 1024, 512]), kwargs = {}) | |
| %view_362 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_151, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_101 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_362, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_90 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_101,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_152 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_90, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant434 : [#users=1] = get_attr[target=_param_constant434] | |
| %t_138 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant434,), kwargs = {}) | |
| %view_363 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_61 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_363, %t_138), kwargs = {}) | |
| %_unsafe_view_153 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_61, [2, 77, 512]), kwargs = {}) | |
| %_param_constant435 : [#users=1] = get_attr[target=_param_constant435] | |
| %t_139 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant435,), kwargs = {}) | |
| %view_364 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_62 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_364, %t_139), kwargs = {}) | |
| %_unsafe_view_154 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_62, [2, 77, 512]), kwargs = {}) | |
| %view_365 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_153, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_102 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_365, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_91 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_102,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_155 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_91, [16, 77, 64]), kwargs = {}) | |
| %view_366 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_154, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_103 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_366, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_92 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_103,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_156 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_92, [16, 77, 64]), kwargs = {}) | |
| %empty_20 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_20 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_155, -1, -2), kwargs = {}) | |
| %baddbmm_20 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_20, %_unsafe_view_152, %transpose_20), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_20 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_20, -1, False), kwargs = {}) | |
| %detach_98 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_20,), kwargs = {}) | |
| %bmm_31 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_20, %_unsafe_view_156), kwargs = {}) | |
| %view_367 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_31, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_104 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_367, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_93 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_104,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_157 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_93, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant436 : [#users=1] = get_attr[target=_param_constant436] | |
| %t_140 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant436,), kwargs = {}) | |
| %view_368 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_157, [2048, 512]), kwargs = {}) | |
| %_param_constant437 : [#users=1] = get_attr[target=_param_constant437] | |
| %addmm_66 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant437, %view_368, %t_140), kwargs = {}) | |
| %view_369 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_66, [2, 1024, 512]), kwargs = {}) | |
| %add_220 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_369, %add_217), kwargs = {}) | |
| %convert_element_type_218 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_220, torch.float32), kwargs = {}) | |
| %var_mean_70 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_218, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_140 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_70, 0), kwargs = {}) | |
| %getitem_141 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_70, 1), kwargs = {}) | |
| %add_221 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_140, 1e-05), kwargs = {}) | |
| %rsqrt_70 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_221,), kwargs = {}) | |
| %sub_70 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_220, %getitem_141), kwargs = {}) | |
| %mul_153 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_70, %rsqrt_70), kwargs = {}) | |
| %_param_constant438 : [#users=1] = get_attr[target=_param_constant438] | |
| %mul_154 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_153, %_param_constant438), kwargs = {}) | |
| %_param_constant439 : [#users=1] = get_attr[target=_param_constant439] | |
| %add_222 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_154, %_param_constant439), kwargs = {}) | |
| %convert_element_type_219 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_222, torch.float16), kwargs = {}) | |
| %_param_constant440 : [#users=1] = get_attr[target=_param_constant440] | |
| %t_141 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant440,), kwargs = {}) | |
| %view_370 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_219, [2048, 512]), kwargs = {}) | |
| %mm_63 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_370, %t_141), kwargs = {}) | |
| %_unsafe_view_158 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_63, [2, 1024, 512]), kwargs = {}) | |
| %view_371 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_158, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_105 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_371, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_94 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_105,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_159 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_94, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant441 : [#users=1] = get_attr[target=_param_constant441] | |
| %t_142 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant441,), kwargs = {}) | |
| %view_372 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_64 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_372, %t_142), kwargs = {}) | |
| %_unsafe_view_160 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_64, [2, 77, 512]), kwargs = {}) | |
| %_param_constant442 : [#users=1] = get_attr[target=_param_constant442] | |
| %t_143 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant442,), kwargs = {}) | |
| %view_373 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_65 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_373, %t_143), kwargs = {}) | |
| %_unsafe_view_161 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_65, [2, 77, 512]), kwargs = {}) | |
| %view_374 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_160, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_106 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_374, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_95 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_106,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_162 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_95, [16, 77, 64]), kwargs = {}) | |
| %view_375 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_161, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_107 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_375, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_96 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_107,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_163 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_96, [16, 77, 64]), kwargs = {}) | |
| %empty_21 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_21 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_162, -1, -2), kwargs = {}) | |
| %baddbmm_21 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_21, %_unsafe_view_159, %transpose_21), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_21 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_21, -1, False), kwargs = {}) | |
| %detach_99 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_21,), kwargs = {}) | |
| %bmm_32 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_21, %_unsafe_view_163), kwargs = {}) | |
| %view_376 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_32, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_108 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_376, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_97 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_108,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_164 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_97, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant443 : [#users=1] = get_attr[target=_param_constant443] | |
| %t_144 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant443,), kwargs = {}) | |
| %view_377 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_164, [2048, 512]), kwargs = {}) | |
| %_param_constant444 : [#users=1] = get_attr[target=_param_constant444] | |
| %addmm_67 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant444, %view_377, %t_144), kwargs = {}) | |
| %view_378 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_67, [2, 1024, 512]), kwargs = {}) | |
| %add_223 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_378, %add_220), kwargs = {}) | |
| %convert_element_type_220 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_223, torch.float32), kwargs = {}) | |
| %var_mean_71 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_220, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_142 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_71, 0), kwargs = {}) | |
| %getitem_143 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_71, 1), kwargs = {}) | |
| %add_224 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_142, 1e-05), kwargs = {}) | |
| %rsqrt_71 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_224,), kwargs = {}) | |
| %sub_71 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_223, %getitem_143), kwargs = {}) | |
| %mul_155 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_71, %rsqrt_71), kwargs = {}) | |
| %_param_constant445 : [#users=1] = get_attr[target=_param_constant445] | |
| %mul_156 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_155, %_param_constant445), kwargs = {}) | |
| %_param_constant446 : [#users=1] = get_attr[target=_param_constant446] | |
| %add_225 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_156, %_param_constant446), kwargs = {}) | |
| %convert_element_type_221 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_225, torch.float16), kwargs = {}) | |
| %_param_constant447 : [#users=1] = get_attr[target=_param_constant447] | |
| %t_145 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant447,), kwargs = {}) | |
| %view_379 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_221, [2048, 512]), kwargs = {}) | |
| %_param_constant448 : [#users=1] = get_attr[target=_param_constant448] | |
| %addmm_68 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant448, %view_379, %t_145), kwargs = {}) | |
| %view_380 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_68, [2, 1024, 4096]), kwargs = {}) | |
| %slice_55 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_380, -1, 0, 2048), kwargs = {}) | |
| %slice_56 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_380, -1, 2048, 4096), kwargs = {}) | |
| %gelu_10 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_56,), kwargs = {}) | |
| %mul_157 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_55, %gelu_10), kwargs = {}) | |
| %_param_constant449 : [#users=1] = get_attr[target=_param_constant449] | |
| %t_146 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant449,), kwargs = {}) | |
| %view_381 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_157, [2048, 2048]), kwargs = {}) | |
| %_param_constant450 : [#users=1] = get_attr[target=_param_constant450] | |
| %addmm_69 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant450, %view_381, %t_146), kwargs = {}) | |
| %view_382 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_69, [2, 1024, 512]), kwargs = {}) | |
| %add_226 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_382, %add_223), kwargs = {}) | |
| %_param_constant451 : [#users=1] = get_attr[target=_param_constant451] | |
| %t_147 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant451,), kwargs = {}) | |
| %view_383 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_226, [2048, 512]), kwargs = {}) | |
| %_param_constant452 : [#users=1] = get_attr[target=_param_constant452] | |
| %addmm_70 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant452, %view_383, %t_147), kwargs = {}) | |
| %view_384 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_70, [2, 1024, 512]), kwargs = {}) | |
| %view_385 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_384, [2, 32, 32, 512]), kwargs = {}) | |
| %permute_109 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_385, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_98 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_109,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_227 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_98, %div_14), kwargs = {}) | |
| %cat_6 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_227, %add_69], 1), kwargs = {}) | |
| %view_386 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_6, [2, 32, 32, 1024]), kwargs = {}) | |
| %convert_element_type_222 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_386, torch.float32), kwargs = {}) | |
| %var_mean_72 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_222, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_144 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_72, 0), kwargs = {}) | |
| %getitem_145 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_72, 1), kwargs = {}) | |
| %add_228 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_144, 1e-05), kwargs = {}) | |
| %rsqrt_72 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_228,), kwargs = {}) | |
| %sub_72 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_386, %getitem_145), kwargs = {}) | |
| %mul_158 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_72, %rsqrt_72), kwargs = {}) | |
| %view_387 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_158, [2, 1024, 32, 32]), kwargs = {}) | |
| %_param_constant453 : [#users=1] = get_attr[target=_param_constant453] | |
| %unsqueeze_264 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant453, 0), kwargs = {}) | |
| %unsqueeze_265 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_264, 2), kwargs = {}) | |
| %unsqueeze_266 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_265, 3), kwargs = {}) | |
| %_param_constant454 : [#users=1] = get_attr[target=_param_constant454] | |
| %unsqueeze_267 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant454, 0), kwargs = {}) | |
| %unsqueeze_268 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_267, 2), kwargs = {}) | |
| %unsqueeze_269 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_268, 3), kwargs = {}) | |
| %mul_159 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_387, %unsqueeze_269), kwargs = {}) | |
| %add_229 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_159, %unsqueeze_266), kwargs = {}) | |
| %convert_element_type_223 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_229, torch.float16), kwargs = {}) | |
| %convert_element_type_224 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_145, torch.float16), kwargs = {}) | |
| %convert_element_type_225 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_72, torch.float16), kwargs = {}) | |
| %squeeze_156 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_224, 3), kwargs = {}) | |
| %squeeze_157 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_156, 2), kwargs = {}) | |
| %squeeze_158 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_225, 3), kwargs = {}) | |
| %squeeze_159 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_158, 2), kwargs = {}) | |
| %detach_100 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_157,), kwargs = {}) | |
| %detach_101 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_159,), kwargs = {}) | |
| %silu_43 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_223,), kwargs = {}) | |
| %_param_constant455 : [#users=1] = get_attr[target=_param_constant455] | |
| %_param_constant456 : [#users=1] = get_attr[target=_param_constant456] | |
| %convolution_39 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_43, %_param_constant455, %_param_constant456, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_44 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant457 : [#users=1] = get_attr[target=_param_constant457] | |
| %t_148 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant457,), kwargs = {}) | |
| %_param_constant458 : [#users=1] = get_attr[target=_param_constant458] | |
| %addmm_71 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant458, %silu_44, %t_148), kwargs = {}) | |
| %slice_57 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_71, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_58 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_57, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_270 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_58, 2), kwargs = {}) | |
| %unsqueeze_271 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_270, 3), kwargs = {}) | |
| %add_230 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_39, %unsqueeze_271), kwargs = {}) | |
| %view_388 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_230, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_226 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_388, torch.float32), kwargs = {}) | |
| %var_mean_73 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_226, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_146 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_73, 0), kwargs = {}) | |
| %getitem_147 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_73, 1), kwargs = {}) | |
| %add_231 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_146, 1e-05), kwargs = {}) | |
| %rsqrt_73 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_231,), kwargs = {}) | |
| %sub_73 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_388, %getitem_147), kwargs = {}) | |
| %mul_160 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_73, %rsqrt_73), kwargs = {}) | |
| %view_389 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_160, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant459 : [#users=1] = get_attr[target=_param_constant459] | |
| %unsqueeze_272 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant459, 0), kwargs = {}) | |
| %unsqueeze_273 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_272, 2), kwargs = {}) | |
| %unsqueeze_274 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_273, 3), kwargs = {}) | |
| %_param_constant460 : [#users=1] = get_attr[target=_param_constant460] | |
| %unsqueeze_275 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant460, 0), kwargs = {}) | |
| %unsqueeze_276 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_275, 2), kwargs = {}) | |
| %unsqueeze_277 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_276, 3), kwargs = {}) | |
| %mul_161 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_389, %unsqueeze_277), kwargs = {}) | |
| %add_232 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_161, %unsqueeze_274), kwargs = {}) | |
| %convert_element_type_227 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_232, torch.float16), kwargs = {}) | |
| %convert_element_type_228 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_147, torch.float16), kwargs = {}) | |
| %convert_element_type_229 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_73, torch.float16), kwargs = {}) | |
| %squeeze_160 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_228, 3), kwargs = {}) | |
| %squeeze_161 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_160, 2), kwargs = {}) | |
| %squeeze_162 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_229, 3), kwargs = {}) | |
| %squeeze_163 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_162, 2), kwargs = {}) | |
| %detach_102 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_161,), kwargs = {}) | |
| %detach_103 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_163,), kwargs = {}) | |
| %silu_45 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_227,), kwargs = {}) | |
| %_param_constant461 : [#users=1] = get_attr[target=_param_constant461] | |
| %_param_constant462 : [#users=1] = get_attr[target=_param_constant462] | |
| %convolution_40 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_45, %_param_constant461, %_param_constant462, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant463 : [#users=1] = get_attr[target=_param_constant463] | |
| %_param_constant464 : [#users=1] = get_attr[target=_param_constant464] | |
| %convolution_41 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_6, %_param_constant463, %_param_constant464, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_233 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_41, %convolution_40), kwargs = {}) | |
| %div_15 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_233, 1.0), kwargs = {}) | |
| %view_390 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_15, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_230 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_390, torch.float32), kwargs = {}) | |
| %var_mean_74 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_230, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_148 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_74, 0), kwargs = {}) | |
| %getitem_149 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_74, 1), kwargs = {}) | |
| %add_234 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_148, 1e-06), kwargs = {}) | |
| %rsqrt_74 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_234,), kwargs = {}) | |
| %sub_74 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_390, %getitem_149), kwargs = {}) | |
| %mul_162 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_74, %rsqrt_74), kwargs = {}) | |
| %view_391 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_162, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant465 : [#users=1] = get_attr[target=_param_constant465] | |
| %unsqueeze_278 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant465, 0), kwargs = {}) | |
| %unsqueeze_279 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_278, 2), kwargs = {}) | |
| %unsqueeze_280 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_279, 3), kwargs = {}) | |
| %_param_constant466 : [#users=1] = get_attr[target=_param_constant466] | |
| %unsqueeze_281 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant466, 0), kwargs = {}) | |
| %unsqueeze_282 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_281, 2), kwargs = {}) | |
| %unsqueeze_283 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_282, 3), kwargs = {}) | |
| %mul_163 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_391, %unsqueeze_283), kwargs = {}) | |
| %add_235 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_163, %unsqueeze_280), kwargs = {}) | |
| %convert_element_type_231 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_235, torch.float16), kwargs = {}) | |
| %convert_element_type_232 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_149, torch.float16), kwargs = {}) | |
| %convert_element_type_233 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_74, torch.float16), kwargs = {}) | |
| %squeeze_164 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_232, 3), kwargs = {}) | |
| %squeeze_165 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_164, 2), kwargs = {}) | |
| %squeeze_166 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_233, 3), kwargs = {}) | |
| %squeeze_167 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_166, 2), kwargs = {}) | |
| %detach_104 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_165,), kwargs = {}) | |
| %detach_105 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_167,), kwargs = {}) | |
| %permute_110 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_231, [0, 2, 3, 1]), kwargs = {}) | |
| %view_392 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_110, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant467 : [#users=1] = get_attr[target=_param_constant467] | |
| %t_149 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant467,), kwargs = {}) | |
| %expand_23 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_392, [2, 1024, 512]), kwargs = {}) | |
| %view_393 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_23, [2, 1024, 512]), kwargs = {}) | |
| %expand_24 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_149, [2, 512, 512]), kwargs = {}) | |
| %view_394 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_24, [2, 512, 512]), kwargs = {}) | |
| %bmm_33 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_393, %view_394), kwargs = {}) | |
| %_unsafe_view_165 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_33, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant468 : [#users=1] = get_attr[target=_param_constant468] | |
| %add_236 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_165, %_param_constant468), kwargs = {}) | |
| %convert_element_type_234 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_236, torch.float32), kwargs = {}) | |
| %var_mean_75 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_234, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_150 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_75, 0), kwargs = {}) | |
| %getitem_151 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_75, 1), kwargs = {}) | |
| %add_237 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_150, 1e-05), kwargs = {}) | |
| %rsqrt_75 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_237,), kwargs = {}) | |
| %sub_75 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_236, %getitem_151), kwargs = {}) | |
| %mul_164 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_75, %rsqrt_75), kwargs = {}) | |
| %_param_constant469 : [#users=1] = get_attr[target=_param_constant469] | |
| %mul_165 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_164, %_param_constant469), kwargs = {}) | |
| %_param_constant470 : [#users=1] = get_attr[target=_param_constant470] | |
| %add_238 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_165, %_param_constant470), kwargs = {}) | |
| %convert_element_type_235 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_238, torch.float16), kwargs = {}) | |
| %_param_constant471 : [#users=1] = get_attr[target=_param_constant471] | |
| %t_150 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant471,), kwargs = {}) | |
| %view_395 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_235, [2048, 512]), kwargs = {}) | |
| %mm_66 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_395, %t_150), kwargs = {}) | |
| %_unsafe_view_166 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_66, [2, 1024, 512]), kwargs = {}) | |
| %view_396 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_166, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_111 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_396, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_99 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_111,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_167 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_99, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant472 : [#users=1] = get_attr[target=_param_constant472] | |
| %t_151 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant472,), kwargs = {}) | |
| %view_397 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_67 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_397, %t_151), kwargs = {}) | |
| %_unsafe_view_168 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_67, [2, 77, 512]), kwargs = {}) | |
| %_param_constant473 : [#users=1] = get_attr[target=_param_constant473] | |
| %t_152 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant473,), kwargs = {}) | |
| %view_398 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_68 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_398, %t_152), kwargs = {}) | |
| %_unsafe_view_169 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_68, [2, 77, 512]), kwargs = {}) | |
| %view_399 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_168, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_112 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_399, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_100 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_112,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_170 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_100, [16, 77, 64]), kwargs = {}) | |
| %view_400 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_169, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_113 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_400, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_101 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_113,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_171 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_101, [16, 77, 64]), kwargs = {}) | |
| %empty_22 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_22 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_170, -1, -2), kwargs = {}) | |
| %baddbmm_22 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_22, %_unsafe_view_167, %transpose_22), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_22 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_22, -1, False), kwargs = {}) | |
| %detach_106 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_22,), kwargs = {}) | |
| %bmm_34 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_22, %_unsafe_view_171), kwargs = {}) | |
| %view_401 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_34, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_114 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_401, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_102 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_114,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_172 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_102, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant474 : [#users=1] = get_attr[target=_param_constant474] | |
| %t_153 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant474,), kwargs = {}) | |
| %view_402 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_172, [2048, 512]), kwargs = {}) | |
| %_param_constant475 : [#users=1] = get_attr[target=_param_constant475] | |
| %addmm_72 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant475, %view_402, %t_153), kwargs = {}) | |
| %view_403 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_72, [2, 1024, 512]), kwargs = {}) | |
| %add_239 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_403, %add_236), kwargs = {}) | |
| %convert_element_type_236 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_239, torch.float32), kwargs = {}) | |
| %var_mean_76 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_236, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_152 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_76, 0), kwargs = {}) | |
| %getitem_153 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_76, 1), kwargs = {}) | |
| %add_240 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_152, 1e-05), kwargs = {}) | |
| %rsqrt_76 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_240,), kwargs = {}) | |
| %sub_76 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_239, %getitem_153), kwargs = {}) | |
| %mul_166 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_76, %rsqrt_76), kwargs = {}) | |
| %_param_constant476 : [#users=1] = get_attr[target=_param_constant476] | |
| %mul_167 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_166, %_param_constant476), kwargs = {}) | |
| %_param_constant477 : [#users=1] = get_attr[target=_param_constant477] | |
| %add_241 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_167, %_param_constant477), kwargs = {}) | |
| %convert_element_type_237 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_241, torch.float16), kwargs = {}) | |
| %_param_constant478 : [#users=1] = get_attr[target=_param_constant478] | |
| %t_154 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant478,), kwargs = {}) | |
| %view_404 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_237, [2048, 512]), kwargs = {}) | |
| %mm_69 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_404, %t_154), kwargs = {}) | |
| %_unsafe_view_173 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_69, [2, 1024, 512]), kwargs = {}) | |
| %view_405 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_173, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_115 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_405, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_103 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_115,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_174 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_103, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant479 : [#users=1] = get_attr[target=_param_constant479] | |
| %t_155 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant479,), kwargs = {}) | |
| %view_406 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_70 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_406, %t_155), kwargs = {}) | |
| %_unsafe_view_175 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_70, [2, 77, 512]), kwargs = {}) | |
| %_param_constant480 : [#users=1] = get_attr[target=_param_constant480] | |
| %t_156 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant480,), kwargs = {}) | |
| %view_407 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_71 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_407, %t_156), kwargs = {}) | |
| %_unsafe_view_176 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_71, [2, 77, 512]), kwargs = {}) | |
| %view_408 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_175, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_116 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_408, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_104 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_116,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_177 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_104, [16, 77, 64]), kwargs = {}) | |
| %view_409 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_176, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_117 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_409, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_105 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_117,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_178 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_105, [16, 77, 64]), kwargs = {}) | |
| %empty_23 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_23 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_177, -1, -2), kwargs = {}) | |
| %baddbmm_23 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_23, %_unsafe_view_174, %transpose_23), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_23 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_23, -1, False), kwargs = {}) | |
| %detach_107 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_23,), kwargs = {}) | |
| %bmm_35 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_23, %_unsafe_view_178), kwargs = {}) | |
| %view_410 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_35, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_118 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_410, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_106 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_118,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_179 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_106, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant481 : [#users=1] = get_attr[target=_param_constant481] | |
| %t_157 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant481,), kwargs = {}) | |
| %view_411 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_179, [2048, 512]), kwargs = {}) | |
| %_param_constant482 : [#users=1] = get_attr[target=_param_constant482] | |
| %addmm_73 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant482, %view_411, %t_157), kwargs = {}) | |
| %view_412 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_73, [2, 1024, 512]), kwargs = {}) | |
| %add_242 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_412, %add_239), kwargs = {}) | |
| %convert_element_type_238 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_242, torch.float32), kwargs = {}) | |
| %var_mean_77 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_238, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_154 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_77, 0), kwargs = {}) | |
| %getitem_155 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_77, 1), kwargs = {}) | |
| %add_243 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_154, 1e-05), kwargs = {}) | |
| %rsqrt_77 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_243,), kwargs = {}) | |
| %sub_77 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_242, %getitem_155), kwargs = {}) | |
| %mul_168 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_77, %rsqrt_77), kwargs = {}) | |
| %_param_constant483 : [#users=1] = get_attr[target=_param_constant483] | |
| %mul_169 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_168, %_param_constant483), kwargs = {}) | |
| %_param_constant484 : [#users=1] = get_attr[target=_param_constant484] | |
| %add_244 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_169, %_param_constant484), kwargs = {}) | |
| %convert_element_type_239 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_244, torch.float16), kwargs = {}) | |
| %_param_constant485 : [#users=1] = get_attr[target=_param_constant485] | |
| %t_158 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant485,), kwargs = {}) | |
| %view_413 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_239, [2048, 512]), kwargs = {}) | |
| %_param_constant486 : [#users=1] = get_attr[target=_param_constant486] | |
| %addmm_74 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant486, %view_413, %t_158), kwargs = {}) | |
| %view_414 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_74, [2, 1024, 4096]), kwargs = {}) | |
| %slice_59 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_414, -1, 0, 2048), kwargs = {}) | |
| %slice_60 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_414, -1, 2048, 4096), kwargs = {}) | |
| %gelu_11 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_60,), kwargs = {}) | |
| %mul_170 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_59, %gelu_11), kwargs = {}) | |
| %_param_constant487 : [#users=1] = get_attr[target=_param_constant487] | |
| %t_159 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant487,), kwargs = {}) | |
| %view_415 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_170, [2048, 2048]), kwargs = {}) | |
| %_param_constant488 : [#users=1] = get_attr[target=_param_constant488] | |
| %addmm_75 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant488, %view_415, %t_159), kwargs = {}) | |
| %view_416 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_75, [2, 1024, 512]), kwargs = {}) | |
| %add_245 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_416, %add_242), kwargs = {}) | |
| %_param_constant489 : [#users=1] = get_attr[target=_param_constant489] | |
| %t_160 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant489,), kwargs = {}) | |
| %view_417 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_245, [2048, 512]), kwargs = {}) | |
| %_param_constant490 : [#users=1] = get_attr[target=_param_constant490] | |
| %addmm_76 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant490, %view_417, %t_160), kwargs = {}) | |
| %view_418 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_76, [2, 1024, 512]), kwargs = {}) | |
| %view_419 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_418, [2, 32, 32, 512]), kwargs = {}) | |
| %permute_119 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_419, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_107 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_119,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_246 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_107, %div_15), kwargs = {}) | |
| %cat_7 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_246, %convolution_11], 1), kwargs = {}) | |
| %view_420 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_7, [2, 32, 32, 1024]), kwargs = {}) | |
| %convert_element_type_240 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_420, torch.float32), kwargs = {}) | |
| %var_mean_78 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_240, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_156 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_78, 0), kwargs = {}) | |
| %getitem_157 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_78, 1), kwargs = {}) | |
| %add_247 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_156, 1e-05), kwargs = {}) | |
| %rsqrt_78 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_247,), kwargs = {}) | |
| %sub_78 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_420, %getitem_157), kwargs = {}) | |
| %mul_171 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_78, %rsqrt_78), kwargs = {}) | |
| %view_421 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_171, [2, 1024, 32, 32]), kwargs = {}) | |
| %_param_constant491 : [#users=1] = get_attr[target=_param_constant491] | |
| %unsqueeze_284 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant491, 0), kwargs = {}) | |
| %unsqueeze_285 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_284, 2), kwargs = {}) | |
| %unsqueeze_286 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_285, 3), kwargs = {}) | |
| %_param_constant492 : [#users=1] = get_attr[target=_param_constant492] | |
| %unsqueeze_287 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant492, 0), kwargs = {}) | |
| %unsqueeze_288 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_287, 2), kwargs = {}) | |
| %unsqueeze_289 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_288, 3), kwargs = {}) | |
| %mul_172 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_421, %unsqueeze_289), kwargs = {}) | |
| %add_248 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_172, %unsqueeze_286), kwargs = {}) | |
| %convert_element_type_241 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_248, torch.float16), kwargs = {}) | |
| %convert_element_type_242 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_157, torch.float16), kwargs = {}) | |
| %convert_element_type_243 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_78, torch.float16), kwargs = {}) | |
| %squeeze_168 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_242, 3), kwargs = {}) | |
| %squeeze_169 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_168, 2), kwargs = {}) | |
| %squeeze_170 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_243, 3), kwargs = {}) | |
| %squeeze_171 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_170, 2), kwargs = {}) | |
| %detach_108 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_169,), kwargs = {}) | |
| %detach_109 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_171,), kwargs = {}) | |
| %silu_46 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_241,), kwargs = {}) | |
| %_param_constant493 : [#users=1] = get_attr[target=_param_constant493] | |
| %_param_constant494 : [#users=1] = get_attr[target=_param_constant494] | |
| %convolution_42 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_46, %_param_constant493, %_param_constant494, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_47 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant495 : [#users=1] = get_attr[target=_param_constant495] | |
| %t_161 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant495,), kwargs = {}) | |
| %_param_constant496 : [#users=1] = get_attr[target=_param_constant496] | |
| %addmm_77 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant496, %silu_47, %t_161), kwargs = {}) | |
| %slice_61 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_77, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_62 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_61, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_290 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_62, 2), kwargs = {}) | |
| %unsqueeze_291 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_290, 3), kwargs = {}) | |
| %add_249 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_42, %unsqueeze_291), kwargs = {}) | |
| %view_422 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_249, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_244 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_422, torch.float32), kwargs = {}) | |
| %var_mean_79 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_244, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_158 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_79, 0), kwargs = {}) | |
| %getitem_159 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_79, 1), kwargs = {}) | |
| %add_250 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_158, 1e-05), kwargs = {}) | |
| %rsqrt_79 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_250,), kwargs = {}) | |
| %sub_79 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_422, %getitem_159), kwargs = {}) | |
| %mul_173 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_79, %rsqrt_79), kwargs = {}) | |
| %view_423 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_173, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant497 : [#users=1] = get_attr[target=_param_constant497] | |
| %unsqueeze_292 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant497, 0), kwargs = {}) | |
| %unsqueeze_293 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_292, 2), kwargs = {}) | |
| %unsqueeze_294 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_293, 3), kwargs = {}) | |
| %_param_constant498 : [#users=1] = get_attr[target=_param_constant498] | |
| %unsqueeze_295 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant498, 0), kwargs = {}) | |
| %unsqueeze_296 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_295, 2), kwargs = {}) | |
| %unsqueeze_297 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_296, 3), kwargs = {}) | |
| %mul_174 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_423, %unsqueeze_297), kwargs = {}) | |
| %add_251 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_174, %unsqueeze_294), kwargs = {}) | |
| %convert_element_type_245 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_251, torch.float16), kwargs = {}) | |
| %convert_element_type_246 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_159, torch.float16), kwargs = {}) | |
| %convert_element_type_247 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_79, torch.float16), kwargs = {}) | |
| %squeeze_172 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_246, 3), kwargs = {}) | |
| %squeeze_173 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_172, 2), kwargs = {}) | |
| %squeeze_174 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_247, 3), kwargs = {}) | |
| %squeeze_175 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_174, 2), kwargs = {}) | |
| %detach_110 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_173,), kwargs = {}) | |
| %detach_111 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_175,), kwargs = {}) | |
| %silu_48 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_245,), kwargs = {}) | |
| %_param_constant499 : [#users=1] = get_attr[target=_param_constant499] | |
| %_param_constant500 : [#users=1] = get_attr[target=_param_constant500] | |
| %convolution_43 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_48, %_param_constant499, %_param_constant500, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant501 : [#users=1] = get_attr[target=_param_constant501] | |
| %_param_constant502 : [#users=1] = get_attr[target=_param_constant502] | |
| %convolution_44 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_7, %_param_constant501, %_param_constant502, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_252 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_44, %convolution_43), kwargs = {}) | |
| %div_16 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_252, 1.0), kwargs = {}) | |
| %view_424 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_16, [2, 32, 16, 1024]), kwargs = {}) | |
| %convert_element_type_248 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_424, torch.float32), kwargs = {}) | |
| %var_mean_80 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_248, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_160 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_80, 0), kwargs = {}) | |
| %getitem_161 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_80, 1), kwargs = {}) | |
| %add_253 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_160, 1e-06), kwargs = {}) | |
| %rsqrt_80 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_253,), kwargs = {}) | |
| %sub_80 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_424, %getitem_161), kwargs = {}) | |
| %mul_175 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_80, %rsqrt_80), kwargs = {}) | |
| %view_425 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_175, [2, 512, 32, 32]), kwargs = {}) | |
| %_param_constant503 : [#users=1] = get_attr[target=_param_constant503] | |
| %unsqueeze_298 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant503, 0), kwargs = {}) | |
| %unsqueeze_299 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_298, 2), kwargs = {}) | |
| %unsqueeze_300 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_299, 3), kwargs = {}) | |
| %_param_constant504 : [#users=1] = get_attr[target=_param_constant504] | |
| %unsqueeze_301 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant504, 0), kwargs = {}) | |
| %unsqueeze_302 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_301, 2), kwargs = {}) | |
| %unsqueeze_303 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_302, 3), kwargs = {}) | |
| %mul_176 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_425, %unsqueeze_303), kwargs = {}) | |
| %add_254 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_176, %unsqueeze_300), kwargs = {}) | |
| %convert_element_type_249 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_254, torch.float16), kwargs = {}) | |
| %convert_element_type_250 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_161, torch.float16), kwargs = {}) | |
| %convert_element_type_251 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_80, torch.float16), kwargs = {}) | |
| %squeeze_176 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_250, 3), kwargs = {}) | |
| %squeeze_177 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_176, 2), kwargs = {}) | |
| %squeeze_178 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_251, 3), kwargs = {}) | |
| %squeeze_179 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_178, 2), kwargs = {}) | |
| %detach_112 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_177,), kwargs = {}) | |
| %detach_113 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_179,), kwargs = {}) | |
| %permute_120 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_249, [0, 2, 3, 1]), kwargs = {}) | |
| %view_426 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_120, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant505 : [#users=1] = get_attr[target=_param_constant505] | |
| %t_162 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant505,), kwargs = {}) | |
| %expand_25 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_426, [2, 1024, 512]), kwargs = {}) | |
| %view_427 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_25, [2, 1024, 512]), kwargs = {}) | |
| %expand_26 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_162, [2, 512, 512]), kwargs = {}) | |
| %view_428 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_26, [2, 512, 512]), kwargs = {}) | |
| %bmm_36 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_427, %view_428), kwargs = {}) | |
| %_unsafe_view_180 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_36, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant506 : [#users=1] = get_attr[target=_param_constant506] | |
| %add_255 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_180, %_param_constant506), kwargs = {}) | |
| %convert_element_type_252 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_255, torch.float32), kwargs = {}) | |
| %var_mean_81 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_252, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_162 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_81, 0), kwargs = {}) | |
| %getitem_163 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_81, 1), kwargs = {}) | |
| %add_256 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_162, 1e-05), kwargs = {}) | |
| %rsqrt_81 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_256,), kwargs = {}) | |
| %sub_81 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_255, %getitem_163), kwargs = {}) | |
| %mul_177 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_81, %rsqrt_81), kwargs = {}) | |
| %_param_constant507 : [#users=1] = get_attr[target=_param_constant507] | |
| %mul_178 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_177, %_param_constant507), kwargs = {}) | |
| %_param_constant508 : [#users=1] = get_attr[target=_param_constant508] | |
| %add_257 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_178, %_param_constant508), kwargs = {}) | |
| %convert_element_type_253 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_257, torch.float16), kwargs = {}) | |
| %_param_constant509 : [#users=1] = get_attr[target=_param_constant509] | |
| %t_163 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant509,), kwargs = {}) | |
| %view_429 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_253, [2048, 512]), kwargs = {}) | |
| %mm_72 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_429, %t_163), kwargs = {}) | |
| %_unsafe_view_181 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_72, [2, 1024, 512]), kwargs = {}) | |
| %view_430 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_181, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_121 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_430, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_108 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_121,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_182 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_108, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant510 : [#users=1] = get_attr[target=_param_constant510] | |
| %t_164 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant510,), kwargs = {}) | |
| %view_431 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_73 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_431, %t_164), kwargs = {}) | |
| %_unsafe_view_183 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_73, [2, 77, 512]), kwargs = {}) | |
| %_param_constant511 : [#users=1] = get_attr[target=_param_constant511] | |
| %t_165 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant511,), kwargs = {}) | |
| %view_432 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_74 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_432, %t_165), kwargs = {}) | |
| %_unsafe_view_184 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_74, [2, 77, 512]), kwargs = {}) | |
| %view_433 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_183, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_122 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_433, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_109 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_122,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_185 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_109, [16, 77, 64]), kwargs = {}) | |
| %view_434 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_184, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_123 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_434, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_110 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_123,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_186 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_110, [16, 77, 64]), kwargs = {}) | |
| %empty_24 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_24 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_185, -1, -2), kwargs = {}) | |
| %baddbmm_24 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_24, %_unsafe_view_182, %transpose_24), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_24 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_24, -1, False), kwargs = {}) | |
| %detach_114 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_24,), kwargs = {}) | |
| %bmm_37 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_24, %_unsafe_view_186), kwargs = {}) | |
| %view_435 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_37, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_124 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_435, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_111 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_124,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_187 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_111, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant512 : [#users=1] = get_attr[target=_param_constant512] | |
| %t_166 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant512,), kwargs = {}) | |
| %view_436 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_187, [2048, 512]), kwargs = {}) | |
| %_param_constant513 : [#users=1] = get_attr[target=_param_constant513] | |
| %addmm_78 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant513, %view_436, %t_166), kwargs = {}) | |
| %view_437 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_78, [2, 1024, 512]), kwargs = {}) | |
| %add_258 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_437, %add_255), kwargs = {}) | |
| %convert_element_type_254 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_258, torch.float32), kwargs = {}) | |
| %var_mean_82 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_254, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_164 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_82, 0), kwargs = {}) | |
| %getitem_165 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_82, 1), kwargs = {}) | |
| %add_259 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_164, 1e-05), kwargs = {}) | |
| %rsqrt_82 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_259,), kwargs = {}) | |
| %sub_82 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_258, %getitem_165), kwargs = {}) | |
| %mul_179 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_82, %rsqrt_82), kwargs = {}) | |
| %_param_constant514 : [#users=1] = get_attr[target=_param_constant514] | |
| %mul_180 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_179, %_param_constant514), kwargs = {}) | |
| %_param_constant515 : [#users=1] = get_attr[target=_param_constant515] | |
| %add_260 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_180, %_param_constant515), kwargs = {}) | |
| %convert_element_type_255 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_260, torch.float16), kwargs = {}) | |
| %_param_constant516 : [#users=1] = get_attr[target=_param_constant516] | |
| %t_167 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant516,), kwargs = {}) | |
| %view_438 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_255, [2048, 512]), kwargs = {}) | |
| %mm_75 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_438, %t_167), kwargs = {}) | |
| %_unsafe_view_188 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_75, [2, 1024, 512]), kwargs = {}) | |
| %view_439 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_188, [2, 1024, 8, 64]), kwargs = {}) | |
| %permute_125 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_439, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_112 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_125,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_189 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_112, [16, 1024, 64]), kwargs = {}) | |
| %_param_constant517 : [#users=1] = get_attr[target=_param_constant517] | |
| %t_168 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant517,), kwargs = {}) | |
| %view_440 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_76 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_440, %t_168), kwargs = {}) | |
| %_unsafe_view_190 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_76, [2, 77, 512]), kwargs = {}) | |
| %_param_constant518 : [#users=1] = get_attr[target=_param_constant518] | |
| %t_169 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant518,), kwargs = {}) | |
| %view_441 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_77 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_441, %t_169), kwargs = {}) | |
| %_unsafe_view_191 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_77, [2, 77, 512]), kwargs = {}) | |
| %view_442 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_190, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_126 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_442, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_113 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_126,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_192 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_113, [16, 77, 64]), kwargs = {}) | |
| %view_443 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_191, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_127 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_443, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_114 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_127,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_193 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_114, [16, 77, 64]), kwargs = {}) | |
| %empty_25 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 1024, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_25 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_192, -1, -2), kwargs = {}) | |
| %baddbmm_25 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_25, %_unsafe_view_189, %transpose_25), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_25 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_25, -1, False), kwargs = {}) | |
| %detach_115 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_25,), kwargs = {}) | |
| %bmm_38 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_25, %_unsafe_view_193), kwargs = {}) | |
| %view_444 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_38, [2, 8, 1024, 64]), kwargs = {}) | |
| %permute_128 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_444, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_115 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_128,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_194 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_115, [2, 1024, 512]), kwargs = {}) | |
| %_param_constant519 : [#users=1] = get_attr[target=_param_constant519] | |
| %t_170 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant519,), kwargs = {}) | |
| %view_445 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_194, [2048, 512]), kwargs = {}) | |
| %_param_constant520 : [#users=1] = get_attr[target=_param_constant520] | |
| %addmm_79 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant520, %view_445, %t_170), kwargs = {}) | |
| %view_446 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_79, [2, 1024, 512]), kwargs = {}) | |
| %add_261 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_446, %add_258), kwargs = {}) | |
| %convert_element_type_256 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_261, torch.float32), kwargs = {}) | |
| %var_mean_83 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_256, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_166 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_83, 0), kwargs = {}) | |
| %getitem_167 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_83, 1), kwargs = {}) | |
| %add_262 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_166, 1e-05), kwargs = {}) | |
| %rsqrt_83 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_262,), kwargs = {}) | |
| %sub_83 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_261, %getitem_167), kwargs = {}) | |
| %mul_181 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_83, %rsqrt_83), kwargs = {}) | |
| %_param_constant521 : [#users=1] = get_attr[target=_param_constant521] | |
| %mul_182 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_181, %_param_constant521), kwargs = {}) | |
| %_param_constant522 : [#users=1] = get_attr[target=_param_constant522] | |
| %add_263 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_182, %_param_constant522), kwargs = {}) | |
| %convert_element_type_257 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_263, torch.float16), kwargs = {}) | |
| %_param_constant523 : [#users=1] = get_attr[target=_param_constant523] | |
| %t_171 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant523,), kwargs = {}) | |
| %view_447 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_257, [2048, 512]), kwargs = {}) | |
| %_param_constant524 : [#users=1] = get_attr[target=_param_constant524] | |
| %addmm_80 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant524, %view_447, %t_171), kwargs = {}) | |
| %view_448 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_80, [2, 1024, 4096]), kwargs = {}) | |
| %slice_63 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_448, -1, 0, 2048), kwargs = {}) | |
| %slice_64 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_448, -1, 2048, 4096), kwargs = {}) | |
| %gelu_12 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_64,), kwargs = {}) | |
| %mul_183 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_63, %gelu_12), kwargs = {}) | |
| %_param_constant525 : [#users=1] = get_attr[target=_param_constant525] | |
| %t_172 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant525,), kwargs = {}) | |
| %view_449 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_183, [2048, 2048]), kwargs = {}) | |
| %_param_constant526 : [#users=1] = get_attr[target=_param_constant526] | |
| %addmm_81 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant526, %view_449, %t_172), kwargs = {}) | |
| %view_450 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_81, [2, 1024, 512]), kwargs = {}) | |
| %add_264 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_450, %add_261), kwargs = {}) | |
| %_param_constant527 : [#users=1] = get_attr[target=_param_constant527] | |
| %t_173 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant527,), kwargs = {}) | |
| %view_451 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_264, [2048, 512]), kwargs = {}) | |
| %_param_constant528 : [#users=1] = get_attr[target=_param_constant528] | |
| %addmm_82 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant528, %view_451, %t_173), kwargs = {}) | |
| %view_452 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_82, [2, 1024, 512]), kwargs = {}) | |
| %view_453 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_452, [2, 32, 32, 512]), kwargs = {}) | |
| %permute_129 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_453, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_116 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_129,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_265 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_116, %div_16), kwargs = {}) | |
| %upsample_nearest2d_1 : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%add_265, [64, 64], 2.0, 2.0), kwargs = {}) | |
| %_param_constant529 : [#users=1] = get_attr[target=_param_constant529] | |
| %_param_constant530 : [#users=1] = get_attr[target=_param_constant530] | |
| %convolution_45 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d_1, %_param_constant529, %_param_constant530, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %cat_8 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_45, %add_50], 1), kwargs = {}) | |
| %view_454 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_8, [2, 32, 32, 4096]), kwargs = {}) | |
| %convert_element_type_258 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_454, torch.float32), kwargs = {}) | |
| %var_mean_84 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_258, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_168 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_84, 0), kwargs = {}) | |
| %getitem_169 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_84, 1), kwargs = {}) | |
| %add_266 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_168, 1e-05), kwargs = {}) | |
| %rsqrt_84 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_266,), kwargs = {}) | |
| %sub_84 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_454, %getitem_169), kwargs = {}) | |
| %mul_184 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_84, %rsqrt_84), kwargs = {}) | |
| %view_455 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_184, [2, 1024, 64, 64]), kwargs = {}) | |
| %_param_constant531 : [#users=1] = get_attr[target=_param_constant531] | |
| %unsqueeze_304 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant531, 0), kwargs = {}) | |
| %unsqueeze_305 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_304, 2), kwargs = {}) | |
| %unsqueeze_306 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_305, 3), kwargs = {}) | |
| %_param_constant532 : [#users=1] = get_attr[target=_param_constant532] | |
| %unsqueeze_307 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant532, 0), kwargs = {}) | |
| %unsqueeze_308 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_307, 2), kwargs = {}) | |
| %unsqueeze_309 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_308, 3), kwargs = {}) | |
| %mul_185 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_455, %unsqueeze_309), kwargs = {}) | |
| %add_267 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_185, %unsqueeze_306), kwargs = {}) | |
| %convert_element_type_259 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_267, torch.float16), kwargs = {}) | |
| %convert_element_type_260 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_169, torch.float16), kwargs = {}) | |
| %convert_element_type_261 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_84, torch.float16), kwargs = {}) | |
| %squeeze_180 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_260, 3), kwargs = {}) | |
| %squeeze_181 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_180, 2), kwargs = {}) | |
| %squeeze_182 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_261, 3), kwargs = {}) | |
| %squeeze_183 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_182, 2), kwargs = {}) | |
| %detach_116 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_181,), kwargs = {}) | |
| %detach_117 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_183,), kwargs = {}) | |
| %silu_49 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_259,), kwargs = {}) | |
| %_param_constant533 : [#users=1] = get_attr[target=_param_constant533] | |
| %_param_constant534 : [#users=1] = get_attr[target=_param_constant534] | |
| %convolution_46 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_49, %_param_constant533, %_param_constant534, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_50 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant535 : [#users=1] = get_attr[target=_param_constant535] | |
| %t_174 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant535,), kwargs = {}) | |
| %_param_constant536 : [#users=1] = get_attr[target=_param_constant536] | |
| %addmm_83 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant536, %silu_50, %t_174), kwargs = {}) | |
| %slice_65 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_83, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_66 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_65, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_310 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_66, 2), kwargs = {}) | |
| %unsqueeze_311 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_310, 3), kwargs = {}) | |
| %add_268 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_46, %unsqueeze_311), kwargs = {}) | |
| %view_456 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_268, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_262 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_456, torch.float32), kwargs = {}) | |
| %var_mean_85 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_262, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_170 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_85, 0), kwargs = {}) | |
| %getitem_171 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_85, 1), kwargs = {}) | |
| %add_269 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_170, 1e-05), kwargs = {}) | |
| %rsqrt_85 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_269,), kwargs = {}) | |
| %sub_85 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_456, %getitem_171), kwargs = {}) | |
| %mul_186 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_85, %rsqrt_85), kwargs = {}) | |
| %view_457 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_186, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant537 : [#users=1] = get_attr[target=_param_constant537] | |
| %unsqueeze_312 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant537, 0), kwargs = {}) | |
| %unsqueeze_313 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_312, 2), kwargs = {}) | |
| %unsqueeze_314 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_313, 3), kwargs = {}) | |
| %_param_constant538 : [#users=1] = get_attr[target=_param_constant538] | |
| %unsqueeze_315 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant538, 0), kwargs = {}) | |
| %unsqueeze_316 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_315, 2), kwargs = {}) | |
| %unsqueeze_317 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_316, 3), kwargs = {}) | |
| %mul_187 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_457, %unsqueeze_317), kwargs = {}) | |
| %add_270 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_187, %unsqueeze_314), kwargs = {}) | |
| %convert_element_type_263 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_270, torch.float16), kwargs = {}) | |
| %convert_element_type_264 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_171, torch.float16), kwargs = {}) | |
| %convert_element_type_265 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_85, torch.float16), kwargs = {}) | |
| %squeeze_184 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_264, 3), kwargs = {}) | |
| %squeeze_185 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_184, 2), kwargs = {}) | |
| %squeeze_186 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_265, 3), kwargs = {}) | |
| %squeeze_187 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_186, 2), kwargs = {}) | |
| %detach_118 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_185,), kwargs = {}) | |
| %detach_119 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_187,), kwargs = {}) | |
| %silu_51 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_263,), kwargs = {}) | |
| %_param_constant539 : [#users=1] = get_attr[target=_param_constant539] | |
| %_param_constant540 : [#users=1] = get_attr[target=_param_constant540] | |
| %convolution_47 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_51, %_param_constant539, %_param_constant540, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant541 : [#users=1] = get_attr[target=_param_constant541] | |
| %_param_constant542 : [#users=1] = get_attr[target=_param_constant542] | |
| %convolution_48 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_8, %_param_constant541, %_param_constant542, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_271 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_48, %convolution_47), kwargs = {}) | |
| %div_17 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_271, 1.0), kwargs = {}) | |
| %view_458 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_17, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_266 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_458, torch.float32), kwargs = {}) | |
| %var_mean_86 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_266, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_172 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_86, 0), kwargs = {}) | |
| %getitem_173 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_86, 1), kwargs = {}) | |
| %add_272 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_172, 1e-06), kwargs = {}) | |
| %rsqrt_86 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_272,), kwargs = {}) | |
| %sub_86 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_458, %getitem_173), kwargs = {}) | |
| %mul_188 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_86, %rsqrt_86), kwargs = {}) | |
| %view_459 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_188, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant543 : [#users=1] = get_attr[target=_param_constant543] | |
| %unsqueeze_318 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant543, 0), kwargs = {}) | |
| %unsqueeze_319 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_318, 2), kwargs = {}) | |
| %unsqueeze_320 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_319, 3), kwargs = {}) | |
| %_param_constant544 : [#users=1] = get_attr[target=_param_constant544] | |
| %unsqueeze_321 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant544, 0), kwargs = {}) | |
| %unsqueeze_322 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_321, 2), kwargs = {}) | |
| %unsqueeze_323 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_322, 3), kwargs = {}) | |
| %mul_189 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_459, %unsqueeze_323), kwargs = {}) | |
| %add_273 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_189, %unsqueeze_320), kwargs = {}) | |
| %convert_element_type_267 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_273, torch.float16), kwargs = {}) | |
| %convert_element_type_268 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_173, torch.float16), kwargs = {}) | |
| %convert_element_type_269 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_86, torch.float16), kwargs = {}) | |
| %squeeze_188 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_268, 3), kwargs = {}) | |
| %squeeze_189 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_188, 2), kwargs = {}) | |
| %squeeze_190 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_269, 3), kwargs = {}) | |
| %squeeze_191 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_190, 2), kwargs = {}) | |
| %detach_120 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_189,), kwargs = {}) | |
| %detach_121 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_191,), kwargs = {}) | |
| %permute_130 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_267, [0, 2, 3, 1]), kwargs = {}) | |
| %view_460 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_130, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant545 : [#users=1] = get_attr[target=_param_constant545] | |
| %t_175 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant545,), kwargs = {}) | |
| %expand_27 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_460, [2, 4096, 512]), kwargs = {}) | |
| %view_461 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_27, [2, 4096, 512]), kwargs = {}) | |
| %expand_28 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_175, [2, 512, 512]), kwargs = {}) | |
| %view_462 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_28, [2, 512, 512]), kwargs = {}) | |
| %bmm_39 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_461, %view_462), kwargs = {}) | |
| %_unsafe_view_195 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_39, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant546 : [#users=1] = get_attr[target=_param_constant546] | |
| %add_274 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_195, %_param_constant546), kwargs = {}) | |
| %convert_element_type_270 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_274, torch.float32), kwargs = {}) | |
| %var_mean_87 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_270, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_174 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_87, 0), kwargs = {}) | |
| %getitem_175 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_87, 1), kwargs = {}) | |
| %add_275 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_174, 1e-05), kwargs = {}) | |
| %rsqrt_87 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_275,), kwargs = {}) | |
| %sub_87 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_274, %getitem_175), kwargs = {}) | |
| %mul_190 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_87, %rsqrt_87), kwargs = {}) | |
| %_param_constant547 : [#users=1] = get_attr[target=_param_constant547] | |
| %mul_191 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_190, %_param_constant547), kwargs = {}) | |
| %_param_constant548 : [#users=1] = get_attr[target=_param_constant548] | |
| %add_276 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_191, %_param_constant548), kwargs = {}) | |
| %convert_element_type_271 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_276, torch.float16), kwargs = {}) | |
| %_param_constant549 : [#users=1] = get_attr[target=_param_constant549] | |
| %t_176 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant549,), kwargs = {}) | |
| %view_463 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_271, [8192, 512]), kwargs = {}) | |
| %mm_78 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_463, %t_176), kwargs = {}) | |
| %_unsafe_view_196 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_78, [2, 4096, 512]), kwargs = {}) | |
| %view_464 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_196, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_131 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_464, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_117 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_131,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_197 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_117, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant550 : [#users=1] = get_attr[target=_param_constant550] | |
| %t_177 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant550,), kwargs = {}) | |
| %view_465 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_79 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_465, %t_177), kwargs = {}) | |
| %_unsafe_view_198 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_79, [2, 77, 512]), kwargs = {}) | |
| %_param_constant551 : [#users=1] = get_attr[target=_param_constant551] | |
| %t_178 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant551,), kwargs = {}) | |
| %view_466 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_80 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_466, %t_178), kwargs = {}) | |
| %_unsafe_view_199 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_80, [2, 77, 512]), kwargs = {}) | |
| %view_467 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_198, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_132 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_467, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_118 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_132,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_200 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_118, [16, 77, 64]), kwargs = {}) | |
| %view_468 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_199, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_133 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_468, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_119 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_133,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_201 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_119, [16, 77, 64]), kwargs = {}) | |
| %empty_26 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_26 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_200, -1, -2), kwargs = {}) | |
| %baddbmm_26 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_26, %_unsafe_view_197, %transpose_26), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_26 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_26, -1, False), kwargs = {}) | |
| %detach_122 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_26,), kwargs = {}) | |
| %bmm_40 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_26, %_unsafe_view_201), kwargs = {}) | |
| %view_469 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_40, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_134 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_469, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_120 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_134,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_202 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_120, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant552 : [#users=1] = get_attr[target=_param_constant552] | |
| %t_179 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant552,), kwargs = {}) | |
| %view_470 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_202, [8192, 512]), kwargs = {}) | |
| %_param_constant553 : [#users=1] = get_attr[target=_param_constant553] | |
| %addmm_84 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant553, %view_470, %t_179), kwargs = {}) | |
| %view_471 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_84, [2, 4096, 512]), kwargs = {}) | |
| %add_277 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_471, %add_274), kwargs = {}) | |
| %convert_element_type_272 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_277, torch.float32), kwargs = {}) | |
| %var_mean_88 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_272, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_176 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_88, 0), kwargs = {}) | |
| %getitem_177 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_88, 1), kwargs = {}) | |
| %add_278 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_176, 1e-05), kwargs = {}) | |
| %rsqrt_88 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_278,), kwargs = {}) | |
| %sub_88 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_277, %getitem_177), kwargs = {}) | |
| %mul_192 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_88, %rsqrt_88), kwargs = {}) | |
| %_param_constant554 : [#users=1] = get_attr[target=_param_constant554] | |
| %mul_193 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_192, %_param_constant554), kwargs = {}) | |
| %_param_constant555 : [#users=1] = get_attr[target=_param_constant555] | |
| %add_279 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_193, %_param_constant555), kwargs = {}) | |
| %convert_element_type_273 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_279, torch.float16), kwargs = {}) | |
| %_param_constant556 : [#users=1] = get_attr[target=_param_constant556] | |
| %t_180 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant556,), kwargs = {}) | |
| %view_472 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_273, [8192, 512]), kwargs = {}) | |
| %mm_81 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_472, %t_180), kwargs = {}) | |
| %_unsafe_view_203 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_81, [2, 4096, 512]), kwargs = {}) | |
| %view_473 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_203, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_135 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_473, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_121 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_135,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_204 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_121, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant557 : [#users=1] = get_attr[target=_param_constant557] | |
| %t_181 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant557,), kwargs = {}) | |
| %view_474 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_82 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_474, %t_181), kwargs = {}) | |
| %_unsafe_view_205 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_82, [2, 77, 512]), kwargs = {}) | |
| %_param_constant558 : [#users=1] = get_attr[target=_param_constant558] | |
| %t_182 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant558,), kwargs = {}) | |
| %view_475 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_83 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_475, %t_182), kwargs = {}) | |
| %_unsafe_view_206 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_83, [2, 77, 512]), kwargs = {}) | |
| %view_476 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_205, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_136 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_476, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_122 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_136,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_207 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_122, [16, 77, 64]), kwargs = {}) | |
| %view_477 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_206, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_137 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_477, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_123 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_137,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_208 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_123, [16, 77, 64]), kwargs = {}) | |
| %empty_27 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_27 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_207, -1, -2), kwargs = {}) | |
| %baddbmm_27 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_27, %_unsafe_view_204, %transpose_27), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_27 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_27, -1, False), kwargs = {}) | |
| %detach_123 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_27,), kwargs = {}) | |
| %bmm_41 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_27, %_unsafe_view_208), kwargs = {}) | |
| %view_478 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_41, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_138 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_478, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_124 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_138,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_209 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_124, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant559 : [#users=1] = get_attr[target=_param_constant559] | |
| %t_183 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant559,), kwargs = {}) | |
| %view_479 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_209, [8192, 512]), kwargs = {}) | |
| %_param_constant560 : [#users=1] = get_attr[target=_param_constant560] | |
| %addmm_85 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant560, %view_479, %t_183), kwargs = {}) | |
| %view_480 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_85, [2, 4096, 512]), kwargs = {}) | |
| %add_280 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_480, %add_277), kwargs = {}) | |
| %convert_element_type_274 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_280, torch.float32), kwargs = {}) | |
| %var_mean_89 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_274, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_178 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_89, 0), kwargs = {}) | |
| %getitem_179 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_89, 1), kwargs = {}) | |
| %add_281 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_178, 1e-05), kwargs = {}) | |
| %rsqrt_89 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_281,), kwargs = {}) | |
| %sub_89 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_280, %getitem_179), kwargs = {}) | |
| %mul_194 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_89, %rsqrt_89), kwargs = {}) | |
| %_param_constant561 : [#users=1] = get_attr[target=_param_constant561] | |
| %mul_195 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_194, %_param_constant561), kwargs = {}) | |
| %_param_constant562 : [#users=1] = get_attr[target=_param_constant562] | |
| %add_282 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_195, %_param_constant562), kwargs = {}) | |
| %convert_element_type_275 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_282, torch.float16), kwargs = {}) | |
| %_param_constant563 : [#users=1] = get_attr[target=_param_constant563] | |
| %t_184 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant563,), kwargs = {}) | |
| %view_481 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_275, [8192, 512]), kwargs = {}) | |
| %_param_constant564 : [#users=1] = get_attr[target=_param_constant564] | |
| %addmm_86 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant564, %view_481, %t_184), kwargs = {}) | |
| %view_482 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_86, [2, 4096, 4096]), kwargs = {}) | |
| %slice_67 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_482, -1, 0, 2048), kwargs = {}) | |
| %slice_68 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_482, -1, 2048, 4096), kwargs = {}) | |
| %gelu_13 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_68,), kwargs = {}) | |
| %mul_196 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_67, %gelu_13), kwargs = {}) | |
| %_param_constant565 : [#users=1] = get_attr[target=_param_constant565] | |
| %t_185 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant565,), kwargs = {}) | |
| %view_483 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_196, [8192, 2048]), kwargs = {}) | |
| %_param_constant566 : [#users=1] = get_attr[target=_param_constant566] | |
| %addmm_87 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant566, %view_483, %t_185), kwargs = {}) | |
| %view_484 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_87, [2, 4096, 512]), kwargs = {}) | |
| %add_283 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_484, %add_280), kwargs = {}) | |
| %_param_constant567 : [#users=1] = get_attr[target=_param_constant567] | |
| %t_186 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant567,), kwargs = {}) | |
| %view_485 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_283, [8192, 512]), kwargs = {}) | |
| %_param_constant568 : [#users=1] = get_attr[target=_param_constant568] | |
| %addmm_88 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant568, %view_485, %t_186), kwargs = {}) | |
| %view_486 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_88, [2, 4096, 512]), kwargs = {}) | |
| %view_487 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_486, [2, 64, 64, 512]), kwargs = {}) | |
| %permute_139 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_487, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_125 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_139,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_284 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_125, %div_17), kwargs = {}) | |
| %cat_9 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_284, %add_31], 1), kwargs = {}) | |
| %view_488 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_9, [2, 32, 32, 4096]), kwargs = {}) | |
| %convert_element_type_276 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_488, torch.float32), kwargs = {}) | |
| %var_mean_90 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_276, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_180 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_90, 0), kwargs = {}) | |
| %getitem_181 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_90, 1), kwargs = {}) | |
| %add_285 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_180, 1e-05), kwargs = {}) | |
| %rsqrt_90 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_285,), kwargs = {}) | |
| %sub_90 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_488, %getitem_181), kwargs = {}) | |
| %mul_197 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_90, %rsqrt_90), kwargs = {}) | |
| %view_489 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_197, [2, 1024, 64, 64]), kwargs = {}) | |
| %_param_constant569 : [#users=1] = get_attr[target=_param_constant569] | |
| %unsqueeze_324 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant569, 0), kwargs = {}) | |
| %unsqueeze_325 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_324, 2), kwargs = {}) | |
| %unsqueeze_326 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_325, 3), kwargs = {}) | |
| %_param_constant570 : [#users=1] = get_attr[target=_param_constant570] | |
| %unsqueeze_327 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant570, 0), kwargs = {}) | |
| %unsqueeze_328 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_327, 2), kwargs = {}) | |
| %unsqueeze_329 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_328, 3), kwargs = {}) | |
| %mul_198 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_489, %unsqueeze_329), kwargs = {}) | |
| %add_286 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_198, %unsqueeze_326), kwargs = {}) | |
| %convert_element_type_277 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_286, torch.float16), kwargs = {}) | |
| %convert_element_type_278 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_181, torch.float16), kwargs = {}) | |
| %convert_element_type_279 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_90, torch.float16), kwargs = {}) | |
| %squeeze_192 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_278, 3), kwargs = {}) | |
| %squeeze_193 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_192, 2), kwargs = {}) | |
| %squeeze_194 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_279, 3), kwargs = {}) | |
| %squeeze_195 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_194, 2), kwargs = {}) | |
| %detach_124 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_193,), kwargs = {}) | |
| %detach_125 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_195,), kwargs = {}) | |
| %silu_52 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_277,), kwargs = {}) | |
| %_param_constant571 : [#users=1] = get_attr[target=_param_constant571] | |
| %_param_constant572 : [#users=1] = get_attr[target=_param_constant572] | |
| %convolution_49 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_52, %_param_constant571, %_param_constant572, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_53 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant573 : [#users=1] = get_attr[target=_param_constant573] | |
| %t_187 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant573,), kwargs = {}) | |
| %_param_constant574 : [#users=1] = get_attr[target=_param_constant574] | |
| %addmm_89 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant574, %silu_53, %t_187), kwargs = {}) | |
| %slice_69 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_89, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_70 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_69, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_330 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_70, 2), kwargs = {}) | |
| %unsqueeze_331 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_330, 3), kwargs = {}) | |
| %add_287 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_49, %unsqueeze_331), kwargs = {}) | |
| %view_490 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_287, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_280 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_490, torch.float32), kwargs = {}) | |
| %var_mean_91 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_280, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_182 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_91, 0), kwargs = {}) | |
| %getitem_183 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_91, 1), kwargs = {}) | |
| %add_288 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_182, 1e-05), kwargs = {}) | |
| %rsqrt_91 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_288,), kwargs = {}) | |
| %sub_91 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_490, %getitem_183), kwargs = {}) | |
| %mul_199 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_91, %rsqrt_91), kwargs = {}) | |
| %view_491 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_199, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant575 : [#users=1] = get_attr[target=_param_constant575] | |
| %unsqueeze_332 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant575, 0), kwargs = {}) | |
| %unsqueeze_333 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_332, 2), kwargs = {}) | |
| %unsqueeze_334 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_333, 3), kwargs = {}) | |
| %_param_constant576 : [#users=1] = get_attr[target=_param_constant576] | |
| %unsqueeze_335 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant576, 0), kwargs = {}) | |
| %unsqueeze_336 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_335, 2), kwargs = {}) | |
| %unsqueeze_337 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_336, 3), kwargs = {}) | |
| %mul_200 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_491, %unsqueeze_337), kwargs = {}) | |
| %add_289 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_200, %unsqueeze_334), kwargs = {}) | |
| %convert_element_type_281 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_289, torch.float16), kwargs = {}) | |
| %convert_element_type_282 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_183, torch.float16), kwargs = {}) | |
| %convert_element_type_283 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_91, torch.float16), kwargs = {}) | |
| %squeeze_196 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_282, 3), kwargs = {}) | |
| %squeeze_197 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_196, 2), kwargs = {}) | |
| %squeeze_198 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_283, 3), kwargs = {}) | |
| %squeeze_199 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_198, 2), kwargs = {}) | |
| %detach_126 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_197,), kwargs = {}) | |
| %detach_127 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_199,), kwargs = {}) | |
| %silu_54 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_281,), kwargs = {}) | |
| %_param_constant577 : [#users=1] = get_attr[target=_param_constant577] | |
| %_param_constant578 : [#users=1] = get_attr[target=_param_constant578] | |
| %convolution_50 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_54, %_param_constant577, %_param_constant578, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant579 : [#users=1] = get_attr[target=_param_constant579] | |
| %_param_constant580 : [#users=1] = get_attr[target=_param_constant580] | |
| %convolution_51 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_9, %_param_constant579, %_param_constant580, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_290 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_51, %convolution_50), kwargs = {}) | |
| %div_18 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_290, 1.0), kwargs = {}) | |
| %view_492 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_18, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_284 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_492, torch.float32), kwargs = {}) | |
| %var_mean_92 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_284, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_184 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_92, 0), kwargs = {}) | |
| %getitem_185 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_92, 1), kwargs = {}) | |
| %add_291 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_184, 1e-06), kwargs = {}) | |
| %rsqrt_92 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_291,), kwargs = {}) | |
| %sub_92 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_492, %getitem_185), kwargs = {}) | |
| %mul_201 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_92, %rsqrt_92), kwargs = {}) | |
| %view_493 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_201, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant581 : [#users=1] = get_attr[target=_param_constant581] | |
| %unsqueeze_338 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant581, 0), kwargs = {}) | |
| %unsqueeze_339 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_338, 2), kwargs = {}) | |
| %unsqueeze_340 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_339, 3), kwargs = {}) | |
| %_param_constant582 : [#users=1] = get_attr[target=_param_constant582] | |
| %unsqueeze_341 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant582, 0), kwargs = {}) | |
| %unsqueeze_342 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_341, 2), kwargs = {}) | |
| %unsqueeze_343 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_342, 3), kwargs = {}) | |
| %mul_202 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_493, %unsqueeze_343), kwargs = {}) | |
| %add_292 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_202, %unsqueeze_340), kwargs = {}) | |
| %convert_element_type_285 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_292, torch.float16), kwargs = {}) | |
| %convert_element_type_286 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_185, torch.float16), kwargs = {}) | |
| %convert_element_type_287 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_92, torch.float16), kwargs = {}) | |
| %squeeze_200 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_286, 3), kwargs = {}) | |
| %squeeze_201 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_200, 2), kwargs = {}) | |
| %squeeze_202 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_287, 3), kwargs = {}) | |
| %squeeze_203 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_202, 2), kwargs = {}) | |
| %detach_128 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_201,), kwargs = {}) | |
| %detach_129 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_203,), kwargs = {}) | |
| %permute_140 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_285, [0, 2, 3, 1]), kwargs = {}) | |
| %view_494 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_140, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant583 : [#users=1] = get_attr[target=_param_constant583] | |
| %t_188 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant583,), kwargs = {}) | |
| %expand_29 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_494, [2, 4096, 512]), kwargs = {}) | |
| %view_495 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_29, [2, 4096, 512]), kwargs = {}) | |
| %expand_30 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_188, [2, 512, 512]), kwargs = {}) | |
| %view_496 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_30, [2, 512, 512]), kwargs = {}) | |
| %bmm_42 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_495, %view_496), kwargs = {}) | |
| %_unsafe_view_210 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_42, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant584 : [#users=1] = get_attr[target=_param_constant584] | |
| %add_293 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_210, %_param_constant584), kwargs = {}) | |
| %convert_element_type_288 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_293, torch.float32), kwargs = {}) | |
| %var_mean_93 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_288, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_186 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_93, 0), kwargs = {}) | |
| %getitem_187 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_93, 1), kwargs = {}) | |
| %add_294 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_186, 1e-05), kwargs = {}) | |
| %rsqrt_93 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_294,), kwargs = {}) | |
| %sub_93 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_293, %getitem_187), kwargs = {}) | |
| %mul_203 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_93, %rsqrt_93), kwargs = {}) | |
| %_param_constant585 : [#users=1] = get_attr[target=_param_constant585] | |
| %mul_204 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_203, %_param_constant585), kwargs = {}) | |
| %_param_constant586 : [#users=1] = get_attr[target=_param_constant586] | |
| %add_295 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_204, %_param_constant586), kwargs = {}) | |
| %convert_element_type_289 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_295, torch.float16), kwargs = {}) | |
| %_param_constant587 : [#users=1] = get_attr[target=_param_constant587] | |
| %t_189 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant587,), kwargs = {}) | |
| %view_497 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_289, [8192, 512]), kwargs = {}) | |
| %mm_84 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_497, %t_189), kwargs = {}) | |
| %_unsafe_view_211 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_84, [2, 4096, 512]), kwargs = {}) | |
| %view_498 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_211, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_141 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_498, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_126 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_141,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_212 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_126, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant588 : [#users=1] = get_attr[target=_param_constant588] | |
| %t_190 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant588,), kwargs = {}) | |
| %view_499 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_85 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_499, %t_190), kwargs = {}) | |
| %_unsafe_view_213 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_85, [2, 77, 512]), kwargs = {}) | |
| %_param_constant589 : [#users=1] = get_attr[target=_param_constant589] | |
| %t_191 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant589,), kwargs = {}) | |
| %view_500 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_86 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_500, %t_191), kwargs = {}) | |
| %_unsafe_view_214 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_86, [2, 77, 512]), kwargs = {}) | |
| %view_501 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_213, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_142 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_501, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_127 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_142,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_215 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_127, [16, 77, 64]), kwargs = {}) | |
| %view_502 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_214, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_143 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_502, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_128 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_143,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_216 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_128, [16, 77, 64]), kwargs = {}) | |
| %empty_28 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_28 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_215, -1, -2), kwargs = {}) | |
| %baddbmm_28 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_28, %_unsafe_view_212, %transpose_28), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_28 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_28, -1, False), kwargs = {}) | |
| %detach_130 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_28,), kwargs = {}) | |
| %bmm_43 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_28, %_unsafe_view_216), kwargs = {}) | |
| %view_503 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_43, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_144 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_503, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_129 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_144,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_217 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_129, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant590 : [#users=1] = get_attr[target=_param_constant590] | |
| %t_192 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant590,), kwargs = {}) | |
| %view_504 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_217, [8192, 512]), kwargs = {}) | |
| %_param_constant591 : [#users=1] = get_attr[target=_param_constant591] | |
| %addmm_90 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant591, %view_504, %t_192), kwargs = {}) | |
| %view_505 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_90, [2, 4096, 512]), kwargs = {}) | |
| %add_296 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_505, %add_293), kwargs = {}) | |
| %convert_element_type_290 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_296, torch.float32), kwargs = {}) | |
| %var_mean_94 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_290, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_188 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_94, 0), kwargs = {}) | |
| %getitem_189 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_94, 1), kwargs = {}) | |
| %add_297 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_188, 1e-05), kwargs = {}) | |
| %rsqrt_94 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_297,), kwargs = {}) | |
| %sub_94 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_296, %getitem_189), kwargs = {}) | |
| %mul_205 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_94, %rsqrt_94), kwargs = {}) | |
| %_param_constant592 : [#users=1] = get_attr[target=_param_constant592] | |
| %mul_206 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_205, %_param_constant592), kwargs = {}) | |
| %_param_constant593 : [#users=1] = get_attr[target=_param_constant593] | |
| %add_298 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_206, %_param_constant593), kwargs = {}) | |
| %convert_element_type_291 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_298, torch.float16), kwargs = {}) | |
| %_param_constant594 : [#users=1] = get_attr[target=_param_constant594] | |
| %t_193 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant594,), kwargs = {}) | |
| %view_506 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_291, [8192, 512]), kwargs = {}) | |
| %mm_87 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_506, %t_193), kwargs = {}) | |
| %_unsafe_view_218 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_87, [2, 4096, 512]), kwargs = {}) | |
| %view_507 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_218, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_145 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_507, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_130 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_145,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_219 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_130, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant595 : [#users=1] = get_attr[target=_param_constant595] | |
| %t_194 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant595,), kwargs = {}) | |
| %view_508 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_88 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_508, %t_194), kwargs = {}) | |
| %_unsafe_view_220 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_88, [2, 77, 512]), kwargs = {}) | |
| %_param_constant596 : [#users=1] = get_attr[target=_param_constant596] | |
| %t_195 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant596,), kwargs = {}) | |
| %view_509 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_89 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_509, %t_195), kwargs = {}) | |
| %_unsafe_view_221 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_89, [2, 77, 512]), kwargs = {}) | |
| %view_510 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_220, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_146 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_510, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_131 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_146,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_222 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_131, [16, 77, 64]), kwargs = {}) | |
| %view_511 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_221, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_147 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_511, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_132 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_147,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_223 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_132, [16, 77, 64]), kwargs = {}) | |
| %empty_29 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_29 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_222, -1, -2), kwargs = {}) | |
| %baddbmm_29 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_29, %_unsafe_view_219, %transpose_29), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_29 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_29, -1, False), kwargs = {}) | |
| %detach_131 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_29,), kwargs = {}) | |
| %bmm_44 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_29, %_unsafe_view_223), kwargs = {}) | |
| %view_512 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_44, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_148 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_512, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_133 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_148,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_224 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_133, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant597 : [#users=1] = get_attr[target=_param_constant597] | |
| %t_196 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant597,), kwargs = {}) | |
| %view_513 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_224, [8192, 512]), kwargs = {}) | |
| %_param_constant598 : [#users=1] = get_attr[target=_param_constant598] | |
| %addmm_91 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant598, %view_513, %t_196), kwargs = {}) | |
| %view_514 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_91, [2, 4096, 512]), kwargs = {}) | |
| %add_299 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_514, %add_296), kwargs = {}) | |
| %convert_element_type_292 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_299, torch.float32), kwargs = {}) | |
| %var_mean_95 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_292, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_190 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_95, 0), kwargs = {}) | |
| %getitem_191 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_95, 1), kwargs = {}) | |
| %add_300 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_190, 1e-05), kwargs = {}) | |
| %rsqrt_95 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_300,), kwargs = {}) | |
| %sub_95 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_299, %getitem_191), kwargs = {}) | |
| %mul_207 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_95, %rsqrt_95), kwargs = {}) | |
| %_param_constant599 : [#users=1] = get_attr[target=_param_constant599] | |
| %mul_208 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_207, %_param_constant599), kwargs = {}) | |
| %_param_constant600 : [#users=1] = get_attr[target=_param_constant600] | |
| %add_301 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_208, %_param_constant600), kwargs = {}) | |
| %convert_element_type_293 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_301, torch.float16), kwargs = {}) | |
| %_param_constant601 : [#users=1] = get_attr[target=_param_constant601] | |
| %t_197 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant601,), kwargs = {}) | |
| %view_515 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_293, [8192, 512]), kwargs = {}) | |
| %_param_constant602 : [#users=1] = get_attr[target=_param_constant602] | |
| %addmm_92 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant602, %view_515, %t_197), kwargs = {}) | |
| %view_516 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_92, [2, 4096, 4096]), kwargs = {}) | |
| %slice_71 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_516, -1, 0, 2048), kwargs = {}) | |
| %slice_72 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_516, -1, 2048, 4096), kwargs = {}) | |
| %gelu_14 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_72,), kwargs = {}) | |
| %mul_209 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_71, %gelu_14), kwargs = {}) | |
| %_param_constant603 : [#users=1] = get_attr[target=_param_constant603] | |
| %t_198 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant603,), kwargs = {}) | |
| %view_517 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_209, [8192, 2048]), kwargs = {}) | |
| %_param_constant604 : [#users=1] = get_attr[target=_param_constant604] | |
| %addmm_93 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant604, %view_517, %t_198), kwargs = {}) | |
| %view_518 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_93, [2, 4096, 512]), kwargs = {}) | |
| %add_302 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_518, %add_299), kwargs = {}) | |
| %_param_constant605 : [#users=1] = get_attr[target=_param_constant605] | |
| %t_199 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant605,), kwargs = {}) | |
| %view_519 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_302, [8192, 512]), kwargs = {}) | |
| %_param_constant606 : [#users=1] = get_attr[target=_param_constant606] | |
| %addmm_94 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant606, %view_519, %t_199), kwargs = {}) | |
| %view_520 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_94, [2, 4096, 512]), kwargs = {}) | |
| %view_521 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_520, [2, 64, 64, 512]), kwargs = {}) | |
| %permute_149 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_521, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_134 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_149,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_303 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_134, %div_18), kwargs = {}) | |
| %cat_10 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_303, %convolution_5], 1), kwargs = {}) | |
| %view_522 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_10, [2, 32, 24, 4096]), kwargs = {}) | |
| %convert_element_type_294 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_522, torch.float32), kwargs = {}) | |
| %var_mean_96 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_294, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_192 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_96, 0), kwargs = {}) | |
| %getitem_193 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_96, 1), kwargs = {}) | |
| %add_304 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_192, 1e-05), kwargs = {}) | |
| %rsqrt_96 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_304,), kwargs = {}) | |
| %sub_96 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_522, %getitem_193), kwargs = {}) | |
| %mul_210 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_96, %rsqrt_96), kwargs = {}) | |
| %view_523 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_210, [2, 768, 64, 64]), kwargs = {}) | |
| %_param_constant607 : [#users=1] = get_attr[target=_param_constant607] | |
| %unsqueeze_344 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant607, 0), kwargs = {}) | |
| %unsqueeze_345 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_344, 2), kwargs = {}) | |
| %unsqueeze_346 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_345, 3), kwargs = {}) | |
| %_param_constant608 : [#users=1] = get_attr[target=_param_constant608] | |
| %unsqueeze_347 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant608, 0), kwargs = {}) | |
| %unsqueeze_348 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_347, 2), kwargs = {}) | |
| %unsqueeze_349 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_348, 3), kwargs = {}) | |
| %mul_211 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_523, %unsqueeze_349), kwargs = {}) | |
| %add_305 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_211, %unsqueeze_346), kwargs = {}) | |
| %convert_element_type_295 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_305, torch.float16), kwargs = {}) | |
| %convert_element_type_296 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_193, torch.float16), kwargs = {}) | |
| %convert_element_type_297 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_96, torch.float16), kwargs = {}) | |
| %squeeze_204 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_296, 3), kwargs = {}) | |
| %squeeze_205 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_204, 2), kwargs = {}) | |
| %squeeze_206 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_297, 3), kwargs = {}) | |
| %squeeze_207 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_206, 2), kwargs = {}) | |
| %detach_132 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_205,), kwargs = {}) | |
| %detach_133 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_207,), kwargs = {}) | |
| %silu_55 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_295,), kwargs = {}) | |
| %_param_constant609 : [#users=1] = get_attr[target=_param_constant609] | |
| %_param_constant610 : [#users=1] = get_attr[target=_param_constant610] | |
| %convolution_52 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_55, %_param_constant609, %_param_constant610, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_56 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant611 : [#users=1] = get_attr[target=_param_constant611] | |
| %t_200 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant611,), kwargs = {}) | |
| %_param_constant612 : [#users=1] = get_attr[target=_param_constant612] | |
| %addmm_95 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant612, %silu_56, %t_200), kwargs = {}) | |
| %slice_73 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_95, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_74 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_73, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_350 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_74, 2), kwargs = {}) | |
| %unsqueeze_351 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_350, 3), kwargs = {}) | |
| %add_306 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_52, %unsqueeze_351), kwargs = {}) | |
| %view_524 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_306, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_298 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_524, torch.float32), kwargs = {}) | |
| %var_mean_97 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_298, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_194 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_97, 0), kwargs = {}) | |
| %getitem_195 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_97, 1), kwargs = {}) | |
| %add_307 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_194, 1e-05), kwargs = {}) | |
| %rsqrt_97 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_307,), kwargs = {}) | |
| %sub_97 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_524, %getitem_195), kwargs = {}) | |
| %mul_212 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_97, %rsqrt_97), kwargs = {}) | |
| %view_525 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_212, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant613 : [#users=1] = get_attr[target=_param_constant613] | |
| %unsqueeze_352 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant613, 0), kwargs = {}) | |
| %unsqueeze_353 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_352, 2), kwargs = {}) | |
| %unsqueeze_354 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_353, 3), kwargs = {}) | |
| %_param_constant614 : [#users=1] = get_attr[target=_param_constant614] | |
| %unsqueeze_355 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant614, 0), kwargs = {}) | |
| %unsqueeze_356 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_355, 2), kwargs = {}) | |
| %unsqueeze_357 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_356, 3), kwargs = {}) | |
| %mul_213 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_525, %unsqueeze_357), kwargs = {}) | |
| %add_308 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_213, %unsqueeze_354), kwargs = {}) | |
| %convert_element_type_299 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_308, torch.float16), kwargs = {}) | |
| %convert_element_type_300 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_195, torch.float16), kwargs = {}) | |
| %convert_element_type_301 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_97, torch.float16), kwargs = {}) | |
| %squeeze_208 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_300, 3), kwargs = {}) | |
| %squeeze_209 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_208, 2), kwargs = {}) | |
| %squeeze_210 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_301, 3), kwargs = {}) | |
| %squeeze_211 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_210, 2), kwargs = {}) | |
| %detach_134 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_209,), kwargs = {}) | |
| %detach_135 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_211,), kwargs = {}) | |
| %silu_57 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_299,), kwargs = {}) | |
| %_param_constant615 : [#users=1] = get_attr[target=_param_constant615] | |
| %_param_constant616 : [#users=1] = get_attr[target=_param_constant616] | |
| %convolution_53 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_57, %_param_constant615, %_param_constant616, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant617 : [#users=1] = get_attr[target=_param_constant617] | |
| %_param_constant618 : [#users=1] = get_attr[target=_param_constant618] | |
| %convolution_54 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_10, %_param_constant617, %_param_constant618, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_309 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_54, %convolution_53), kwargs = {}) | |
| %div_19 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_309, 1.0), kwargs = {}) | |
| %view_526 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_19, [2, 32, 16, 4096]), kwargs = {}) | |
| %convert_element_type_302 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_526, torch.float32), kwargs = {}) | |
| %var_mean_98 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_302, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_196 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_98, 0), kwargs = {}) | |
| %getitem_197 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_98, 1), kwargs = {}) | |
| %add_310 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_196, 1e-06), kwargs = {}) | |
| %rsqrt_98 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_310,), kwargs = {}) | |
| %sub_98 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_526, %getitem_197), kwargs = {}) | |
| %mul_214 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_98, %rsqrt_98), kwargs = {}) | |
| %view_527 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_214, [2, 512, 64, 64]), kwargs = {}) | |
| %_param_constant619 : [#users=1] = get_attr[target=_param_constant619] | |
| %unsqueeze_358 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant619, 0), kwargs = {}) | |
| %unsqueeze_359 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_358, 2), kwargs = {}) | |
| %unsqueeze_360 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_359, 3), kwargs = {}) | |
| %_param_constant620 : [#users=1] = get_attr[target=_param_constant620] | |
| %unsqueeze_361 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant620, 0), kwargs = {}) | |
| %unsqueeze_362 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_361, 2), kwargs = {}) | |
| %unsqueeze_363 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_362, 3), kwargs = {}) | |
| %mul_215 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_527, %unsqueeze_363), kwargs = {}) | |
| %add_311 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_215, %unsqueeze_360), kwargs = {}) | |
| %convert_element_type_303 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_311, torch.float16), kwargs = {}) | |
| %convert_element_type_304 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_197, torch.float16), kwargs = {}) | |
| %convert_element_type_305 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_98, torch.float16), kwargs = {}) | |
| %squeeze_212 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_304, 3), kwargs = {}) | |
| %squeeze_213 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_212, 2), kwargs = {}) | |
| %squeeze_214 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_305, 3), kwargs = {}) | |
| %squeeze_215 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_214, 2), kwargs = {}) | |
| %detach_136 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_213,), kwargs = {}) | |
| %detach_137 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_215,), kwargs = {}) | |
| %permute_150 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%convert_element_type_303, [0, 2, 3, 1]), kwargs = {}) | |
| %view_528 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_150, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant621 : [#users=1] = get_attr[target=_param_constant621] | |
| %t_201 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant621,), kwargs = {}) | |
| %expand_31 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_528, [2, 4096, 512]), kwargs = {}) | |
| %view_529 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_31, [2, 4096, 512]), kwargs = {}) | |
| %expand_32 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%t_201, [2, 512, 512]), kwargs = {}) | |
| %view_530 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%expand_32, [2, 512, 512]), kwargs = {}) | |
| %bmm_45 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_529, %view_530), kwargs = {}) | |
| %_unsafe_view_225 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%bmm_45, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant622 : [#users=1] = get_attr[target=_param_constant622] | |
| %add_312 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_225, %_param_constant622), kwargs = {}) | |
| %convert_element_type_306 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_312, torch.float32), kwargs = {}) | |
| %var_mean_99 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_306, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_198 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_99, 0), kwargs = {}) | |
| %getitem_199 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_99, 1), kwargs = {}) | |
| %add_313 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_198, 1e-05), kwargs = {}) | |
| %rsqrt_99 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_313,), kwargs = {}) | |
| %sub_99 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_312, %getitem_199), kwargs = {}) | |
| %mul_216 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_99, %rsqrt_99), kwargs = {}) | |
| %_param_constant623 : [#users=1] = get_attr[target=_param_constant623] | |
| %mul_217 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_216, %_param_constant623), kwargs = {}) | |
| %_param_constant624 : [#users=1] = get_attr[target=_param_constant624] | |
| %add_314 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_217, %_param_constant624), kwargs = {}) | |
| %convert_element_type_307 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_314, torch.float16), kwargs = {}) | |
| %_param_constant625 : [#users=1] = get_attr[target=_param_constant625] | |
| %t_202 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant625,), kwargs = {}) | |
| %view_531 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_307, [8192, 512]), kwargs = {}) | |
| %mm_90 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_531, %t_202), kwargs = {}) | |
| %_unsafe_view_226 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_90, [2, 4096, 512]), kwargs = {}) | |
| %view_532 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_226, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_151 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_532, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_135 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_151,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_227 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_135, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant626 : [#users=1] = get_attr[target=_param_constant626] | |
| %t_203 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant626,), kwargs = {}) | |
| %view_533 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_91 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_533, %t_203), kwargs = {}) | |
| %_unsafe_view_228 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_91, [2, 77, 512]), kwargs = {}) | |
| %_param_constant627 : [#users=1] = get_attr[target=_param_constant627] | |
| %t_204 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant627,), kwargs = {}) | |
| %view_534 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_92 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_534, %t_204), kwargs = {}) | |
| %_unsafe_view_229 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_92, [2, 77, 512]), kwargs = {}) | |
| %view_535 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_228, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_152 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_535, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_136 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_152,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_230 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_136, [16, 77, 64]), kwargs = {}) | |
| %view_536 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_229, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_153 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_536, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_137 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_153,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_231 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_137, [16, 77, 64]), kwargs = {}) | |
| %empty_30 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_30 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_230, -1, -2), kwargs = {}) | |
| %baddbmm_30 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_30, %_unsafe_view_227, %transpose_30), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_30 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_30, -1, False), kwargs = {}) | |
| %detach_138 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_30,), kwargs = {}) | |
| %bmm_46 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_30, %_unsafe_view_231), kwargs = {}) | |
| %view_537 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_46, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_154 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_537, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_138 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_154,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_232 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_138, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant628 : [#users=1] = get_attr[target=_param_constant628] | |
| %t_205 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant628,), kwargs = {}) | |
| %view_538 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_232, [8192, 512]), kwargs = {}) | |
| %_param_constant629 : [#users=1] = get_attr[target=_param_constant629] | |
| %addmm_96 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant629, %view_538, %t_205), kwargs = {}) | |
| %view_539 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_96, [2, 4096, 512]), kwargs = {}) | |
| %add_315 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_539, %add_312), kwargs = {}) | |
| %convert_element_type_308 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_315, torch.float32), kwargs = {}) | |
| %var_mean_100 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_308, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_200 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_100, 0), kwargs = {}) | |
| %getitem_201 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_100, 1), kwargs = {}) | |
| %add_316 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_200, 1e-05), kwargs = {}) | |
| %rsqrt_100 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_316,), kwargs = {}) | |
| %sub_100 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_315, %getitem_201), kwargs = {}) | |
| %mul_218 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_100, %rsqrt_100), kwargs = {}) | |
| %_param_constant630 : [#users=1] = get_attr[target=_param_constant630] | |
| %mul_219 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_218, %_param_constant630), kwargs = {}) | |
| %_param_constant631 : [#users=1] = get_attr[target=_param_constant631] | |
| %add_317 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_219, %_param_constant631), kwargs = {}) | |
| %convert_element_type_309 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_317, torch.float16), kwargs = {}) | |
| %_param_constant632 : [#users=1] = get_attr[target=_param_constant632] | |
| %t_206 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant632,), kwargs = {}) | |
| %view_540 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_309, [8192, 512]), kwargs = {}) | |
| %mm_93 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_540, %t_206), kwargs = {}) | |
| %_unsafe_view_233 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_93, [2, 4096, 512]), kwargs = {}) | |
| %view_541 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_233, [2, 4096, 8, 64]), kwargs = {}) | |
| %permute_155 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_541, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_139 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_155,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_234 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_139, [16, 4096, 64]), kwargs = {}) | |
| %_param_constant633 : [#users=1] = get_attr[target=_param_constant633] | |
| %t_207 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant633,), kwargs = {}) | |
| %view_542 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_94 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_542, %t_207), kwargs = {}) | |
| %_unsafe_view_235 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_94, [2, 77, 512]), kwargs = {}) | |
| %_param_constant634 : [#users=1] = get_attr[target=_param_constant634] | |
| %t_208 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant634,), kwargs = {}) | |
| %view_543 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {}) | |
| %mm_95 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_543, %t_208), kwargs = {}) | |
| %_unsafe_view_236 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_95, [2, 77, 512]), kwargs = {}) | |
| %view_544 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_235, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_156 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_544, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_140 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_156,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_237 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_140, [16, 77, 64]), kwargs = {}) | |
| %view_545 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_236, [2, 77, 8, 64]), kwargs = {}) | |
| %permute_157 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_545, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_141 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_157,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_238 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_141, [16, 77, 64]), kwargs = {}) | |
| %empty_31 : [#users=1] = call_function[target=torch.ops.aten.empty](args = ([16, 4096, 77],), kwargs = {dtype: torch.float16, device: cuda:0, pin_memory: False}) | |
| %transpose_31 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%_unsafe_view_237, -1, -2), kwargs = {}) | |
| %baddbmm_31 : [#users=1] = call_function[target=torch.ops.aten.baddbmm](args = (%empty_31, %_unsafe_view_234, %transpose_31), kwargs = {beta: 0, alpha: 0.125}) | |
| %_softmax_31 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%baddbmm_31, -1, False), kwargs = {}) | |
| %detach_139 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_31,), kwargs = {}) | |
| %bmm_47 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_softmax_31, %_unsafe_view_238), kwargs = {}) | |
| %view_546 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_47, [2, 8, 4096, 64]), kwargs = {}) | |
| %permute_158 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_546, [0, 2, 1, 3]), kwargs = {}) | |
| %clone_142 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_158,), kwargs = {memory_format: torch.contiguous_format}) | |
| %_unsafe_view_239 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_142, [2, 4096, 512]), kwargs = {}) | |
| %_param_constant635 : [#users=1] = get_attr[target=_param_constant635] | |
| %t_209 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant635,), kwargs = {}) | |
| %view_547 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_239, [8192, 512]), kwargs = {}) | |
| %_param_constant636 : [#users=1] = get_attr[target=_param_constant636] | |
| %addmm_97 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant636, %view_547, %t_209), kwargs = {}) | |
| %view_548 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_97, [2, 4096, 512]), kwargs = {}) | |
| %add_318 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_548, %add_315), kwargs = {}) | |
| %convert_element_type_310 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_318, torch.float32), kwargs = {}) | |
| %var_mean_101 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_310, [2]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_202 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_101, 0), kwargs = {}) | |
| %getitem_203 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_101, 1), kwargs = {}) | |
| %add_319 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_202, 1e-05), kwargs = {}) | |
| %rsqrt_101 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_319,), kwargs = {}) | |
| %sub_101 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_318, %getitem_203), kwargs = {}) | |
| %mul_220 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_101, %rsqrt_101), kwargs = {}) | |
| %_param_constant637 : [#users=1] = get_attr[target=_param_constant637] | |
| %mul_221 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_220, %_param_constant637), kwargs = {}) | |
| %_param_constant638 : [#users=1] = get_attr[target=_param_constant638] | |
| %add_320 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_221, %_param_constant638), kwargs = {}) | |
| %convert_element_type_311 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_320, torch.float16), kwargs = {}) | |
| %_param_constant639 : [#users=1] = get_attr[target=_param_constant639] | |
| %t_210 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant639,), kwargs = {}) | |
| %view_549 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%convert_element_type_311, [8192, 512]), kwargs = {}) | |
| %_param_constant640 : [#users=1] = get_attr[target=_param_constant640] | |
| %addmm_98 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant640, %view_549, %t_210), kwargs = {}) | |
| %view_550 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_98, [2, 4096, 4096]), kwargs = {}) | |
| %slice_75 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_550, -1, 0, 2048), kwargs = {}) | |
| %slice_76 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_550, -1, 2048, 4096), kwargs = {}) | |
| %gelu_15 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_76,), kwargs = {}) | |
| %mul_222 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_75, %gelu_15), kwargs = {}) | |
| %_param_constant641 : [#users=1] = get_attr[target=_param_constant641] | |
| %t_211 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant641,), kwargs = {}) | |
| %view_551 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_222, [8192, 2048]), kwargs = {}) | |
| %_param_constant642 : [#users=1] = get_attr[target=_param_constant642] | |
| %addmm_99 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant642, %view_551, %t_211), kwargs = {}) | |
| %view_552 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_99, [2, 4096, 512]), kwargs = {}) | |
| %add_321 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_552, %add_318), kwargs = {}) | |
| %_param_constant643 : [#users=1] = get_attr[target=_param_constant643] | |
| %t_212 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant643,), kwargs = {}) | |
| %view_553 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_321, [8192, 512]), kwargs = {}) | |
| %_param_constant644 : [#users=1] = get_attr[target=_param_constant644] | |
| %addmm_100 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant644, %view_553, %t_212), kwargs = {}) | |
| %view_554 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_100, [2, 4096, 512]), kwargs = {}) | |
| %view_555 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_554, [2, 64, 64, 512]), kwargs = {}) | |
| %permute_159 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_555, [0, 3, 1, 2]), kwargs = {}) | |
| %clone_143 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_159,), kwargs = {memory_format: torch.contiguous_format}) | |
| %add_322 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_143, %div_19), kwargs = {}) | |
| %upsample_nearest2d_2 : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%add_322, [128, 128], 2.0, 2.0), kwargs = {}) | |
| %_param_constant645 : [#users=1] = get_attr[target=_param_constant645] | |
| %_param_constant646 : [#users=1] = get_attr[target=_param_constant646] | |
| %convolution_55 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d_2, %_param_constant645, %_param_constant646, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %cat_11 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_55, %div_2], 1), kwargs = {}) | |
| %view_556 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_11, [2, 32, 24, 16384]), kwargs = {}) | |
| %convert_element_type_312 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_556, torch.float32), kwargs = {}) | |
| %var_mean_102 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_312, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_204 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_102, 0), kwargs = {}) | |
| %getitem_205 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_102, 1), kwargs = {}) | |
| %add_323 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_204, 1e-05), kwargs = {}) | |
| %rsqrt_102 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_323,), kwargs = {}) | |
| %sub_102 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_556, %getitem_205), kwargs = {}) | |
| %mul_223 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_102, %rsqrt_102), kwargs = {}) | |
| %view_557 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_223, [2, 768, 128, 128]), kwargs = {}) | |
| %_param_constant647 : [#users=1] = get_attr[target=_param_constant647] | |
| %unsqueeze_364 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant647, 0), kwargs = {}) | |
| %unsqueeze_365 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_364, 2), kwargs = {}) | |
| %unsqueeze_366 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_365, 3), kwargs = {}) | |
| %_param_constant648 : [#users=1] = get_attr[target=_param_constant648] | |
| %unsqueeze_367 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant648, 0), kwargs = {}) | |
| %unsqueeze_368 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_367, 2), kwargs = {}) | |
| %unsqueeze_369 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_368, 3), kwargs = {}) | |
| %mul_224 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_557, %unsqueeze_369), kwargs = {}) | |
| %add_324 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_224, %unsqueeze_366), kwargs = {}) | |
| %convert_element_type_313 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_324, torch.float16), kwargs = {}) | |
| %convert_element_type_314 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_205, torch.float16), kwargs = {}) | |
| %convert_element_type_315 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_102, torch.float16), kwargs = {}) | |
| %squeeze_216 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_314, 3), kwargs = {}) | |
| %squeeze_217 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_216, 2), kwargs = {}) | |
| %squeeze_218 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_315, 3), kwargs = {}) | |
| %squeeze_219 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_218, 2), kwargs = {}) | |
| %detach_140 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_217,), kwargs = {}) | |
| %detach_141 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_219,), kwargs = {}) | |
| %silu_58 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_313,), kwargs = {}) | |
| %_param_constant649 : [#users=1] = get_attr[target=_param_constant649] | |
| %_param_constant650 : [#users=1] = get_attr[target=_param_constant650] | |
| %convolution_56 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_58, %_param_constant649, %_param_constant650, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_59 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant651 : [#users=1] = get_attr[target=_param_constant651] | |
| %t_213 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant651,), kwargs = {}) | |
| %_param_constant652 : [#users=1] = get_attr[target=_param_constant652] | |
| %addmm_101 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant652, %silu_59, %t_213), kwargs = {}) | |
| %slice_77 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_101, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_78 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_77, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_370 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_78, 2), kwargs = {}) | |
| %unsqueeze_371 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_370, 3), kwargs = {}) | |
| %add_325 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_56, %unsqueeze_371), kwargs = {}) | |
| %view_558 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_325, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_316 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_558, torch.float32), kwargs = {}) | |
| %var_mean_103 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_316, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_206 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_103, 0), kwargs = {}) | |
| %getitem_207 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_103, 1), kwargs = {}) | |
| %add_326 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_206, 1e-05), kwargs = {}) | |
| %rsqrt_103 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_326,), kwargs = {}) | |
| %sub_103 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_558, %getitem_207), kwargs = {}) | |
| %mul_225 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_103, %rsqrt_103), kwargs = {}) | |
| %view_559 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_225, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant653 : [#users=1] = get_attr[target=_param_constant653] | |
| %unsqueeze_372 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant653, 0), kwargs = {}) | |
| %unsqueeze_373 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_372, 2), kwargs = {}) | |
| %unsqueeze_374 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_373, 3), kwargs = {}) | |
| %_param_constant654 : [#users=1] = get_attr[target=_param_constant654] | |
| %unsqueeze_375 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant654, 0), kwargs = {}) | |
| %unsqueeze_376 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_375, 2), kwargs = {}) | |
| %unsqueeze_377 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_376, 3), kwargs = {}) | |
| %mul_226 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_559, %unsqueeze_377), kwargs = {}) | |
| %add_327 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_226, %unsqueeze_374), kwargs = {}) | |
| %convert_element_type_317 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_327, torch.float16), kwargs = {}) | |
| %convert_element_type_318 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_207, torch.float16), kwargs = {}) | |
| %convert_element_type_319 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_103, torch.float16), kwargs = {}) | |
| %squeeze_220 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_318, 3), kwargs = {}) | |
| %squeeze_221 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_220, 2), kwargs = {}) | |
| %squeeze_222 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_319, 3), kwargs = {}) | |
| %squeeze_223 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_222, 2), kwargs = {}) | |
| %detach_142 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_221,), kwargs = {}) | |
| %detach_143 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_223,), kwargs = {}) | |
| %silu_60 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_317,), kwargs = {}) | |
| %_param_constant655 : [#users=1] = get_attr[target=_param_constant655] | |
| %_param_constant656 : [#users=1] = get_attr[target=_param_constant656] | |
| %convolution_57 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_60, %_param_constant655, %_param_constant656, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant657 : [#users=1] = get_attr[target=_param_constant657] | |
| %_param_constant658 : [#users=1] = get_attr[target=_param_constant658] | |
| %convolution_58 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_11, %_param_constant657, %_param_constant658, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_328 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_58, %convolution_57), kwargs = {}) | |
| %div_20 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_328, 1.0), kwargs = {}) | |
| %cat_12 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_20, %div_1], 1), kwargs = {}) | |
| %view_560 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_12, [2, 32, 16, 16384]), kwargs = {}) | |
| %convert_element_type_320 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_560, torch.float32), kwargs = {}) | |
| %var_mean_104 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_320, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_208 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_104, 0), kwargs = {}) | |
| %getitem_209 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_104, 1), kwargs = {}) | |
| %add_329 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_208, 1e-05), kwargs = {}) | |
| %rsqrt_104 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_329,), kwargs = {}) | |
| %sub_104 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_560, %getitem_209), kwargs = {}) | |
| %mul_227 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_104, %rsqrt_104), kwargs = {}) | |
| %view_561 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_227, [2, 512, 128, 128]), kwargs = {}) | |
| %_param_constant659 : [#users=1] = get_attr[target=_param_constant659] | |
| %unsqueeze_378 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant659, 0), kwargs = {}) | |
| %unsqueeze_379 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_378, 2), kwargs = {}) | |
| %unsqueeze_380 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_379, 3), kwargs = {}) | |
| %_param_constant660 : [#users=1] = get_attr[target=_param_constant660] | |
| %unsqueeze_381 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant660, 0), kwargs = {}) | |
| %unsqueeze_382 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_381, 2), kwargs = {}) | |
| %unsqueeze_383 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_382, 3), kwargs = {}) | |
| %mul_228 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_561, %unsqueeze_383), kwargs = {}) | |
| %add_330 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_228, %unsqueeze_380), kwargs = {}) | |
| %convert_element_type_321 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_330, torch.float16), kwargs = {}) | |
| %convert_element_type_322 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_209, torch.float16), kwargs = {}) | |
| %convert_element_type_323 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_104, torch.float16), kwargs = {}) | |
| %squeeze_224 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_322, 3), kwargs = {}) | |
| %squeeze_225 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_224, 2), kwargs = {}) | |
| %squeeze_226 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_323, 3), kwargs = {}) | |
| %squeeze_227 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_226, 2), kwargs = {}) | |
| %detach_144 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_225,), kwargs = {}) | |
| %detach_145 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_227,), kwargs = {}) | |
| %silu_61 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_321,), kwargs = {}) | |
| %_param_constant661 : [#users=1] = get_attr[target=_param_constant661] | |
| %_param_constant662 : [#users=1] = get_attr[target=_param_constant662] | |
| %convolution_59 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_61, %_param_constant661, %_param_constant662, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_62 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant663 : [#users=1] = get_attr[target=_param_constant663] | |
| %t_214 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant663,), kwargs = {}) | |
| %_param_constant664 : [#users=1] = get_attr[target=_param_constant664] | |
| %addmm_102 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant664, %silu_62, %t_214), kwargs = {}) | |
| %slice_79 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_102, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_80 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_79, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_384 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_80, 2), kwargs = {}) | |
| %unsqueeze_385 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_384, 3), kwargs = {}) | |
| %add_331 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_59, %unsqueeze_385), kwargs = {}) | |
| %view_562 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_331, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_324 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_562, torch.float32), kwargs = {}) | |
| %var_mean_105 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_324, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_210 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_105, 0), kwargs = {}) | |
| %getitem_211 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_105, 1), kwargs = {}) | |
| %add_332 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_210, 1e-05), kwargs = {}) | |
| %rsqrt_105 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_332,), kwargs = {}) | |
| %sub_105 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_562, %getitem_211), kwargs = {}) | |
| %mul_229 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_105, %rsqrt_105), kwargs = {}) | |
| %view_563 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_229, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant665 : [#users=1] = get_attr[target=_param_constant665] | |
| %unsqueeze_386 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant665, 0), kwargs = {}) | |
| %unsqueeze_387 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_386, 2), kwargs = {}) | |
| %unsqueeze_388 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_387, 3), kwargs = {}) | |
| %_param_constant666 : [#users=1] = get_attr[target=_param_constant666] | |
| %unsqueeze_389 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant666, 0), kwargs = {}) | |
| %unsqueeze_390 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_389, 2), kwargs = {}) | |
| %unsqueeze_391 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_390, 3), kwargs = {}) | |
| %mul_230 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_563, %unsqueeze_391), kwargs = {}) | |
| %add_333 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_230, %unsqueeze_388), kwargs = {}) | |
| %convert_element_type_325 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_333, torch.float16), kwargs = {}) | |
| %convert_element_type_326 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_211, torch.float16), kwargs = {}) | |
| %convert_element_type_327 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_105, torch.float16), kwargs = {}) | |
| %squeeze_228 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_326, 3), kwargs = {}) | |
| %squeeze_229 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_228, 2), kwargs = {}) | |
| %squeeze_230 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_327, 3), kwargs = {}) | |
| %squeeze_231 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_230, 2), kwargs = {}) | |
| %detach_146 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_229,), kwargs = {}) | |
| %detach_147 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_231,), kwargs = {}) | |
| %silu_63 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_325,), kwargs = {}) | |
| %_param_constant667 : [#users=1] = get_attr[target=_param_constant667] | |
| %_param_constant668 : [#users=1] = get_attr[target=_param_constant668] | |
| %convolution_60 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_63, %_param_constant667, %_param_constant668, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant669 : [#users=1] = get_attr[target=_param_constant669] | |
| %_param_constant670 : [#users=1] = get_attr[target=_param_constant670] | |
| %convolution_61 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_12, %_param_constant669, %_param_constant670, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_334 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_61, %convolution_60), kwargs = {}) | |
| %div_21 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_334, 1.0), kwargs = {}) | |
| %cat_13 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_21, %convolution], 1), kwargs = {}) | |
| %view_564 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_13, [2, 32, 16, 16384]), kwargs = {}) | |
| %convert_element_type_328 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_564, torch.float32), kwargs = {}) | |
| %var_mean_106 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_328, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_212 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_106, 0), kwargs = {}) | |
| %getitem_213 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_106, 1), kwargs = {}) | |
| %add_335 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_212, 1e-05), kwargs = {}) | |
| %rsqrt_106 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_335,), kwargs = {}) | |
| %sub_106 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_564, %getitem_213), kwargs = {}) | |
| %mul_231 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_106, %rsqrt_106), kwargs = {}) | |
| %view_565 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_231, [2, 512, 128, 128]), kwargs = {}) | |
| %_param_constant671 : [#users=1] = get_attr[target=_param_constant671] | |
| %unsqueeze_392 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant671, 0), kwargs = {}) | |
| %unsqueeze_393 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_392, 2), kwargs = {}) | |
| %unsqueeze_394 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_393, 3), kwargs = {}) | |
| %_param_constant672 : [#users=1] = get_attr[target=_param_constant672] | |
| %unsqueeze_395 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant672, 0), kwargs = {}) | |
| %unsqueeze_396 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_395, 2), kwargs = {}) | |
| %unsqueeze_397 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_396, 3), kwargs = {}) | |
| %mul_232 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_565, %unsqueeze_397), kwargs = {}) | |
| %add_336 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_232, %unsqueeze_394), kwargs = {}) | |
| %convert_element_type_329 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_336, torch.float16), kwargs = {}) | |
| %convert_element_type_330 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_213, torch.float16), kwargs = {}) | |
| %convert_element_type_331 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_106, torch.float16), kwargs = {}) | |
| %squeeze_232 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_330, 3), kwargs = {}) | |
| %squeeze_233 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_232, 2), kwargs = {}) | |
| %squeeze_234 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_331, 3), kwargs = {}) | |
| %squeeze_235 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_234, 2), kwargs = {}) | |
| %detach_148 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_233,), kwargs = {}) | |
| %detach_149 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_235,), kwargs = {}) | |
| %silu_64 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_329,), kwargs = {}) | |
| %_param_constant673 : [#users=1] = get_attr[target=_param_constant673] | |
| %_param_constant674 : [#users=1] = get_attr[target=_param_constant674] | |
| %convolution_62 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_64, %_param_constant673, %_param_constant674, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %silu_65 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%add,), kwargs = {}) | |
| %_param_constant675 : [#users=1] = get_attr[target=_param_constant675] | |
| %t_215 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant675,), kwargs = {}) | |
| %_param_constant676 : [#users=1] = get_attr[target=_param_constant676] | |
| %addmm_103 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant676, %silu_65, %t_215), kwargs = {}) | |
| %slice_81 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_103, 0, 0, 9223372036854775807), kwargs = {}) | |
| %slice_82 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_81, 1, 0, 9223372036854775807), kwargs = {}) | |
| %unsqueeze_398 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_82, 2), kwargs = {}) | |
| %unsqueeze_399 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_398, 3), kwargs = {}) | |
| %add_337 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_62, %unsqueeze_399), kwargs = {}) | |
| %view_566 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_337, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_332 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_566, torch.float32), kwargs = {}) | |
| %var_mean_107 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_332, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_214 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_107, 0), kwargs = {}) | |
| %getitem_215 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_107, 1), kwargs = {}) | |
| %add_338 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_214, 1e-05), kwargs = {}) | |
| %rsqrt_107 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_338,), kwargs = {}) | |
| %sub_107 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_566, %getitem_215), kwargs = {}) | |
| %mul_233 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_107, %rsqrt_107), kwargs = {}) | |
| %view_567 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_233, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant677 : [#users=1] = get_attr[target=_param_constant677] | |
| %unsqueeze_400 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant677, 0), kwargs = {}) | |
| %unsqueeze_401 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_400, 2), kwargs = {}) | |
| %unsqueeze_402 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_401, 3), kwargs = {}) | |
| %_param_constant678 : [#users=1] = get_attr[target=_param_constant678] | |
| %unsqueeze_403 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant678, 0), kwargs = {}) | |
| %unsqueeze_404 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_403, 2), kwargs = {}) | |
| %unsqueeze_405 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_404, 3), kwargs = {}) | |
| %mul_234 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_567, %unsqueeze_405), kwargs = {}) | |
| %add_339 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_234, %unsqueeze_402), kwargs = {}) | |
| %convert_element_type_333 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_339, torch.float16), kwargs = {}) | |
| %convert_element_type_334 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_215, torch.float16), kwargs = {}) | |
| %convert_element_type_335 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_107, torch.float16), kwargs = {}) | |
| %squeeze_236 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_334, 3), kwargs = {}) | |
| %squeeze_237 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_236, 2), kwargs = {}) | |
| %squeeze_238 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_335, 3), kwargs = {}) | |
| %squeeze_239 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_238, 2), kwargs = {}) | |
| %detach_150 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_237,), kwargs = {}) | |
| %detach_151 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_239,), kwargs = {}) | |
| %silu_66 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_333,), kwargs = {}) | |
| %_param_constant679 : [#users=1] = get_attr[target=_param_constant679] | |
| %_param_constant680 : [#users=1] = get_attr[target=_param_constant680] | |
| %convolution_63 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_66, %_param_constant679, %_param_constant680, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %_param_constant681 : [#users=1] = get_attr[target=_param_constant681] | |
| %_param_constant682 : [#users=1] = get_attr[target=_param_constant682] | |
| %convolution_64 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_13, %_param_constant681, %_param_constant682, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| %add_340 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_64, %convolution_63), kwargs = {}) | |
| %div_22 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_340, 1.0), kwargs = {}) | |
| %view_568 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_22, [2, 32, 8, 16384]), kwargs = {}) | |
| %convert_element_type_336 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%view_568, torch.float32), kwargs = {}) | |
| %var_mean_108 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%convert_element_type_336, [2, 3]), kwargs = {correction: 0, keepdim: True}) | |
| %getitem_216 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_108, 0), kwargs = {}) | |
| %getitem_217 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_108, 1), kwargs = {}) | |
| %add_341 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_216, 1e-05), kwargs = {}) | |
| %rsqrt_108 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_341,), kwargs = {}) | |
| %sub_108 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_568, %getitem_217), kwargs = {}) | |
| %mul_235 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_108, %rsqrt_108), kwargs = {}) | |
| %view_569 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_235, [2, 256, 128, 128]), kwargs = {}) | |
| %_param_constant683 : [#users=1] = get_attr[target=_param_constant683] | |
| %unsqueeze_406 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant683, 0), kwargs = {}) | |
| %unsqueeze_407 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_406, 2), kwargs = {}) | |
| %unsqueeze_408 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_407, 3), kwargs = {}) | |
| %_param_constant684 : [#users=1] = get_attr[target=_param_constant684] | |
| %unsqueeze_409 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant684, 0), kwargs = {}) | |
| %unsqueeze_410 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_409, 2), kwargs = {}) | |
| %unsqueeze_411 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_410, 3), kwargs = {}) | |
| %mul_236 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_569, %unsqueeze_411), kwargs = {}) | |
| %add_342 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_236, %unsqueeze_408), kwargs = {}) | |
| %convert_element_type_337 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%add_342, torch.float16), kwargs = {}) | |
| %convert_element_type_338 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%getitem_217, torch.float16), kwargs = {}) | |
| %convert_element_type_339 : [#users=1] = call_function[target=torch.ops.prims.convert_element_type](args = (%rsqrt_108, torch.float16), kwargs = {}) | |
| %squeeze_240 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_338, 3), kwargs = {}) | |
| %squeeze_241 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_240, 2), kwargs = {}) | |
| %squeeze_242 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%convert_element_type_339, 3), kwargs = {}) | |
| %squeeze_243 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%squeeze_242, 2), kwargs = {}) | |
| %detach_152 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_241,), kwargs = {}) | |
| %detach_153 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_243,), kwargs = {}) | |
| %silu_67 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%convert_element_type_337,), kwargs = {}) | |
| %_param_constant685 : [#users=1] = get_attr[target=_param_constant685] | |
| %_param_constant686 : [#users=1] = get_attr[target=_param_constant686] | |
| %convolution_65 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_67, %_param_constant685, %_param_constant686, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) | |
| return convolution_65 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment