Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created April 5, 2023 16:59
Show Gist options
  • Save pashu123/d23855f9938e3e74222ab32c22b4889f to your computer and use it in GitHub Desktop.
Save pashu123/d23855f9938e3e74222ab32c22b4889f to your computer and use it in GitHub Desktop.
graph():
%arg0_1 : [#users=1] = placeholder[target=arg0_1]
%arg1_1 : [#users=1] = placeholder[target=arg1_1]
%arg2_1 : [#users=32] = placeholder[target=arg2_1]
%expand : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%arg1_1, [2]), kwargs = {})
%arange : [#users=1] = call_function[target=torch.ops.aten.arange](args = (0, 160), kwargs = {dtype: torch.float32, device: cuda:0, pin_memory: False})
%mul : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%arange, -9.210340371976184), kwargs = {})
%div : [#users=1] = call_function[target=torch.ops.aten.div](args = (%mul, 160), kwargs = {})
%exp : [#users=1] = call_function[target=torch.ops.aten.exp](args = (%div,), kwargs = {})
%slice_1 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%expand, 0, 0, 9223372036854775807), kwargs = {})
%unsqueeze : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_1, 1), kwargs = {})
%_to_copy : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%unsqueeze,), kwargs = {dtype: torch.float32})
%unsqueeze_1 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%exp, 0), kwargs = {})
%slice_2 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%unsqueeze_1, 1, 0, 9223372036854775807), kwargs = {})
%mul_1 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_to_copy, %slice_2), kwargs = {})
%mul_2 : [#users=2] = call_function[target=torch.ops.aten.mul](args = (%mul_1, 1), kwargs = {})
%sin : [#users=1] = call_function[target=torch.ops.aten.sin](args = (%mul_2,), kwargs = {})
%cos : [#users=1] = call_function[target=torch.ops.aten.cos](args = (%mul_2,), kwargs = {})
%cat : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%sin, %cos], -1), kwargs = {})
%slice_3 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%cat, 0, 0, 9223372036854775807), kwargs = {})
%slice_4 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_3, 1, 160, 9223372036854775807), kwargs = {})
%slice_5 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%cat, 0, 0, 9223372036854775807), kwargs = {})
%slice_6 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_5, 1, 0, 160), kwargs = {})
%cat_1 : [#users=1] = call_function[target=torch.ops.aten.cat](args = ([%slice_4, %slice_6], -1), kwargs = {})
%_to_copy_1 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%cat_1,), kwargs = {dtype: torch.float16})
%_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
%t : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant0,), kwargs = {})
%_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
%addmm : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant1, %_to_copy_1, %t), kwargs = {})
%silu : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm,), kwargs = {})
%_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
%t_1 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant2,), kwargs = {})
%_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
%addmm_1 : [#users=22] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant3, %silu, %t_1), kwargs = {})
%_param_constant4 : [#users=1] = get_attr[target=_param_constant4]
%_param_constant5 : [#users=1] = get_attr[target=_param_constant5]
%convolution : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%arg0_1, %_param_constant4, %_param_constant5, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%view : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_2 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view,), kwargs = {dtype: torch.float32})
%var_mean : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_2, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem : [#users=1] = call_function[target=operator.getitem](args = (%var_mean, 0), kwargs = {})
%getitem_1 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean, 1), kwargs = {})
%add : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem, 1e-05), kwargs = {})
%rsqrt : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add,), kwargs = {})
%sub : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view, %getitem_1), kwargs = {})
%mul_3 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub, %rsqrt), kwargs = {})
%view_1 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_3, [2, 320, 96, 96]), kwargs = {})
%_param_constant6 : [#users=1] = get_attr[target=_param_constant6]
%unsqueeze_2 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant6, 0), kwargs = {})
%unsqueeze_3 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_2, 2), kwargs = {})
%unsqueeze_4 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_3, 3), kwargs = {})
%_param_constant7 : [#users=1] = get_attr[target=_param_constant7]
%unsqueeze_5 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant7, 0), kwargs = {})
%unsqueeze_6 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_5, 2), kwargs = {})
%unsqueeze_7 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_6, 3), kwargs = {})
%mul_4 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_1, %unsqueeze_7), kwargs = {})
%add_1 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_4, %unsqueeze_4), kwargs = {})
%_to_copy_3 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_1,), kwargs = {dtype: torch.float16})
%_to_copy_4 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_1,), kwargs = {dtype: torch.float16})
%_to_copy_5 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt,), kwargs = {dtype: torch.float16})
%squeeze : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_4, [2, 3]), kwargs = {})
%squeeze_1 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_5, [2, 3]), kwargs = {})
%detach : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze,), kwargs = {})
%detach_1 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_1,), kwargs = {})
%silu_1 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_3,), kwargs = {})
%_param_constant8 : [#users=1] = get_attr[target=_param_constant8]
%_param_constant9 : [#users=1] = get_attr[target=_param_constant9]
%convolution_1 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_1, %_param_constant8, %_param_constant9, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_2 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant10 : [#users=1] = get_attr[target=_param_constant10]
%t_2 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant10,), kwargs = {})
%_param_constant11 : [#users=1] = get_attr[target=_param_constant11]
%addmm_2 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant11, %silu_2, %t_2), kwargs = {})
%slice_7 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_2, 0, 0, 9223372036854775807), kwargs = {})
%slice_8 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_7, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_8 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_8, 2), kwargs = {})
%unsqueeze_9 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_8, 3), kwargs = {})
%add_2 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_1, %unsqueeze_9), kwargs = {})
%view_2 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_2, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_6 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_2,), kwargs = {dtype: torch.float32})
%var_mean_1 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_6, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_2 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_1, 0), kwargs = {})
%getitem_3 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_1, 1), kwargs = {})
%add_3 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_2, 1e-05), kwargs = {})
%rsqrt_1 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_3,), kwargs = {})
%sub_1 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_2, %getitem_3), kwargs = {})
%mul_5 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_1, %rsqrt_1), kwargs = {})
%view_3 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_5, [2, 320, 96, 96]), kwargs = {})
%_param_constant12 : [#users=1] = get_attr[target=_param_constant12]
%unsqueeze_10 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant12, 0), kwargs = {})
%unsqueeze_11 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_10, 2), kwargs = {})
%unsqueeze_12 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_11, 3), kwargs = {})
%_param_constant13 : [#users=1] = get_attr[target=_param_constant13]
%unsqueeze_13 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant13, 0), kwargs = {})
%unsqueeze_14 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_13, 2), kwargs = {})
%unsqueeze_15 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_14, 3), kwargs = {})
%mul_6 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_3, %unsqueeze_15), kwargs = {})
%add_4 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_6, %unsqueeze_12), kwargs = {})
%_to_copy_7 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_4,), kwargs = {dtype: torch.float16})
%_to_copy_8 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_3,), kwargs = {dtype: torch.float16})
%_to_copy_9 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_1,), kwargs = {dtype: torch.float16})
%squeeze_2 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_8, [2, 3]), kwargs = {})
%squeeze_3 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_9, [2, 3]), kwargs = {})
%detach_2 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_2,), kwargs = {})
%detach_3 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_3,), kwargs = {})
%silu_3 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_7,), kwargs = {})
%_param_constant14 : [#users=1] = get_attr[target=_param_constant14]
%_param_constant15 : [#users=1] = get_attr[target=_param_constant15]
%convolution_2 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_3, %_param_constant14, %_param_constant15, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_5 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution, %convolution_2), kwargs = {})
%div_1 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_5, 1.0), kwargs = {})
%view_4 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_1, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_10 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_4,), kwargs = {dtype: torch.float32})
%var_mean_2 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_10, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_2, 0), kwargs = {})
%getitem_5 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_2, 1), kwargs = {})
%add_6 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_4, 1e-06), kwargs = {})
%rsqrt_2 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_6,), kwargs = {})
%sub_2 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_4, %getitem_5), kwargs = {})
%mul_7 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_2, %rsqrt_2), kwargs = {})
%view_5 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_7, [2, 320, 96, 96]), kwargs = {})
%_param_constant16 : [#users=1] = get_attr[target=_param_constant16]
%unsqueeze_16 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant16, 0), kwargs = {})
%unsqueeze_17 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_16, 2), kwargs = {})
%unsqueeze_18 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_17, 3), kwargs = {})
%_param_constant17 : [#users=1] = get_attr[target=_param_constant17]
%unsqueeze_19 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant17, 0), kwargs = {})
%unsqueeze_20 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_19, 2), kwargs = {})
%unsqueeze_21 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_20, 3), kwargs = {})
%mul_8 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_5, %unsqueeze_21), kwargs = {})
%add_7 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_8, %unsqueeze_18), kwargs = {})
%_to_copy_11 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_7,), kwargs = {dtype: torch.float16})
%_to_copy_12 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_5,), kwargs = {dtype: torch.float16})
%_to_copy_13 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_2,), kwargs = {dtype: torch.float16})
%squeeze_4 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_12, [2, 3]), kwargs = {})
%squeeze_5 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_13, [2, 3]), kwargs = {})
%detach_4 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_4,), kwargs = {})
%detach_5 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_5,), kwargs = {})
%permute : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_11, [0, 2, 3, 1]), kwargs = {})
%view_6 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute, [2, 9216, 320]), kwargs = {})
%_param_constant18 : [#users=1] = get_attr[target=_param_constant18]
%t_3 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant18,), kwargs = {})
%clone : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_6,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone, [18432, 320]), kwargs = {})
%mm : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view, %t_3), kwargs = {})
%_unsafe_view_1 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm, [2, 9216, 320]), kwargs = {})
%_param_constant19 : [#users=1] = get_attr[target=_param_constant19]
%add_8 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_1, %_param_constant19), kwargs = {})
%_to_copy_14 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_8,), kwargs = {dtype: torch.float32})
%var_mean_3 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_14, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_6 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_3, 0), kwargs = {})
%getitem_7 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_3, 1), kwargs = {})
%add_9 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_6, 1e-05), kwargs = {})
%rsqrt_3 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_9,), kwargs = {})
%sub_3 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_8, %getitem_7), kwargs = {})
%mul_9 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_3, %rsqrt_3), kwargs = {})
%_param_constant20 : [#users=1] = get_attr[target=_param_constant20]
%mul_10 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_9, %_param_constant20), kwargs = {})
%_param_constant21 : [#users=1] = get_attr[target=_param_constant21]
%add_10 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_10, %_param_constant21), kwargs = {})
%_to_copy_15 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_10,), kwargs = {dtype: torch.float16})
%_param_constant22 : [#users=1] = get_attr[target=_param_constant22]
%t_4 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant22,), kwargs = {})
%view_7 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_15, [18432, 320]), kwargs = {})
%mm_1 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_7, %t_4), kwargs = {})
%_unsafe_view_2 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_1, [2, 9216, 320]), kwargs = {})
%_param_constant23 : [#users=1] = get_attr[target=_param_constant23]
%t_5 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant23,), kwargs = {})
%view_8 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_15, [18432, 320]), kwargs = {})
%mm_2 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_8, %t_5), kwargs = {})
%_unsafe_view_3 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_2, [2, 9216, 320]), kwargs = {})
%_param_constant24 : [#users=1] = get_attr[target=_param_constant24]
%t_6 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant24,), kwargs = {})
%view_9 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_15, [18432, 320]), kwargs = {})
%mm_3 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_9, %t_6), kwargs = {})
%_unsafe_view_4 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_3, [2, 9216, 320]), kwargs = {})
%view_10 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_2, [2, -1, 5, 64]), kwargs = {})
%transpose : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_10, 1, 2), kwargs = {})
%view_11 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_3, [2, -1, 5, 64]), kwargs = {})
%transpose_1 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_11, 1, 2), kwargs = {})
%view_12 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_4, [2, -1, 5, 64]), kwargs = {})
%transpose_2 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_12, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose, %transpose_1, %transpose_2, True), kwargs = {})
%getitem_8 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention, 0), kwargs = {})
%getitem_9 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention, 1), kwargs = {})
%detach_6 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_8,), kwargs = {})
%transpose_3 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_8, 1, 2), kwargs = {})
%view_13 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_3, [2, -1, 320]), kwargs = {})
%_param_constant25 : [#users=1] = get_attr[target=_param_constant25]
%t_7 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant25,), kwargs = {})
%view_14 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_13, [18432, 320]), kwargs = {})
%_param_constant26 : [#users=1] = get_attr[target=_param_constant26]
%addmm_3 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant26, %view_14, %t_7), kwargs = {})
%view_15 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_3, [2, 9216, 320]), kwargs = {})
%add_11 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_15, %add_8), kwargs = {})
%_to_copy_16 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_11,), kwargs = {dtype: torch.float32})
%var_mean_4 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_16, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_10 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_4, 0), kwargs = {})
%getitem_11 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_4, 1), kwargs = {})
%add_12 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_10, 1e-05), kwargs = {})
%rsqrt_4 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_12,), kwargs = {})
%sub_4 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_11, %getitem_11), kwargs = {})
%mul_11 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_4, %rsqrt_4), kwargs = {})
%_param_constant27 : [#users=1] = get_attr[target=_param_constant27]
%mul_12 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_11, %_param_constant27), kwargs = {})
%_param_constant28 : [#users=1] = get_attr[target=_param_constant28]
%add_13 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_12, %_param_constant28), kwargs = {})
%_to_copy_17 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_13,), kwargs = {dtype: torch.float16})
%_param_constant29 : [#users=1] = get_attr[target=_param_constant29]
%t_8 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant29,), kwargs = {})
%view_16 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_17, [18432, 320]), kwargs = {})
%mm_4 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_16, %t_8), kwargs = {})
%_unsafe_view_5 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_4, [2, 9216, 320]), kwargs = {})
%_param_constant30 : [#users=1] = get_attr[target=_param_constant30]
%t_9 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant30,), kwargs = {})
%view_17 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_5 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_17, %t_9), kwargs = {})
%_unsafe_view_6 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_5, [2, 77, 320]), kwargs = {})
%_param_constant31 : [#users=1] = get_attr[target=_param_constant31]
%t_10 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant31,), kwargs = {})
%view_18 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_6 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_18, %t_10), kwargs = {})
%_unsafe_view_7 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_6, [2, 77, 320]), kwargs = {})
%view_19 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_5, [2, -1, 5, 64]), kwargs = {})
%transpose_4 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_19, 1, 2), kwargs = {})
%view_20 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_6, [2, -1, 5, 64]), kwargs = {})
%transpose_5 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_20, 1, 2), kwargs = {})
%view_21 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_7, [2, -1, 5, 64]), kwargs = {})
%transpose_6 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_21, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_1 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_4, %transpose_5, %transpose_6, True), kwargs = {})
%getitem_12 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_1, 0), kwargs = {})
%getitem_13 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_1, 1), kwargs = {})
%detach_7 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_12,), kwargs = {})
%transpose_7 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_12, 1, 2), kwargs = {})
%view_22 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_7, [2, -1, 320]), kwargs = {})
%_param_constant32 : [#users=1] = get_attr[target=_param_constant32]
%t_11 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant32,), kwargs = {})
%view_23 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_22, [18432, 320]), kwargs = {})
%_param_constant33 : [#users=1] = get_attr[target=_param_constant33]
%addmm_4 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant33, %view_23, %t_11), kwargs = {})
%view_24 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_4, [2, 9216, 320]), kwargs = {})
%add_14 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_24, %add_11), kwargs = {})
%_to_copy_18 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_14,), kwargs = {dtype: torch.float32})
%var_mean_5 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_18, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_14 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_5, 0), kwargs = {})
%getitem_15 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_5, 1), kwargs = {})
%add_15 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_14, 1e-05), kwargs = {})
%rsqrt_5 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_15,), kwargs = {})
%sub_5 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_14, %getitem_15), kwargs = {})
%mul_13 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_5, %rsqrt_5), kwargs = {})
%_param_constant34 : [#users=1] = get_attr[target=_param_constant34]
%mul_14 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_13, %_param_constant34), kwargs = {})
%_param_constant35 : [#users=1] = get_attr[target=_param_constant35]
%add_16 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_14, %_param_constant35), kwargs = {})
%_to_copy_19 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_16,), kwargs = {dtype: torch.float16})
%_param_constant36 : [#users=1] = get_attr[target=_param_constant36]
%t_12 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant36,), kwargs = {})
%view_25 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_19, [18432, 320]), kwargs = {})
%_param_constant37 : [#users=1] = get_attr[target=_param_constant37]
%addmm_5 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant37, %view_25, %t_12), kwargs = {})
%view_26 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_5, [2, 9216, 2560]), kwargs = {})
%slice_9 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_26, -1, 0, 1280), kwargs = {})
%slice_10 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_26, -1, 1280, 2560), kwargs = {})
%gelu : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_10,), kwargs = {})
%mul_15 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_9, %gelu), kwargs = {})
%_param_constant38 : [#users=1] = get_attr[target=_param_constant38]
%t_13 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant38,), kwargs = {})
%view_27 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_15, [18432, 1280]), kwargs = {})
%_param_constant39 : [#users=1] = get_attr[target=_param_constant39]
%addmm_6 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant39, %view_27, %t_13), kwargs = {})
%view_28 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_6, [2, 9216, 320]), kwargs = {})
%add_17 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_28, %add_14), kwargs = {})
%_param_constant40 : [#users=1] = get_attr[target=_param_constant40]
%t_14 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant40,), kwargs = {})
%view_29 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_17, [18432, 320]), kwargs = {})
%_param_constant41 : [#users=1] = get_attr[target=_param_constant41]
%addmm_7 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant41, %view_29, %t_14), kwargs = {})
%view_30 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_7, [2, 9216, 320]), kwargs = {})
%view_31 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_30, [2, 96, 96, 320]), kwargs = {})
%permute_1 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_31, [0, 3, 1, 2]), kwargs = {})
%clone_1 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format})
%add_18 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_1, %div_1), kwargs = {})
%view_32 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_18, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_20 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_32,), kwargs = {dtype: torch.float32})
%var_mean_6 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_20, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_16 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_6, 0), kwargs = {})
%getitem_17 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_6, 1), kwargs = {})
%add_19 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_16, 1e-05), kwargs = {})
%rsqrt_6 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_19,), kwargs = {})
%sub_6 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_32, %getitem_17), kwargs = {})
%mul_16 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_6, %rsqrt_6), kwargs = {})
%view_33 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_16, [2, 320, 96, 96]), kwargs = {})
%_param_constant42 : [#users=1] = get_attr[target=_param_constant42]
%unsqueeze_22 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant42, 0), kwargs = {})
%unsqueeze_23 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_22, 2), kwargs = {})
%unsqueeze_24 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_23, 3), kwargs = {})
%_param_constant43 : [#users=1] = get_attr[target=_param_constant43]
%unsqueeze_25 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant43, 0), kwargs = {})
%unsqueeze_26 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_25, 2), kwargs = {})
%unsqueeze_27 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_26, 3), kwargs = {})
%mul_17 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_33, %unsqueeze_27), kwargs = {})
%add_20 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_17, %unsqueeze_24), kwargs = {})
%_to_copy_21 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_20,), kwargs = {dtype: torch.float16})
%_to_copy_22 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_17,), kwargs = {dtype: torch.float16})
%_to_copy_23 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_6,), kwargs = {dtype: torch.float16})
%squeeze_6 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_22, [2, 3]), kwargs = {})
%squeeze_7 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_23, [2, 3]), kwargs = {})
%detach_8 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_6,), kwargs = {})
%detach_9 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_7,), kwargs = {})
%silu_4 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_21,), kwargs = {})
%_param_constant44 : [#users=1] = get_attr[target=_param_constant44]
%_param_constant45 : [#users=1] = get_attr[target=_param_constant45]
%convolution_3 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_4, %_param_constant44, %_param_constant45, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_5 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant46 : [#users=1] = get_attr[target=_param_constant46]
%t_15 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant46,), kwargs = {})
%_param_constant47 : [#users=1] = get_attr[target=_param_constant47]
%addmm_8 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant47, %silu_5, %t_15), kwargs = {})
%slice_11 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_8, 0, 0, 9223372036854775807), kwargs = {})
%slice_12 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_11, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_28 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_12, 2), kwargs = {})
%unsqueeze_29 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_28, 3), kwargs = {})
%add_21 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_3, %unsqueeze_29), kwargs = {})
%view_34 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_21, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_24 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_34,), kwargs = {dtype: torch.float32})
%var_mean_7 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_24, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_18 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_7, 0), kwargs = {})
%getitem_19 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_7, 1), kwargs = {})
%add_22 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_18, 1e-05), kwargs = {})
%rsqrt_7 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_22,), kwargs = {})
%sub_7 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_34, %getitem_19), kwargs = {})
%mul_18 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_7, %rsqrt_7), kwargs = {})
%view_35 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_18, [2, 320, 96, 96]), kwargs = {})
%_param_constant48 : [#users=1] = get_attr[target=_param_constant48]
%unsqueeze_30 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant48, 0), kwargs = {})
%unsqueeze_31 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_30, 2), kwargs = {})
%unsqueeze_32 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_31, 3), kwargs = {})
%_param_constant49 : [#users=1] = get_attr[target=_param_constant49]
%unsqueeze_33 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant49, 0), kwargs = {})
%unsqueeze_34 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_33, 2), kwargs = {})
%unsqueeze_35 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_34, 3), kwargs = {})
%mul_19 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_35, %unsqueeze_35), kwargs = {})
%add_23 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_19, %unsqueeze_32), kwargs = {})
%_to_copy_25 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_23,), kwargs = {dtype: torch.float16})
%_to_copy_26 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_19,), kwargs = {dtype: torch.float16})
%_to_copy_27 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_7,), kwargs = {dtype: torch.float16})
%squeeze_8 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_26, [2, 3]), kwargs = {})
%squeeze_9 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_27, [2, 3]), kwargs = {})
%detach_10 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_8,), kwargs = {})
%detach_11 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_9,), kwargs = {})
%silu_6 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_25,), kwargs = {})
%_param_constant50 : [#users=1] = get_attr[target=_param_constant50]
%_param_constant51 : [#users=1] = get_attr[target=_param_constant51]
%convolution_4 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_6, %_param_constant50, %_param_constant51, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_24 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_18, %convolution_4), kwargs = {})
%div_2 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_24, 1.0), kwargs = {})
%view_36 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_2, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_28 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_36,), kwargs = {dtype: torch.float32})
%var_mean_8 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_28, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_20 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_8, 0), kwargs = {})
%getitem_21 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_8, 1), kwargs = {})
%add_25 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_20, 1e-06), kwargs = {})
%rsqrt_8 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_25,), kwargs = {})
%sub_8 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_36, %getitem_21), kwargs = {})
%mul_20 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_8, %rsqrt_8), kwargs = {})
%view_37 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_20, [2, 320, 96, 96]), kwargs = {})
%_param_constant52 : [#users=1] = get_attr[target=_param_constant52]
%unsqueeze_36 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant52, 0), kwargs = {})
%unsqueeze_37 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_36, 2), kwargs = {})
%unsqueeze_38 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_37, 3), kwargs = {})
%_param_constant53 : [#users=1] = get_attr[target=_param_constant53]
%unsqueeze_39 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant53, 0), kwargs = {})
%unsqueeze_40 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_39, 2), kwargs = {})
%unsqueeze_41 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_40, 3), kwargs = {})
%mul_21 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_37, %unsqueeze_41), kwargs = {})
%add_26 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_21, %unsqueeze_38), kwargs = {})
%_to_copy_29 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_26,), kwargs = {dtype: torch.float16})
%_to_copy_30 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_21,), kwargs = {dtype: torch.float16})
%_to_copy_31 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_8,), kwargs = {dtype: torch.float16})
%squeeze_10 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_30, [2, 3]), kwargs = {})
%squeeze_11 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_31, [2, 3]), kwargs = {})
%detach_12 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_10,), kwargs = {})
%detach_13 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_11,), kwargs = {})
%permute_2 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_29, [0, 2, 3, 1]), kwargs = {})
%view_38 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_2, [2, 9216, 320]), kwargs = {})
%_param_constant54 : [#users=1] = get_attr[target=_param_constant54]
%t_16 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant54,), kwargs = {})
%clone_2 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_38,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_8 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_2, [18432, 320]), kwargs = {})
%mm_7 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_8, %t_16), kwargs = {})
%_unsafe_view_9 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_7, [2, 9216, 320]), kwargs = {})
%_param_constant55 : [#users=1] = get_attr[target=_param_constant55]
%add_27 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_9, %_param_constant55), kwargs = {})
%_to_copy_32 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_27,), kwargs = {dtype: torch.float32})
%var_mean_9 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_32, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_22 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_9, 0), kwargs = {})
%getitem_23 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_9, 1), kwargs = {})
%add_28 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_22, 1e-05), kwargs = {})
%rsqrt_9 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_28,), kwargs = {})
%sub_9 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_27, %getitem_23), kwargs = {})
%mul_22 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_9, %rsqrt_9), kwargs = {})
%_param_constant56 : [#users=1] = get_attr[target=_param_constant56]
%mul_23 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_22, %_param_constant56), kwargs = {})
%_param_constant57 : [#users=1] = get_attr[target=_param_constant57]
%add_29 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_23, %_param_constant57), kwargs = {})
%_to_copy_33 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_29,), kwargs = {dtype: torch.float16})
%_param_constant58 : [#users=1] = get_attr[target=_param_constant58]
%t_17 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant58,), kwargs = {})
%view_39 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_33, [18432, 320]), kwargs = {})
%mm_8 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_39, %t_17), kwargs = {})
%_unsafe_view_10 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_8, [2, 9216, 320]), kwargs = {})
%_param_constant59 : [#users=1] = get_attr[target=_param_constant59]
%t_18 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant59,), kwargs = {})
%view_40 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_33, [18432, 320]), kwargs = {})
%mm_9 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_40, %t_18), kwargs = {})
%_unsafe_view_11 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_9, [2, 9216, 320]), kwargs = {})
%_param_constant60 : [#users=1] = get_attr[target=_param_constant60]
%t_19 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant60,), kwargs = {})
%view_41 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_33, [18432, 320]), kwargs = {})
%mm_10 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_41, %t_19), kwargs = {})
%_unsafe_view_12 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_10, [2, 9216, 320]), kwargs = {})
%view_42 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_10, [2, -1, 5, 64]), kwargs = {})
%transpose_8 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_42, 1, 2), kwargs = {})
%view_43 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_11, [2, -1, 5, 64]), kwargs = {})
%transpose_9 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_43, 1, 2), kwargs = {})
%view_44 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_12, [2, -1, 5, 64]), kwargs = {})
%transpose_10 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_44, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_2 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_8, %transpose_9, %transpose_10, True), kwargs = {})
%getitem_24 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_2, 0), kwargs = {})
%getitem_25 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_2, 1), kwargs = {})
%detach_14 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_24,), kwargs = {})
%transpose_11 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_24, 1, 2), kwargs = {})
%view_45 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_11, [2, -1, 320]), kwargs = {})
%_param_constant61 : [#users=1] = get_attr[target=_param_constant61]
%t_20 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant61,), kwargs = {})
%view_46 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_45, [18432, 320]), kwargs = {})
%_param_constant62 : [#users=1] = get_attr[target=_param_constant62]
%addmm_9 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant62, %view_46, %t_20), kwargs = {})
%view_47 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_9, [2, 9216, 320]), kwargs = {})
%add_30 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_47, %add_27), kwargs = {})
%_to_copy_34 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_30,), kwargs = {dtype: torch.float32})
%var_mean_10 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_34, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_26 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_10, 0), kwargs = {})
%getitem_27 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_10, 1), kwargs = {})
%add_31 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_26, 1e-05), kwargs = {})
%rsqrt_10 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_31,), kwargs = {})
%sub_10 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_30, %getitem_27), kwargs = {})
%mul_24 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_10, %rsqrt_10), kwargs = {})
%_param_constant63 : [#users=1] = get_attr[target=_param_constant63]
%mul_25 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_24, %_param_constant63), kwargs = {})
%_param_constant64 : [#users=1] = get_attr[target=_param_constant64]
%add_32 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_25, %_param_constant64), kwargs = {})
%_to_copy_35 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_32,), kwargs = {dtype: torch.float16})
%_param_constant65 : [#users=1] = get_attr[target=_param_constant65]
%t_21 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant65,), kwargs = {})
%view_48 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_35, [18432, 320]), kwargs = {})
%mm_11 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_48, %t_21), kwargs = {})
%_unsafe_view_13 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_11, [2, 9216, 320]), kwargs = {})
%_param_constant66 : [#users=1] = get_attr[target=_param_constant66]
%t_22 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant66,), kwargs = {})
%view_49 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_12 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_49, %t_22), kwargs = {})
%_unsafe_view_14 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_12, [2, 77, 320]), kwargs = {})
%_param_constant67 : [#users=1] = get_attr[target=_param_constant67]
%t_23 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant67,), kwargs = {})
%view_50 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_13 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_50, %t_23), kwargs = {})
%_unsafe_view_15 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_13, [2, 77, 320]), kwargs = {})
%view_51 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_13, [2, -1, 5, 64]), kwargs = {})
%transpose_12 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_51, 1, 2), kwargs = {})
%view_52 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_14, [2, -1, 5, 64]), kwargs = {})
%transpose_13 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_52, 1, 2), kwargs = {})
%view_53 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_15, [2, -1, 5, 64]), kwargs = {})
%transpose_14 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_53, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_3 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_12, %transpose_13, %transpose_14, True), kwargs = {})
%getitem_28 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_3, 0), kwargs = {})
%getitem_29 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_3, 1), kwargs = {})
%detach_15 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_28,), kwargs = {})
%transpose_15 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_28, 1, 2), kwargs = {})
%view_54 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_15, [2, -1, 320]), kwargs = {})
%_param_constant68 : [#users=1] = get_attr[target=_param_constant68]
%t_24 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant68,), kwargs = {})
%view_55 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_54, [18432, 320]), kwargs = {})
%_param_constant69 : [#users=1] = get_attr[target=_param_constant69]
%addmm_10 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant69, %view_55, %t_24), kwargs = {})
%view_56 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_10, [2, 9216, 320]), kwargs = {})
%add_33 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_56, %add_30), kwargs = {})
%_to_copy_36 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_33,), kwargs = {dtype: torch.float32})
%var_mean_11 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_36, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_30 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_11, 0), kwargs = {})
%getitem_31 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_11, 1), kwargs = {})
%add_34 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_30, 1e-05), kwargs = {})
%rsqrt_11 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_34,), kwargs = {})
%sub_11 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_33, %getitem_31), kwargs = {})
%mul_26 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_11, %rsqrt_11), kwargs = {})
%_param_constant70 : [#users=1] = get_attr[target=_param_constant70]
%mul_27 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_26, %_param_constant70), kwargs = {})
%_param_constant71 : [#users=1] = get_attr[target=_param_constant71]
%add_35 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_27, %_param_constant71), kwargs = {})
%_to_copy_37 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_35,), kwargs = {dtype: torch.float16})
%_param_constant72 : [#users=1] = get_attr[target=_param_constant72]
%t_25 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant72,), kwargs = {})
%view_57 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_37, [18432, 320]), kwargs = {})
%_param_constant73 : [#users=1] = get_attr[target=_param_constant73]
%addmm_11 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant73, %view_57, %t_25), kwargs = {})
%view_58 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_11, [2, 9216, 2560]), kwargs = {})
%slice_13 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_58, -1, 0, 1280), kwargs = {})
%slice_14 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_58, -1, 1280, 2560), kwargs = {})
%gelu_1 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_14,), kwargs = {})
%mul_28 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_13, %gelu_1), kwargs = {})
%_param_constant74 : [#users=1] = get_attr[target=_param_constant74]
%t_26 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant74,), kwargs = {})
%view_59 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_28, [18432, 1280]), kwargs = {})
%_param_constant75 : [#users=1] = get_attr[target=_param_constant75]
%addmm_12 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant75, %view_59, %t_26), kwargs = {})
%view_60 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_12, [2, 9216, 320]), kwargs = {})
%add_36 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_60, %add_33), kwargs = {})
%_param_constant76 : [#users=1] = get_attr[target=_param_constant76]
%t_27 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant76,), kwargs = {})
%view_61 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_36, [18432, 320]), kwargs = {})
%_param_constant77 : [#users=1] = get_attr[target=_param_constant77]
%addmm_13 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant77, %view_61, %t_27), kwargs = {})
%view_62 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_13, [2, 9216, 320]), kwargs = {})
%view_63 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_62, [2, 96, 96, 320]), kwargs = {})
%permute_3 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_63, [0, 3, 1, 2]), kwargs = {})
%clone_3 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format})
%add_37 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_3, %div_2), kwargs = {})
%_param_constant78 : [#users=1] = get_attr[target=_param_constant78]
%_param_constant79 : [#users=1] = get_attr[target=_param_constant79]
%convolution_5 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%add_37, %_param_constant78, %_param_constant79, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%view_64 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_5, [2, 32, 10, 2304]), kwargs = {})
%_to_copy_38 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_64,), kwargs = {dtype: torch.float32})
%var_mean_12 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_38, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_32 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_12, 0), kwargs = {})
%getitem_33 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_12, 1), kwargs = {})
%add_38 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_32, 1e-05), kwargs = {})
%rsqrt_12 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_38,), kwargs = {})
%sub_12 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_64, %getitem_33), kwargs = {})
%mul_29 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_12, %rsqrt_12), kwargs = {})
%view_65 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_29, [2, 320, 48, 48]), kwargs = {})
%_param_constant80 : [#users=1] = get_attr[target=_param_constant80]
%unsqueeze_42 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant80, 0), kwargs = {})
%unsqueeze_43 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_42, 2), kwargs = {})
%unsqueeze_44 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_43, 3), kwargs = {})
%_param_constant81 : [#users=1] = get_attr[target=_param_constant81]
%unsqueeze_45 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant81, 0), kwargs = {})
%unsqueeze_46 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_45, 2), kwargs = {})
%unsqueeze_47 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_46, 3), kwargs = {})
%mul_30 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_65, %unsqueeze_47), kwargs = {})
%add_39 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_30, %unsqueeze_44), kwargs = {})
%_to_copy_39 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_39,), kwargs = {dtype: torch.float16})
%_to_copy_40 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_33,), kwargs = {dtype: torch.float16})
%_to_copy_41 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_12,), kwargs = {dtype: torch.float16})
%squeeze_12 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_40, [2, 3]), kwargs = {})
%squeeze_13 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_41, [2, 3]), kwargs = {})
%detach_16 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_12,), kwargs = {})
%detach_17 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_13,), kwargs = {})
%silu_7 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_39,), kwargs = {})
%_param_constant82 : [#users=1] = get_attr[target=_param_constant82]
%_param_constant83 : [#users=1] = get_attr[target=_param_constant83]
%convolution_6 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_7, %_param_constant82, %_param_constant83, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_8 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant84 : [#users=1] = get_attr[target=_param_constant84]
%t_28 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant84,), kwargs = {})
%_param_constant85 : [#users=1] = get_attr[target=_param_constant85]
%addmm_14 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant85, %silu_8, %t_28), kwargs = {})
%slice_15 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_14, 0, 0, 9223372036854775807), kwargs = {})
%slice_16 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_15, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_48 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_16, 2), kwargs = {})
%unsqueeze_49 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_48, 3), kwargs = {})
%add_40 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_6, %unsqueeze_49), kwargs = {})
%view_66 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_40, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_42 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_66,), kwargs = {dtype: torch.float32})
%var_mean_13 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_42, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_34 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_13, 0), kwargs = {})
%getitem_35 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_13, 1), kwargs = {})
%add_41 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_34, 1e-05), kwargs = {})
%rsqrt_13 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_41,), kwargs = {})
%sub_13 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_66, %getitem_35), kwargs = {})
%mul_31 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_13, %rsqrt_13), kwargs = {})
%view_67 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_31, [2, 640, 48, 48]), kwargs = {})
%_param_constant86 : [#users=1] = get_attr[target=_param_constant86]
%unsqueeze_50 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant86, 0), kwargs = {})
%unsqueeze_51 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_50, 2), kwargs = {})
%unsqueeze_52 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_51, 3), kwargs = {})
%_param_constant87 : [#users=1] = get_attr[target=_param_constant87]
%unsqueeze_53 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant87, 0), kwargs = {})
%unsqueeze_54 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_53, 2), kwargs = {})
%unsqueeze_55 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_54, 3), kwargs = {})
%mul_32 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_67, %unsqueeze_55), kwargs = {})
%add_42 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_32, %unsqueeze_52), kwargs = {})
%_to_copy_43 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_42,), kwargs = {dtype: torch.float16})
%_to_copy_44 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_35,), kwargs = {dtype: torch.float16})
%_to_copy_45 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_13,), kwargs = {dtype: torch.float16})
%squeeze_14 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_44, [2, 3]), kwargs = {})
%squeeze_15 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_45, [2, 3]), kwargs = {})
%detach_18 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_14,), kwargs = {})
%detach_19 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_15,), kwargs = {})
%silu_9 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_43,), kwargs = {})
%_param_constant88 : [#users=1] = get_attr[target=_param_constant88]
%_param_constant89 : [#users=1] = get_attr[target=_param_constant89]
%convolution_7 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_9, %_param_constant88, %_param_constant89, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant90 : [#users=1] = get_attr[target=_param_constant90]
%_param_constant91 : [#users=1] = get_attr[target=_param_constant91]
%convolution_8 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%convolution_5, %_param_constant90, %_param_constant91, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_43 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_8, %convolution_7), kwargs = {})
%div_3 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_43, 1.0), kwargs = {})
%view_68 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_3, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_46 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_68,), kwargs = {dtype: torch.float32})
%var_mean_14 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_46, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_36 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_14, 0), kwargs = {})
%getitem_37 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_14, 1), kwargs = {})
%add_44 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_36, 1e-06), kwargs = {})
%rsqrt_14 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_44,), kwargs = {})
%sub_14 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_68, %getitem_37), kwargs = {})
%mul_33 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_14, %rsqrt_14), kwargs = {})
%view_69 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_33, [2, 640, 48, 48]), kwargs = {})
%_param_constant92 : [#users=1] = get_attr[target=_param_constant92]
%unsqueeze_56 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant92, 0), kwargs = {})
%unsqueeze_57 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_56, 2), kwargs = {})
%unsqueeze_58 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_57, 3), kwargs = {})
%_param_constant93 : [#users=1] = get_attr[target=_param_constant93]
%unsqueeze_59 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant93, 0), kwargs = {})
%unsqueeze_60 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_59, 2), kwargs = {})
%unsqueeze_61 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_60, 3), kwargs = {})
%mul_34 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_69, %unsqueeze_61), kwargs = {})
%add_45 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_34, %unsqueeze_58), kwargs = {})
%_to_copy_47 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_45,), kwargs = {dtype: torch.float16})
%_to_copy_48 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_37,), kwargs = {dtype: torch.float16})
%_to_copy_49 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_14,), kwargs = {dtype: torch.float16})
%squeeze_16 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_48, [2, 3]), kwargs = {})
%squeeze_17 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_49, [2, 3]), kwargs = {})
%detach_20 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_16,), kwargs = {})
%detach_21 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_17,), kwargs = {})
%permute_4 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_47, [0, 2, 3, 1]), kwargs = {})
%view_70 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_4, [2, 2304, 640]), kwargs = {})
%_param_constant94 : [#users=1] = get_attr[target=_param_constant94]
%t_29 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant94,), kwargs = {})
%clone_4 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_70,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_16 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_4, [4608, 640]), kwargs = {})
%mm_14 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_16, %t_29), kwargs = {})
%_unsafe_view_17 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_14, [2, 2304, 640]), kwargs = {})
%_param_constant95 : [#users=1] = get_attr[target=_param_constant95]
%add_46 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_17, %_param_constant95), kwargs = {})
%_to_copy_50 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_46,), kwargs = {dtype: torch.float32})
%var_mean_15 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_50, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_38 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_15, 0), kwargs = {})
%getitem_39 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_15, 1), kwargs = {})
%add_47 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_38, 1e-05), kwargs = {})
%rsqrt_15 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_47,), kwargs = {})
%sub_15 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_46, %getitem_39), kwargs = {})
%mul_35 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_15, %rsqrt_15), kwargs = {})
%_param_constant96 : [#users=1] = get_attr[target=_param_constant96]
%mul_36 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_35, %_param_constant96), kwargs = {})
%_param_constant97 : [#users=1] = get_attr[target=_param_constant97]
%add_48 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_36, %_param_constant97), kwargs = {})
%_to_copy_51 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_48,), kwargs = {dtype: torch.float16})
%_param_constant98 : [#users=1] = get_attr[target=_param_constant98]
%t_30 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant98,), kwargs = {})
%view_71 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_51, [4608, 640]), kwargs = {})
%mm_15 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_71, %t_30), kwargs = {})
%_unsafe_view_18 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_15, [2, 2304, 640]), kwargs = {})
%_param_constant99 : [#users=1] = get_attr[target=_param_constant99]
%t_31 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant99,), kwargs = {})
%view_72 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_51, [4608, 640]), kwargs = {})
%mm_16 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_72, %t_31), kwargs = {})
%_unsafe_view_19 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_16, [2, 2304, 640]), kwargs = {})
%_param_constant100 : [#users=1] = get_attr[target=_param_constant100]
%t_32 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant100,), kwargs = {})
%view_73 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_51, [4608, 640]), kwargs = {})
%mm_17 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_73, %t_32), kwargs = {})
%_unsafe_view_20 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_17, [2, 2304, 640]), kwargs = {})
%view_74 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_18, [2, -1, 10, 64]), kwargs = {})
%transpose_16 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_74, 1, 2), kwargs = {})
%view_75 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_19, [2, -1, 10, 64]), kwargs = {})
%transpose_17 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_75, 1, 2), kwargs = {})
%view_76 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_20, [2, -1, 10, 64]), kwargs = {})
%transpose_18 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_76, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_4 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_16, %transpose_17, %transpose_18, True), kwargs = {})
%getitem_40 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_4, 0), kwargs = {})
%getitem_41 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_4, 1), kwargs = {})
%detach_22 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_40,), kwargs = {})
%transpose_19 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_40, 1, 2), kwargs = {})
%view_77 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_19, [2, -1, 640]), kwargs = {})
%_param_constant101 : [#users=1] = get_attr[target=_param_constant101]
%t_33 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant101,), kwargs = {})
%view_78 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_77, [4608, 640]), kwargs = {})
%_param_constant102 : [#users=1] = get_attr[target=_param_constant102]
%addmm_15 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant102, %view_78, %t_33), kwargs = {})
%view_79 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_15, [2, 2304, 640]), kwargs = {})
%add_49 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_79, %add_46), kwargs = {})
%_to_copy_52 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_49,), kwargs = {dtype: torch.float32})
%var_mean_16 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_52, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_42 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_16, 0), kwargs = {})
%getitem_43 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_16, 1), kwargs = {})
%add_50 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_42, 1e-05), kwargs = {})
%rsqrt_16 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_50,), kwargs = {})
%sub_16 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_49, %getitem_43), kwargs = {})
%mul_37 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_16, %rsqrt_16), kwargs = {})
%_param_constant103 : [#users=1] = get_attr[target=_param_constant103]
%mul_38 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_37, %_param_constant103), kwargs = {})
%_param_constant104 : [#users=1] = get_attr[target=_param_constant104]
%add_51 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_38, %_param_constant104), kwargs = {})
%_to_copy_53 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_51,), kwargs = {dtype: torch.float16})
%_param_constant105 : [#users=1] = get_attr[target=_param_constant105]
%t_34 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant105,), kwargs = {})
%view_80 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_53, [4608, 640]), kwargs = {})
%mm_18 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_80, %t_34), kwargs = {})
%_unsafe_view_21 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_18, [2, 2304, 640]), kwargs = {})
%_param_constant106 : [#users=1] = get_attr[target=_param_constant106]
%t_35 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant106,), kwargs = {})
%view_81 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_19 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_81, %t_35), kwargs = {})
%_unsafe_view_22 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_19, [2, 77, 640]), kwargs = {})
%_param_constant107 : [#users=1] = get_attr[target=_param_constant107]
%t_36 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant107,), kwargs = {})
%view_82 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_20 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_82, %t_36), kwargs = {})
%_unsafe_view_23 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_20, [2, 77, 640]), kwargs = {})
%view_83 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_21, [2, -1, 10, 64]), kwargs = {})
%transpose_20 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_83, 1, 2), kwargs = {})
%view_84 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_22, [2, -1, 10, 64]), kwargs = {})
%transpose_21 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_84, 1, 2), kwargs = {})
%view_85 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_23, [2, -1, 10, 64]), kwargs = {})
%transpose_22 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_85, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_5 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_20, %transpose_21, %transpose_22, True), kwargs = {})
%getitem_44 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_5, 0), kwargs = {})
%getitem_45 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_5, 1), kwargs = {})
%detach_23 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_44,), kwargs = {})
%transpose_23 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_44, 1, 2), kwargs = {})
%view_86 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_23, [2, -1, 640]), kwargs = {})
%_param_constant108 : [#users=1] = get_attr[target=_param_constant108]
%t_37 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant108,), kwargs = {})
%view_87 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_86, [4608, 640]), kwargs = {})
%_param_constant109 : [#users=1] = get_attr[target=_param_constant109]
%addmm_16 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant109, %view_87, %t_37), kwargs = {})
%view_88 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_16, [2, 2304, 640]), kwargs = {})
%add_52 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_88, %add_49), kwargs = {})
%_to_copy_54 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_52,), kwargs = {dtype: torch.float32})
%var_mean_17 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_54, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_46 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_17, 0), kwargs = {})
%getitem_47 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_17, 1), kwargs = {})
%add_53 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_46, 1e-05), kwargs = {})
%rsqrt_17 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_53,), kwargs = {})
%sub_17 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_52, %getitem_47), kwargs = {})
%mul_39 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_17, %rsqrt_17), kwargs = {})
%_param_constant110 : [#users=1] = get_attr[target=_param_constant110]
%mul_40 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_39, %_param_constant110), kwargs = {})
%_param_constant111 : [#users=1] = get_attr[target=_param_constant111]
%add_54 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_40, %_param_constant111), kwargs = {})
%_to_copy_55 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_54,), kwargs = {dtype: torch.float16})
%_param_constant112 : [#users=1] = get_attr[target=_param_constant112]
%t_38 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant112,), kwargs = {})
%view_89 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_55, [4608, 640]), kwargs = {})
%_param_constant113 : [#users=1] = get_attr[target=_param_constant113]
%addmm_17 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant113, %view_89, %t_38), kwargs = {})
%view_90 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_17, [2, 2304, 5120]), kwargs = {})
%slice_17 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_90, -1, 0, 2560), kwargs = {})
%slice_18 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_90, -1, 2560, 5120), kwargs = {})
%gelu_2 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_18,), kwargs = {})
%mul_41 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_17, %gelu_2), kwargs = {})
%_param_constant114 : [#users=1] = get_attr[target=_param_constant114]
%t_39 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant114,), kwargs = {})
%view_91 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_41, [4608, 2560]), kwargs = {})
%_param_constant115 : [#users=1] = get_attr[target=_param_constant115]
%addmm_18 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant115, %view_91, %t_39), kwargs = {})
%view_92 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_18, [2, 2304, 640]), kwargs = {})
%add_55 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_92, %add_52), kwargs = {})
%_param_constant116 : [#users=1] = get_attr[target=_param_constant116]
%t_40 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant116,), kwargs = {})
%view_93 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_55, [4608, 640]), kwargs = {})
%_param_constant117 : [#users=1] = get_attr[target=_param_constant117]
%addmm_19 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant117, %view_93, %t_40), kwargs = {})
%view_94 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_19, [2, 2304, 640]), kwargs = {})
%view_95 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_94, [2, 48, 48, 640]), kwargs = {})
%permute_5 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_95, [0, 3, 1, 2]), kwargs = {})
%clone_5 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_5,), kwargs = {memory_format: torch.contiguous_format})
%add_56 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_5, %div_3), kwargs = {})
%view_96 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_56, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_56 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_96,), kwargs = {dtype: torch.float32})
%var_mean_18 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_56, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_48 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_18, 0), kwargs = {})
%getitem_49 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_18, 1), kwargs = {})
%add_57 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_48, 1e-05), kwargs = {})
%rsqrt_18 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_57,), kwargs = {})
%sub_18 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_96, %getitem_49), kwargs = {})
%mul_42 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_18, %rsqrt_18), kwargs = {})
%view_97 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_42, [2, 640, 48, 48]), kwargs = {})
%_param_constant118 : [#users=1] = get_attr[target=_param_constant118]
%unsqueeze_62 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant118, 0), kwargs = {})
%unsqueeze_63 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_62, 2), kwargs = {})
%unsqueeze_64 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_63, 3), kwargs = {})
%_param_constant119 : [#users=1] = get_attr[target=_param_constant119]
%unsqueeze_65 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant119, 0), kwargs = {})
%unsqueeze_66 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_65, 2), kwargs = {})
%unsqueeze_67 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_66, 3), kwargs = {})
%mul_43 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_97, %unsqueeze_67), kwargs = {})
%add_58 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_43, %unsqueeze_64), kwargs = {})
%_to_copy_57 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_58,), kwargs = {dtype: torch.float16})
%_to_copy_58 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_49,), kwargs = {dtype: torch.float16})
%_to_copy_59 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_18,), kwargs = {dtype: torch.float16})
%squeeze_18 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_58, [2, 3]), kwargs = {})
%squeeze_19 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_59, [2, 3]), kwargs = {})
%detach_24 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_18,), kwargs = {})
%detach_25 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_19,), kwargs = {})
%silu_10 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_57,), kwargs = {})
%_param_constant120 : [#users=1] = get_attr[target=_param_constant120]
%_param_constant121 : [#users=1] = get_attr[target=_param_constant121]
%convolution_9 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_10, %_param_constant120, %_param_constant121, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_11 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant122 : [#users=1] = get_attr[target=_param_constant122]
%t_41 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant122,), kwargs = {})
%_param_constant123 : [#users=1] = get_attr[target=_param_constant123]
%addmm_20 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant123, %silu_11, %t_41), kwargs = {})
%slice_19 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_20, 0, 0, 9223372036854775807), kwargs = {})
%slice_20 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_19, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_68 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_20, 2), kwargs = {})
%unsqueeze_69 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_68, 3), kwargs = {})
%add_59 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_9, %unsqueeze_69), kwargs = {})
%view_98 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_59, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_60 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_98,), kwargs = {dtype: torch.float32})
%var_mean_19 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_60, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_50 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_19, 0), kwargs = {})
%getitem_51 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_19, 1), kwargs = {})
%add_60 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_50, 1e-05), kwargs = {})
%rsqrt_19 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_60,), kwargs = {})
%sub_19 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_98, %getitem_51), kwargs = {})
%mul_44 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_19, %rsqrt_19), kwargs = {})
%view_99 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_44, [2, 640, 48, 48]), kwargs = {})
%_param_constant124 : [#users=1] = get_attr[target=_param_constant124]
%unsqueeze_70 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant124, 0), kwargs = {})
%unsqueeze_71 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_70, 2), kwargs = {})
%unsqueeze_72 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_71, 3), kwargs = {})
%_param_constant125 : [#users=1] = get_attr[target=_param_constant125]
%unsqueeze_73 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant125, 0), kwargs = {})
%unsqueeze_74 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_73, 2), kwargs = {})
%unsqueeze_75 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_74, 3), kwargs = {})
%mul_45 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_99, %unsqueeze_75), kwargs = {})
%add_61 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_45, %unsqueeze_72), kwargs = {})
%_to_copy_61 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_61,), kwargs = {dtype: torch.float16})
%_to_copy_62 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_51,), kwargs = {dtype: torch.float16})
%_to_copy_63 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_19,), kwargs = {dtype: torch.float16})
%squeeze_20 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_62, [2, 3]), kwargs = {})
%squeeze_21 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_63, [2, 3]), kwargs = {})
%detach_26 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_20,), kwargs = {})
%detach_27 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_21,), kwargs = {})
%silu_12 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_61,), kwargs = {})
%_param_constant126 : [#users=1] = get_attr[target=_param_constant126]
%_param_constant127 : [#users=1] = get_attr[target=_param_constant127]
%convolution_10 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_12, %_param_constant126, %_param_constant127, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_62 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_56, %convolution_10), kwargs = {})
%div_4 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_62, 1.0), kwargs = {})
%view_100 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_4, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_64 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_100,), kwargs = {dtype: torch.float32})
%var_mean_20 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_64, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_52 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_20, 0), kwargs = {})
%getitem_53 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_20, 1), kwargs = {})
%add_63 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_52, 1e-06), kwargs = {})
%rsqrt_20 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_63,), kwargs = {})
%sub_20 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_100, %getitem_53), kwargs = {})
%mul_46 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_20, %rsqrt_20), kwargs = {})
%view_101 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_46, [2, 640, 48, 48]), kwargs = {})
%_param_constant128 : [#users=1] = get_attr[target=_param_constant128]
%unsqueeze_76 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant128, 0), kwargs = {})
%unsqueeze_77 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_76, 2), kwargs = {})
%unsqueeze_78 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_77, 3), kwargs = {})
%_param_constant129 : [#users=1] = get_attr[target=_param_constant129]
%unsqueeze_79 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant129, 0), kwargs = {})
%unsqueeze_80 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_79, 2), kwargs = {})
%unsqueeze_81 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_80, 3), kwargs = {})
%mul_47 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_101, %unsqueeze_81), kwargs = {})
%add_64 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_47, %unsqueeze_78), kwargs = {})
%_to_copy_65 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_64,), kwargs = {dtype: torch.float16})
%_to_copy_66 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_53,), kwargs = {dtype: torch.float16})
%_to_copy_67 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_20,), kwargs = {dtype: torch.float16})
%squeeze_22 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_66, [2, 3]), kwargs = {})
%squeeze_23 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_67, [2, 3]), kwargs = {})
%detach_28 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_22,), kwargs = {})
%detach_29 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_23,), kwargs = {})
%permute_6 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_65, [0, 2, 3, 1]), kwargs = {})
%view_102 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_6, [2, 2304, 640]), kwargs = {})
%_param_constant130 : [#users=1] = get_attr[target=_param_constant130]
%t_42 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant130,), kwargs = {})
%clone_6 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_102,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_24 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_6, [4608, 640]), kwargs = {})
%mm_21 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_24, %t_42), kwargs = {})
%_unsafe_view_25 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_21, [2, 2304, 640]), kwargs = {})
%_param_constant131 : [#users=1] = get_attr[target=_param_constant131]
%add_65 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_25, %_param_constant131), kwargs = {})
%_to_copy_68 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_65,), kwargs = {dtype: torch.float32})
%var_mean_21 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_68, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_54 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_21, 0), kwargs = {})
%getitem_55 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_21, 1), kwargs = {})
%add_66 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_54, 1e-05), kwargs = {})
%rsqrt_21 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_66,), kwargs = {})
%sub_21 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_65, %getitem_55), kwargs = {})
%mul_48 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_21, %rsqrt_21), kwargs = {})
%_param_constant132 : [#users=1] = get_attr[target=_param_constant132]
%mul_49 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_48, %_param_constant132), kwargs = {})
%_param_constant133 : [#users=1] = get_attr[target=_param_constant133]
%add_67 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_49, %_param_constant133), kwargs = {})
%_to_copy_69 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_67,), kwargs = {dtype: torch.float16})
%_param_constant134 : [#users=1] = get_attr[target=_param_constant134]
%t_43 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant134,), kwargs = {})
%view_103 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_69, [4608, 640]), kwargs = {})
%mm_22 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_103, %t_43), kwargs = {})
%_unsafe_view_26 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_22, [2, 2304, 640]), kwargs = {})
%_param_constant135 : [#users=1] = get_attr[target=_param_constant135]
%t_44 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant135,), kwargs = {})
%view_104 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_69, [4608, 640]), kwargs = {})
%mm_23 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_104, %t_44), kwargs = {})
%_unsafe_view_27 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_23, [2, 2304, 640]), kwargs = {})
%_param_constant136 : [#users=1] = get_attr[target=_param_constant136]
%t_45 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant136,), kwargs = {})
%view_105 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_69, [4608, 640]), kwargs = {})
%mm_24 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_105, %t_45), kwargs = {})
%_unsafe_view_28 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_24, [2, 2304, 640]), kwargs = {})
%view_106 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_26, [2, -1, 10, 64]), kwargs = {})
%transpose_24 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_106, 1, 2), kwargs = {})
%view_107 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_27, [2, -1, 10, 64]), kwargs = {})
%transpose_25 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_107, 1, 2), kwargs = {})
%view_108 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_28, [2, -1, 10, 64]), kwargs = {})
%transpose_26 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_108, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_6 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_24, %transpose_25, %transpose_26, True), kwargs = {})
%getitem_56 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_6, 0), kwargs = {})
%getitem_57 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_6, 1), kwargs = {})
%detach_30 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_56,), kwargs = {})
%transpose_27 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_56, 1, 2), kwargs = {})
%view_109 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_27, [2, -1, 640]), kwargs = {})
%_param_constant137 : [#users=1] = get_attr[target=_param_constant137]
%t_46 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant137,), kwargs = {})
%view_110 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_109, [4608, 640]), kwargs = {})
%_param_constant138 : [#users=1] = get_attr[target=_param_constant138]
%addmm_21 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant138, %view_110, %t_46), kwargs = {})
%view_111 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_21, [2, 2304, 640]), kwargs = {})
%add_68 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_111, %add_65), kwargs = {})
%_to_copy_70 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_68,), kwargs = {dtype: torch.float32})
%var_mean_22 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_70, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_58 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_22, 0), kwargs = {})
%getitem_59 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_22, 1), kwargs = {})
%add_69 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_58, 1e-05), kwargs = {})
%rsqrt_22 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_69,), kwargs = {})
%sub_22 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_68, %getitem_59), kwargs = {})
%mul_50 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_22, %rsqrt_22), kwargs = {})
%_param_constant139 : [#users=1] = get_attr[target=_param_constant139]
%mul_51 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_50, %_param_constant139), kwargs = {})
%_param_constant140 : [#users=1] = get_attr[target=_param_constant140]
%add_70 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_51, %_param_constant140), kwargs = {})
%_to_copy_71 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_70,), kwargs = {dtype: torch.float16})
%_param_constant141 : [#users=1] = get_attr[target=_param_constant141]
%t_47 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant141,), kwargs = {})
%view_112 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_71, [4608, 640]), kwargs = {})
%mm_25 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_112, %t_47), kwargs = {})
%_unsafe_view_29 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_25, [2, 2304, 640]), kwargs = {})
%_param_constant142 : [#users=1] = get_attr[target=_param_constant142]
%t_48 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant142,), kwargs = {})
%view_113 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_26 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_113, %t_48), kwargs = {})
%_unsafe_view_30 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_26, [2, 77, 640]), kwargs = {})
%_param_constant143 : [#users=1] = get_attr[target=_param_constant143]
%t_49 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant143,), kwargs = {})
%view_114 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_27 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_114, %t_49), kwargs = {})
%_unsafe_view_31 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_27, [2, 77, 640]), kwargs = {})
%view_115 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_29, [2, -1, 10, 64]), kwargs = {})
%transpose_28 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_115, 1, 2), kwargs = {})
%view_116 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_30, [2, -1, 10, 64]), kwargs = {})
%transpose_29 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_116, 1, 2), kwargs = {})
%view_117 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_31, [2, -1, 10, 64]), kwargs = {})
%transpose_30 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_117, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_7 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_28, %transpose_29, %transpose_30, True), kwargs = {})
%getitem_60 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_7, 0), kwargs = {})
%getitem_61 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_7, 1), kwargs = {})
%detach_31 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_60,), kwargs = {})
%transpose_31 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_60, 1, 2), kwargs = {})
%view_118 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_31, [2, -1, 640]), kwargs = {})
%_param_constant144 : [#users=1] = get_attr[target=_param_constant144]
%t_50 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant144,), kwargs = {})
%view_119 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_118, [4608, 640]), kwargs = {})
%_param_constant145 : [#users=1] = get_attr[target=_param_constant145]
%addmm_22 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant145, %view_119, %t_50), kwargs = {})
%view_120 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_22, [2, 2304, 640]), kwargs = {})
%add_71 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_120, %add_68), kwargs = {})
%_to_copy_72 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_71,), kwargs = {dtype: torch.float32})
%var_mean_23 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_72, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_62 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_23, 0), kwargs = {})
%getitem_63 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_23, 1), kwargs = {})
%add_72 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_62, 1e-05), kwargs = {})
%rsqrt_23 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_72,), kwargs = {})
%sub_23 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_71, %getitem_63), kwargs = {})
%mul_52 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_23, %rsqrt_23), kwargs = {})
%_param_constant146 : [#users=1] = get_attr[target=_param_constant146]
%mul_53 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_52, %_param_constant146), kwargs = {})
%_param_constant147 : [#users=1] = get_attr[target=_param_constant147]
%add_73 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_53, %_param_constant147), kwargs = {})
%_to_copy_73 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_73,), kwargs = {dtype: torch.float16})
%_param_constant148 : [#users=1] = get_attr[target=_param_constant148]
%t_51 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant148,), kwargs = {})
%view_121 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_73, [4608, 640]), kwargs = {})
%_param_constant149 : [#users=1] = get_attr[target=_param_constant149]
%addmm_23 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant149, %view_121, %t_51), kwargs = {})
%view_122 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_23, [2, 2304, 5120]), kwargs = {})
%slice_21 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_122, -1, 0, 2560), kwargs = {})
%slice_22 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_122, -1, 2560, 5120), kwargs = {})
%gelu_3 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_22,), kwargs = {})
%mul_54 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_21, %gelu_3), kwargs = {})
%_param_constant150 : [#users=1] = get_attr[target=_param_constant150]
%t_52 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant150,), kwargs = {})
%view_123 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_54, [4608, 2560]), kwargs = {})
%_param_constant151 : [#users=1] = get_attr[target=_param_constant151]
%addmm_24 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant151, %view_123, %t_52), kwargs = {})
%view_124 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_24, [2, 2304, 640]), kwargs = {})
%add_74 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_124, %add_71), kwargs = {})
%_param_constant152 : [#users=1] = get_attr[target=_param_constant152]
%t_53 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant152,), kwargs = {})
%view_125 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_74, [4608, 640]), kwargs = {})
%_param_constant153 : [#users=1] = get_attr[target=_param_constant153]
%addmm_25 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant153, %view_125, %t_53), kwargs = {})
%view_126 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_25, [2, 2304, 640]), kwargs = {})
%view_127 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_126, [2, 48, 48, 640]), kwargs = {})
%permute_7 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_127, [0, 3, 1, 2]), kwargs = {})
%clone_7 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_7,), kwargs = {memory_format: torch.contiguous_format})
%add_75 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_7, %div_4), kwargs = {})
%_param_constant154 : [#users=1] = get_attr[target=_param_constant154]
%_param_constant155 : [#users=1] = get_attr[target=_param_constant155]
%convolution_11 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%add_75, %_param_constant154, %_param_constant155, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%view_128 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_11, [2, 32, 20, 576]), kwargs = {})
%_to_copy_74 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_128,), kwargs = {dtype: torch.float32})
%var_mean_24 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_74, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_64 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_24, 0), kwargs = {})
%getitem_65 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_24, 1), kwargs = {})
%add_76 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_64, 1e-05), kwargs = {})
%rsqrt_24 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_76,), kwargs = {})
%sub_24 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_128, %getitem_65), kwargs = {})
%mul_55 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_24, %rsqrt_24), kwargs = {})
%view_129 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_55, [2, 640, 24, 24]), kwargs = {})
%_param_constant156 : [#users=1] = get_attr[target=_param_constant156]
%unsqueeze_82 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant156, 0), kwargs = {})
%unsqueeze_83 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_82, 2), kwargs = {})
%unsqueeze_84 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_83, 3), kwargs = {})
%_param_constant157 : [#users=1] = get_attr[target=_param_constant157]
%unsqueeze_85 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant157, 0), kwargs = {})
%unsqueeze_86 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_85, 2), kwargs = {})
%unsqueeze_87 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_86, 3), kwargs = {})
%mul_56 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_129, %unsqueeze_87), kwargs = {})
%add_77 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_56, %unsqueeze_84), kwargs = {})
%_to_copy_75 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_77,), kwargs = {dtype: torch.float16})
%_to_copy_76 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_65,), kwargs = {dtype: torch.float16})
%_to_copy_77 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_24,), kwargs = {dtype: torch.float16})
%squeeze_24 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_76, [2, 3]), kwargs = {})
%squeeze_25 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_77, [2, 3]), kwargs = {})
%detach_32 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_24,), kwargs = {})
%detach_33 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_25,), kwargs = {})
%silu_13 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_75,), kwargs = {})
%_param_constant158 : [#users=1] = get_attr[target=_param_constant158]
%_param_constant159 : [#users=1] = get_attr[target=_param_constant159]
%convolution_12 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_13, %_param_constant158, %_param_constant159, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_14 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant160 : [#users=1] = get_attr[target=_param_constant160]
%t_54 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant160,), kwargs = {})
%_param_constant161 : [#users=1] = get_attr[target=_param_constant161]
%addmm_26 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant161, %silu_14, %t_54), kwargs = {})
%slice_23 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_26, 0, 0, 9223372036854775807), kwargs = {})
%slice_24 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_23, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_88 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_24, 2), kwargs = {})
%unsqueeze_89 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_88, 3), kwargs = {})
%add_78 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_12, %unsqueeze_89), kwargs = {})
%view_130 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_78, [2, 32, 40, 576]), kwargs = {})
%_to_copy_78 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_130,), kwargs = {dtype: torch.float32})
%var_mean_25 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_78, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_66 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_25, 0), kwargs = {})
%getitem_67 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_25, 1), kwargs = {})
%add_79 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_66, 1e-05), kwargs = {})
%rsqrt_25 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_79,), kwargs = {})
%sub_25 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_130, %getitem_67), kwargs = {})
%mul_57 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_25, %rsqrt_25), kwargs = {})
%view_131 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_57, [2, 1280, 24, 24]), kwargs = {})
%_param_constant162 : [#users=1] = get_attr[target=_param_constant162]
%unsqueeze_90 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant162, 0), kwargs = {})
%unsqueeze_91 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_90, 2), kwargs = {})
%unsqueeze_92 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_91, 3), kwargs = {})
%_param_constant163 : [#users=1] = get_attr[target=_param_constant163]
%unsqueeze_93 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant163, 0), kwargs = {})
%unsqueeze_94 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_93, 2), kwargs = {})
%unsqueeze_95 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_94, 3), kwargs = {})
%mul_58 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_131, %unsqueeze_95), kwargs = {})
%add_80 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_58, %unsqueeze_92), kwargs = {})
%_to_copy_79 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_80,), kwargs = {dtype: torch.float16})
%_to_copy_80 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_67,), kwargs = {dtype: torch.float16})
%_to_copy_81 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_25,), kwargs = {dtype: torch.float16})
%squeeze_26 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_80, [2, 3]), kwargs = {})
%squeeze_27 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_81, [2, 3]), kwargs = {})
%detach_34 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_26,), kwargs = {})
%detach_35 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_27,), kwargs = {})
%silu_15 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_79,), kwargs = {})
%_param_constant164 : [#users=1] = get_attr[target=_param_constant164]
%_param_constant165 : [#users=1] = get_attr[target=_param_constant165]
%convolution_13 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_15, %_param_constant164, %_param_constant165, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant166 : [#users=1] = get_attr[target=_param_constant166]
%_param_constant167 : [#users=1] = get_attr[target=_param_constant167]
%convolution_14 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%convolution_11, %_param_constant166, %_param_constant167, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_81 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_14, %convolution_13), kwargs = {})
%div_5 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_81, 1.0), kwargs = {})
%view_132 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_5, [2, 32, 40, 576]), kwargs = {})
%_to_copy_82 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_132,), kwargs = {dtype: torch.float32})
%var_mean_26 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_82, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_68 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_26, 0), kwargs = {})
%getitem_69 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_26, 1), kwargs = {})
%add_82 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_68, 1e-06), kwargs = {})
%rsqrt_26 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_82,), kwargs = {})
%sub_26 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_132, %getitem_69), kwargs = {})
%mul_59 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_26, %rsqrt_26), kwargs = {})
%view_133 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_59, [2, 1280, 24, 24]), kwargs = {})
%_param_constant168 : [#users=1] = get_attr[target=_param_constant168]
%unsqueeze_96 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant168, 0), kwargs = {})
%unsqueeze_97 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_96, 2), kwargs = {})
%unsqueeze_98 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_97, 3), kwargs = {})
%_param_constant169 : [#users=1] = get_attr[target=_param_constant169]
%unsqueeze_99 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant169, 0), kwargs = {})
%unsqueeze_100 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_99, 2), kwargs = {})
%unsqueeze_101 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_100, 3), kwargs = {})
%mul_60 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_133, %unsqueeze_101), kwargs = {})
%add_83 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_60, %unsqueeze_98), kwargs = {})
%_to_copy_83 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_83,), kwargs = {dtype: torch.float16})
%_to_copy_84 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_69,), kwargs = {dtype: torch.float16})
%_to_copy_85 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_26,), kwargs = {dtype: torch.float16})
%squeeze_28 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_84, [2, 3]), kwargs = {})
%squeeze_29 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_85, [2, 3]), kwargs = {})
%detach_36 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_28,), kwargs = {})
%detach_37 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_29,), kwargs = {})
%permute_8 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_83, [0, 2, 3, 1]), kwargs = {})
%view_134 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_8, [2, 576, 1280]), kwargs = {})
%_param_constant170 : [#users=1] = get_attr[target=_param_constant170]
%t_55 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant170,), kwargs = {})
%clone_8 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_134,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_32 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_8, [1152, 1280]), kwargs = {})
%mm_28 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_32, %t_55), kwargs = {})
%_unsafe_view_33 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_28, [2, 576, 1280]), kwargs = {})
%_param_constant171 : [#users=1] = get_attr[target=_param_constant171]
%add_84 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_33, %_param_constant171), kwargs = {})
%_to_copy_86 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_84,), kwargs = {dtype: torch.float32})
%var_mean_27 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_86, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_70 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_27, 0), kwargs = {})
%getitem_71 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_27, 1), kwargs = {})
%add_85 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_70, 1e-05), kwargs = {})
%rsqrt_27 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_85,), kwargs = {})
%sub_27 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_84, %getitem_71), kwargs = {})
%mul_61 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_27, %rsqrt_27), kwargs = {})
%_param_constant172 : [#users=1] = get_attr[target=_param_constant172]
%mul_62 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_61, %_param_constant172), kwargs = {})
%_param_constant173 : [#users=1] = get_attr[target=_param_constant173]
%add_86 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_62, %_param_constant173), kwargs = {})
%_to_copy_87 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_86,), kwargs = {dtype: torch.float16})
%_param_constant174 : [#users=1] = get_attr[target=_param_constant174]
%t_56 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant174,), kwargs = {})
%view_135 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_87, [1152, 1280]), kwargs = {})
%mm_29 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_135, %t_56), kwargs = {})
%_unsafe_view_34 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_29, [2, 576, 1280]), kwargs = {})
%_param_constant175 : [#users=1] = get_attr[target=_param_constant175]
%t_57 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant175,), kwargs = {})
%view_136 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_87, [1152, 1280]), kwargs = {})
%mm_30 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_136, %t_57), kwargs = {})
%_unsafe_view_35 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_30, [2, 576, 1280]), kwargs = {})
%_param_constant176 : [#users=1] = get_attr[target=_param_constant176]
%t_58 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant176,), kwargs = {})
%view_137 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_87, [1152, 1280]), kwargs = {})
%mm_31 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_137, %t_58), kwargs = {})
%_unsafe_view_36 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_31, [2, 576, 1280]), kwargs = {})
%view_138 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_34, [2, -1, 20, 64]), kwargs = {})
%transpose_32 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_138, 1, 2), kwargs = {})
%view_139 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_35, [2, -1, 20, 64]), kwargs = {})
%transpose_33 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_139, 1, 2), kwargs = {})
%view_140 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_36, [2, -1, 20, 64]), kwargs = {})
%transpose_34 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_140, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_8 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_32, %transpose_33, %transpose_34, True), kwargs = {})
%getitem_72 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_8, 0), kwargs = {})
%getitem_73 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_8, 1), kwargs = {})
%detach_38 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_72,), kwargs = {})
%transpose_35 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_72, 1, 2), kwargs = {})
%view_141 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_35, [2, -1, 1280]), kwargs = {})
%_param_constant177 : [#users=1] = get_attr[target=_param_constant177]
%t_59 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant177,), kwargs = {})
%view_142 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_141, [1152, 1280]), kwargs = {})
%_param_constant178 : [#users=1] = get_attr[target=_param_constant178]
%addmm_27 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant178, %view_142, %t_59), kwargs = {})
%view_143 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_27, [2, 576, 1280]), kwargs = {})
%add_87 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_143, %add_84), kwargs = {})
%_to_copy_88 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_87,), kwargs = {dtype: torch.float32})
%var_mean_28 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_88, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_74 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_28, 0), kwargs = {})
%getitem_75 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_28, 1), kwargs = {})
%add_88 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_74, 1e-05), kwargs = {})
%rsqrt_28 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_88,), kwargs = {})
%sub_28 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_87, %getitem_75), kwargs = {})
%mul_63 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_28, %rsqrt_28), kwargs = {})
%_param_constant179 : [#users=1] = get_attr[target=_param_constant179]
%mul_64 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_63, %_param_constant179), kwargs = {})
%_param_constant180 : [#users=1] = get_attr[target=_param_constant180]
%add_89 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_64, %_param_constant180), kwargs = {})
%_to_copy_89 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_89,), kwargs = {dtype: torch.float16})
%_param_constant181 : [#users=1] = get_attr[target=_param_constant181]
%t_60 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant181,), kwargs = {})
%view_144 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_89, [1152, 1280]), kwargs = {})
%mm_32 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_144, %t_60), kwargs = {})
%_unsafe_view_37 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_32, [2, 576, 1280]), kwargs = {})
%_param_constant182 : [#users=1] = get_attr[target=_param_constant182]
%t_61 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant182,), kwargs = {})
%view_145 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_33 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_145, %t_61), kwargs = {})
%_unsafe_view_38 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_33, [2, 77, 1280]), kwargs = {})
%_param_constant183 : [#users=1] = get_attr[target=_param_constant183]
%t_62 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant183,), kwargs = {})
%view_146 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_34 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_146, %t_62), kwargs = {})
%_unsafe_view_39 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_34, [2, 77, 1280]), kwargs = {})
%view_147 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_37, [2, -1, 20, 64]), kwargs = {})
%transpose_36 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_147, 1, 2), kwargs = {})
%view_148 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_38, [2, -1, 20, 64]), kwargs = {})
%transpose_37 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_148, 1, 2), kwargs = {})
%view_149 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_39, [2, -1, 20, 64]), kwargs = {})
%transpose_38 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_149, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_9 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_36, %transpose_37, %transpose_38, True), kwargs = {})
%getitem_76 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_9, 0), kwargs = {})
%getitem_77 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_9, 1), kwargs = {})
%detach_39 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_76,), kwargs = {})
%transpose_39 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_76, 1, 2), kwargs = {})
%view_150 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_39, [2, -1, 1280]), kwargs = {})
%_param_constant184 : [#users=1] = get_attr[target=_param_constant184]
%t_63 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant184,), kwargs = {})
%view_151 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_150, [1152, 1280]), kwargs = {})
%_param_constant185 : [#users=1] = get_attr[target=_param_constant185]
%addmm_28 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant185, %view_151, %t_63), kwargs = {})
%view_152 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_28, [2, 576, 1280]), kwargs = {})
%add_90 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_152, %add_87), kwargs = {})
%_to_copy_90 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_90,), kwargs = {dtype: torch.float32})
%var_mean_29 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_90, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_78 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_29, 0), kwargs = {})
%getitem_79 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_29, 1), kwargs = {})
%add_91 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_78, 1e-05), kwargs = {})
%rsqrt_29 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_91,), kwargs = {})
%sub_29 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_90, %getitem_79), kwargs = {})
%mul_65 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_29, %rsqrt_29), kwargs = {})
%_param_constant186 : [#users=1] = get_attr[target=_param_constant186]
%mul_66 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_65, %_param_constant186), kwargs = {})
%_param_constant187 : [#users=1] = get_attr[target=_param_constant187]
%add_92 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_66, %_param_constant187), kwargs = {})
%_to_copy_91 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_92,), kwargs = {dtype: torch.float16})
%_param_constant188 : [#users=1] = get_attr[target=_param_constant188]
%t_64 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant188,), kwargs = {})
%view_153 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_91, [1152, 1280]), kwargs = {})
%_param_constant189 : [#users=1] = get_attr[target=_param_constant189]
%addmm_29 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant189, %view_153, %t_64), kwargs = {})
%view_154 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_29, [2, 576, 10240]), kwargs = {})
%slice_25 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_154, -1, 0, 5120), kwargs = {})
%slice_26 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_154, -1, 5120, 10240), kwargs = {})
%gelu_4 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_26,), kwargs = {})
%mul_67 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_25, %gelu_4), kwargs = {})
%_param_constant190 : [#users=1] = get_attr[target=_param_constant190]
%t_65 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant190,), kwargs = {})
%view_155 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_67, [1152, 5120]), kwargs = {})
%_param_constant191 : [#users=1] = get_attr[target=_param_constant191]
%addmm_30 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant191, %view_155, %t_65), kwargs = {})
%view_156 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_30, [2, 576, 1280]), kwargs = {})
%add_93 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_156, %add_90), kwargs = {})
%_param_constant192 : [#users=1] = get_attr[target=_param_constant192]
%t_66 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant192,), kwargs = {})
%view_157 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_93, [1152, 1280]), kwargs = {})
%_param_constant193 : [#users=1] = get_attr[target=_param_constant193]
%addmm_31 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant193, %view_157, %t_66), kwargs = {})
%view_158 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_31, [2, 576, 1280]), kwargs = {})
%view_159 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_158, [2, 24, 24, 1280]), kwargs = {})
%permute_9 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_159, [0, 3, 1, 2]), kwargs = {})
%clone_9 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_9,), kwargs = {memory_format: torch.contiguous_format})
%add_94 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%clone_9, %div_5), kwargs = {})
%view_160 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_94, [2, 32, 40, 576]), kwargs = {})
%_to_copy_92 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_160,), kwargs = {dtype: torch.float32})
%var_mean_30 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_92, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_80 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_30, 0), kwargs = {})
%getitem_81 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_30, 1), kwargs = {})
%add_95 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_80, 1e-05), kwargs = {})
%rsqrt_30 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_95,), kwargs = {})
%sub_30 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_160, %getitem_81), kwargs = {})
%mul_68 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_30, %rsqrt_30), kwargs = {})
%view_161 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_68, [2, 1280, 24, 24]), kwargs = {})
%_param_constant194 : [#users=1] = get_attr[target=_param_constant194]
%unsqueeze_102 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant194, 0), kwargs = {})
%unsqueeze_103 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_102, 2), kwargs = {})
%unsqueeze_104 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_103, 3), kwargs = {})
%_param_constant195 : [#users=1] = get_attr[target=_param_constant195]
%unsqueeze_105 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant195, 0), kwargs = {})
%unsqueeze_106 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_105, 2), kwargs = {})
%unsqueeze_107 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_106, 3), kwargs = {})
%mul_69 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_161, %unsqueeze_107), kwargs = {})
%add_96 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_69, %unsqueeze_104), kwargs = {})
%_to_copy_93 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_96,), kwargs = {dtype: torch.float16})
%_to_copy_94 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_81,), kwargs = {dtype: torch.float16})
%_to_copy_95 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_30,), kwargs = {dtype: torch.float16})
%squeeze_30 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_94, [2, 3]), kwargs = {})
%squeeze_31 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_95, [2, 3]), kwargs = {})
%detach_40 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_30,), kwargs = {})
%detach_41 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_31,), kwargs = {})
%silu_16 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_93,), kwargs = {})
%_param_constant196 : [#users=1] = get_attr[target=_param_constant196]
%_param_constant197 : [#users=1] = get_attr[target=_param_constant197]
%convolution_15 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_16, %_param_constant196, %_param_constant197, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_17 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant198 : [#users=1] = get_attr[target=_param_constant198]
%t_67 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant198,), kwargs = {})
%_param_constant199 : [#users=1] = get_attr[target=_param_constant199]
%addmm_32 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant199, %silu_17, %t_67), kwargs = {})
%slice_27 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_32, 0, 0, 9223372036854775807), kwargs = {})
%slice_28 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_27, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_108 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_28, 2), kwargs = {})
%unsqueeze_109 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_108, 3), kwargs = {})
%add_97 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_15, %unsqueeze_109), kwargs = {})
%view_162 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_97, [2, 32, 40, 576]), kwargs = {})
%_to_copy_96 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_162,), kwargs = {dtype: torch.float32})
%var_mean_31 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_96, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_82 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_31, 0), kwargs = {})
%getitem_83 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_31, 1), kwargs = {})
%add_98 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_82, 1e-05), kwargs = {})
%rsqrt_31 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_98,), kwargs = {})
%sub_31 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_162, %getitem_83), kwargs = {})
%mul_70 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_31, %rsqrt_31), kwargs = {})
%view_163 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_70, [2, 1280, 24, 24]), kwargs = {})
%_param_constant200 : [#users=1] = get_attr[target=_param_constant200]
%unsqueeze_110 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant200, 0), kwargs = {})
%unsqueeze_111 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_110, 2), kwargs = {})
%unsqueeze_112 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_111, 3), kwargs = {})
%_param_constant201 : [#users=1] = get_attr[target=_param_constant201]
%unsqueeze_113 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant201, 0), kwargs = {})
%unsqueeze_114 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_113, 2), kwargs = {})
%unsqueeze_115 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_114, 3), kwargs = {})
%mul_71 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_163, %unsqueeze_115), kwargs = {})
%add_99 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_71, %unsqueeze_112), kwargs = {})
%_to_copy_97 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_99,), kwargs = {dtype: torch.float16})
%_to_copy_98 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_83,), kwargs = {dtype: torch.float16})
%_to_copy_99 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_31,), kwargs = {dtype: torch.float16})
%squeeze_32 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_98, [2, 3]), kwargs = {})
%squeeze_33 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_99, [2, 3]), kwargs = {})
%detach_42 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_32,), kwargs = {})
%detach_43 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_33,), kwargs = {})
%silu_18 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_97,), kwargs = {})
%_param_constant202 : [#users=1] = get_attr[target=_param_constant202]
%_param_constant203 : [#users=1] = get_attr[target=_param_constant203]
%convolution_16 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_18, %_param_constant202, %_param_constant203, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_100 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_94, %convolution_16), kwargs = {})
%div_6 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_100, 1.0), kwargs = {})
%view_164 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_6, [2, 32, 40, 576]), kwargs = {})
%_to_copy_100 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_164,), kwargs = {dtype: torch.float32})
%var_mean_32 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_100, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_84 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_32, 0), kwargs = {})
%getitem_85 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_32, 1), kwargs = {})
%add_101 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_84, 1e-06), kwargs = {})
%rsqrt_32 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_101,), kwargs = {})
%sub_32 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_164, %getitem_85), kwargs = {})
%mul_72 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_32, %rsqrt_32), kwargs = {})
%view_165 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_72, [2, 1280, 24, 24]), kwargs = {})
%_param_constant204 : [#users=1] = get_attr[target=_param_constant204]
%unsqueeze_116 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant204, 0), kwargs = {})
%unsqueeze_117 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_116, 2), kwargs = {})
%unsqueeze_118 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_117, 3), kwargs = {})
%_param_constant205 : [#users=1] = get_attr[target=_param_constant205]
%unsqueeze_119 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant205, 0), kwargs = {})
%unsqueeze_120 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_119, 2), kwargs = {})
%unsqueeze_121 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_120, 3), kwargs = {})
%mul_73 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_165, %unsqueeze_121), kwargs = {})
%add_102 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_73, %unsqueeze_118), kwargs = {})
%_to_copy_101 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_102,), kwargs = {dtype: torch.float16})
%_to_copy_102 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_85,), kwargs = {dtype: torch.float16})
%_to_copy_103 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_32,), kwargs = {dtype: torch.float16})
%squeeze_34 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_102, [2, 3]), kwargs = {})
%squeeze_35 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_103, [2, 3]), kwargs = {})
%detach_44 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_34,), kwargs = {})
%detach_45 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_35,), kwargs = {})
%permute_10 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_101, [0, 2, 3, 1]), kwargs = {})
%view_166 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_10, [2, 576, 1280]), kwargs = {})
%_param_constant206 : [#users=1] = get_attr[target=_param_constant206]
%t_68 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant206,), kwargs = {})
%clone_10 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_166,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_40 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_10, [1152, 1280]), kwargs = {})
%mm_35 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_40, %t_68), kwargs = {})
%_unsafe_view_41 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_35, [2, 576, 1280]), kwargs = {})
%_param_constant207 : [#users=1] = get_attr[target=_param_constant207]
%add_103 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_41, %_param_constant207), kwargs = {})
%_to_copy_104 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_103,), kwargs = {dtype: torch.float32})
%var_mean_33 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_104, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_86 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_33, 0), kwargs = {})
%getitem_87 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_33, 1), kwargs = {})
%add_104 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_86, 1e-05), kwargs = {})
%rsqrt_33 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_104,), kwargs = {})
%sub_33 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_103, %getitem_87), kwargs = {})
%mul_74 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_33, %rsqrt_33), kwargs = {})
%_param_constant208 : [#users=1] = get_attr[target=_param_constant208]
%mul_75 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_74, %_param_constant208), kwargs = {})
%_param_constant209 : [#users=1] = get_attr[target=_param_constant209]
%add_105 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_75, %_param_constant209), kwargs = {})
%_to_copy_105 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_105,), kwargs = {dtype: torch.float16})
%_param_constant210 : [#users=1] = get_attr[target=_param_constant210]
%t_69 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant210,), kwargs = {})
%view_167 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_105, [1152, 1280]), kwargs = {})
%mm_36 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_167, %t_69), kwargs = {})
%_unsafe_view_42 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_36, [2, 576, 1280]), kwargs = {})
%_param_constant211 : [#users=1] = get_attr[target=_param_constant211]
%t_70 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant211,), kwargs = {})
%view_168 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_105, [1152, 1280]), kwargs = {})
%mm_37 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_168, %t_70), kwargs = {})
%_unsafe_view_43 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_37, [2, 576, 1280]), kwargs = {})
%_param_constant212 : [#users=1] = get_attr[target=_param_constant212]
%t_71 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant212,), kwargs = {})
%view_169 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_105, [1152, 1280]), kwargs = {})
%mm_38 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_169, %t_71), kwargs = {})
%_unsafe_view_44 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_38, [2, 576, 1280]), kwargs = {})
%view_170 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_42, [2, -1, 20, 64]), kwargs = {})
%transpose_40 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_170, 1, 2), kwargs = {})
%view_171 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_43, [2, -1, 20, 64]), kwargs = {})
%transpose_41 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_171, 1, 2), kwargs = {})
%view_172 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_44, [2, -1, 20, 64]), kwargs = {})
%transpose_42 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_172, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_10 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_40, %transpose_41, %transpose_42, True), kwargs = {})
%getitem_88 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_10, 0), kwargs = {})
%getitem_89 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_10, 1), kwargs = {})
%detach_46 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_88,), kwargs = {})
%transpose_43 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_88, 1, 2), kwargs = {})
%view_173 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_43, [2, -1, 1280]), kwargs = {})
%_param_constant213 : [#users=1] = get_attr[target=_param_constant213]
%t_72 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant213,), kwargs = {})
%view_174 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_173, [1152, 1280]), kwargs = {})
%_param_constant214 : [#users=1] = get_attr[target=_param_constant214]
%addmm_33 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant214, %view_174, %t_72), kwargs = {})
%view_175 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_33, [2, 576, 1280]), kwargs = {})
%add_106 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_175, %add_103), kwargs = {})
%_to_copy_106 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_106,), kwargs = {dtype: torch.float32})
%var_mean_34 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_106, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_90 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_34, 0), kwargs = {})
%getitem_91 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_34, 1), kwargs = {})
%add_107 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_90, 1e-05), kwargs = {})
%rsqrt_34 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_107,), kwargs = {})
%sub_34 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_106, %getitem_91), kwargs = {})
%mul_76 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_34, %rsqrt_34), kwargs = {})
%_param_constant215 : [#users=1] = get_attr[target=_param_constant215]
%mul_77 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_76, %_param_constant215), kwargs = {})
%_param_constant216 : [#users=1] = get_attr[target=_param_constant216]
%add_108 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_77, %_param_constant216), kwargs = {})
%_to_copy_107 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_108,), kwargs = {dtype: torch.float16})
%_param_constant217 : [#users=1] = get_attr[target=_param_constant217]
%t_73 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant217,), kwargs = {})
%view_176 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_107, [1152, 1280]), kwargs = {})
%mm_39 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_176, %t_73), kwargs = {})
%_unsafe_view_45 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_39, [2, 576, 1280]), kwargs = {})
%_param_constant218 : [#users=1] = get_attr[target=_param_constant218]
%t_74 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant218,), kwargs = {})
%view_177 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_40 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_177, %t_74), kwargs = {})
%_unsafe_view_46 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_40, [2, 77, 1280]), kwargs = {})
%_param_constant219 : [#users=1] = get_attr[target=_param_constant219]
%t_75 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant219,), kwargs = {})
%view_178 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_41 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_178, %t_75), kwargs = {})
%_unsafe_view_47 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_41, [2, 77, 1280]), kwargs = {})
%view_179 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_45, [2, -1, 20, 64]), kwargs = {})
%transpose_44 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_179, 1, 2), kwargs = {})
%view_180 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_46, [2, -1, 20, 64]), kwargs = {})
%transpose_45 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_180, 1, 2), kwargs = {})
%view_181 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_47, [2, -1, 20, 64]), kwargs = {})
%transpose_46 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_181, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_11 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_44, %transpose_45, %transpose_46, True), kwargs = {})
%getitem_92 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_11, 0), kwargs = {})
%getitem_93 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_11, 1), kwargs = {})
%detach_47 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_92,), kwargs = {})
%transpose_47 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_92, 1, 2), kwargs = {})
%view_182 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_47, [2, -1, 1280]), kwargs = {})
%_param_constant220 : [#users=1] = get_attr[target=_param_constant220]
%t_76 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant220,), kwargs = {})
%view_183 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_182, [1152, 1280]), kwargs = {})
%_param_constant221 : [#users=1] = get_attr[target=_param_constant221]
%addmm_34 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant221, %view_183, %t_76), kwargs = {})
%view_184 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_34, [2, 576, 1280]), kwargs = {})
%add_109 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_184, %add_106), kwargs = {})
%_to_copy_108 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_109,), kwargs = {dtype: torch.float32})
%var_mean_35 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_108, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_94 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_35, 0), kwargs = {})
%getitem_95 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_35, 1), kwargs = {})
%add_110 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_94, 1e-05), kwargs = {})
%rsqrt_35 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_110,), kwargs = {})
%sub_35 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_109, %getitem_95), kwargs = {})
%mul_78 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_35, %rsqrt_35), kwargs = {})
%_param_constant222 : [#users=1] = get_attr[target=_param_constant222]
%mul_79 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_78, %_param_constant222), kwargs = {})
%_param_constant223 : [#users=1] = get_attr[target=_param_constant223]
%add_111 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_79, %_param_constant223), kwargs = {})
%_to_copy_109 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_111,), kwargs = {dtype: torch.float16})
%_param_constant224 : [#users=1] = get_attr[target=_param_constant224]
%t_77 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant224,), kwargs = {})
%view_185 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_109, [1152, 1280]), kwargs = {})
%_param_constant225 : [#users=1] = get_attr[target=_param_constant225]
%addmm_35 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant225, %view_185, %t_77), kwargs = {})
%view_186 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_35, [2, 576, 10240]), kwargs = {})
%slice_29 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_186, -1, 0, 5120), kwargs = {})
%slice_30 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_186, -1, 5120, 10240), kwargs = {})
%gelu_5 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_30,), kwargs = {})
%mul_80 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_29, %gelu_5), kwargs = {})
%_param_constant226 : [#users=1] = get_attr[target=_param_constant226]
%t_78 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant226,), kwargs = {})
%view_187 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_80, [1152, 5120]), kwargs = {})
%_param_constant227 : [#users=1] = get_attr[target=_param_constant227]
%addmm_36 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant227, %view_187, %t_78), kwargs = {})
%view_188 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_36, [2, 576, 1280]), kwargs = {})
%add_112 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_188, %add_109), kwargs = {})
%_param_constant228 : [#users=1] = get_attr[target=_param_constant228]
%t_79 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant228,), kwargs = {})
%view_189 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_112, [1152, 1280]), kwargs = {})
%_param_constant229 : [#users=1] = get_attr[target=_param_constant229]
%addmm_37 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant229, %view_189, %t_79), kwargs = {})
%view_190 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_37, [2, 576, 1280]), kwargs = {})
%view_191 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_190, [2, 24, 24, 1280]), kwargs = {})
%permute_11 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_191, [0, 3, 1, 2]), kwargs = {})
%clone_11 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_11,), kwargs = {memory_format: torch.contiguous_format})
%add_113 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_11, %div_6), kwargs = {})
%_param_constant230 : [#users=1] = get_attr[target=_param_constant230]
%_param_constant231 : [#users=1] = get_attr[target=_param_constant231]
%convolution_17 : [#users=3] = call_function[target=torch.ops.aten.convolution](args = (%add_113, %_param_constant230, %_param_constant231, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%view_192 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%convolution_17, [2, 32, 40, 144]), kwargs = {})
%_to_copy_110 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_192,), kwargs = {dtype: torch.float32})
%var_mean_36 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_110, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_96 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_36, 0), kwargs = {})
%getitem_97 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_36, 1), kwargs = {})
%add_114 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_96, 1e-05), kwargs = {})
%rsqrt_36 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_114,), kwargs = {})
%sub_36 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_192, %getitem_97), kwargs = {})
%mul_81 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_36, %rsqrt_36), kwargs = {})
%view_193 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_81, [2, 1280, 12, 12]), kwargs = {})
%_param_constant232 : [#users=1] = get_attr[target=_param_constant232]
%unsqueeze_122 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant232, 0), kwargs = {})
%unsqueeze_123 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_122, 2), kwargs = {})
%unsqueeze_124 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_123, 3), kwargs = {})
%_param_constant233 : [#users=1] = get_attr[target=_param_constant233]
%unsqueeze_125 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant233, 0), kwargs = {})
%unsqueeze_126 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_125, 2), kwargs = {})
%unsqueeze_127 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_126, 3), kwargs = {})
%mul_82 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_193, %unsqueeze_127), kwargs = {})
%add_115 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_82, %unsqueeze_124), kwargs = {})
%_to_copy_111 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_115,), kwargs = {dtype: torch.float16})
%_to_copy_112 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_97,), kwargs = {dtype: torch.float16})
%_to_copy_113 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_36,), kwargs = {dtype: torch.float16})
%squeeze_36 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_112, [2, 3]), kwargs = {})
%squeeze_37 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_113, [2, 3]), kwargs = {})
%detach_48 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_36,), kwargs = {})
%detach_49 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_37,), kwargs = {})
%silu_19 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_111,), kwargs = {})
%_param_constant234 : [#users=1] = get_attr[target=_param_constant234]
%_param_constant235 : [#users=1] = get_attr[target=_param_constant235]
%convolution_18 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_19, %_param_constant234, %_param_constant235, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_20 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant236 : [#users=1] = get_attr[target=_param_constant236]
%t_80 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant236,), kwargs = {})
%_param_constant237 : [#users=1] = get_attr[target=_param_constant237]
%addmm_38 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant237, %silu_20, %t_80), kwargs = {})
%slice_31 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_38, 0, 0, 9223372036854775807), kwargs = {})
%slice_32 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_31, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_128 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_32, 2), kwargs = {})
%unsqueeze_129 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_128, 3), kwargs = {})
%add_116 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_18, %unsqueeze_129), kwargs = {})
%view_194 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_116, [2, 32, 40, 144]), kwargs = {})
%_to_copy_114 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_194,), kwargs = {dtype: torch.float32})
%var_mean_37 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_114, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_98 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_37, 0), kwargs = {})
%getitem_99 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_37, 1), kwargs = {})
%add_117 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_98, 1e-05), kwargs = {})
%rsqrt_37 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_117,), kwargs = {})
%sub_37 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_194, %getitem_99), kwargs = {})
%mul_83 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_37, %rsqrt_37), kwargs = {})
%view_195 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_83, [2, 1280, 12, 12]), kwargs = {})
%_param_constant238 : [#users=1] = get_attr[target=_param_constant238]
%unsqueeze_130 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant238, 0), kwargs = {})
%unsqueeze_131 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_130, 2), kwargs = {})
%unsqueeze_132 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_131, 3), kwargs = {})
%_param_constant239 : [#users=1] = get_attr[target=_param_constant239]
%unsqueeze_133 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant239, 0), kwargs = {})
%unsqueeze_134 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_133, 2), kwargs = {})
%unsqueeze_135 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_134, 3), kwargs = {})
%mul_84 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_195, %unsqueeze_135), kwargs = {})
%add_118 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_84, %unsqueeze_132), kwargs = {})
%_to_copy_115 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_118,), kwargs = {dtype: torch.float16})
%_to_copy_116 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_99,), kwargs = {dtype: torch.float16})
%_to_copy_117 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_37,), kwargs = {dtype: torch.float16})
%squeeze_38 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_116, [2, 3]), kwargs = {})
%squeeze_39 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_117, [2, 3]), kwargs = {})
%detach_50 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_38,), kwargs = {})
%detach_51 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_39,), kwargs = {})
%silu_21 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_115,), kwargs = {})
%_param_constant240 : [#users=1] = get_attr[target=_param_constant240]
%_param_constant241 : [#users=1] = get_attr[target=_param_constant241]
%convolution_19 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_21, %_param_constant240, %_param_constant241, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_119 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_17, %convolution_19), kwargs = {})
%div_7 : [#users=3] = call_function[target=torch.ops.aten.div](args = (%add_119, 1.0), kwargs = {})
%view_196 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_7, [2, 32, 40, 144]), kwargs = {})
%_to_copy_118 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_196,), kwargs = {dtype: torch.float32})
%var_mean_38 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_118, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_100 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_38, 0), kwargs = {})
%getitem_101 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_38, 1), kwargs = {})
%add_120 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_100, 1e-05), kwargs = {})
%rsqrt_38 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_120,), kwargs = {})
%sub_38 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_196, %getitem_101), kwargs = {})
%mul_85 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_38, %rsqrt_38), kwargs = {})
%view_197 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_85, [2, 1280, 12, 12]), kwargs = {})
%_param_constant242 : [#users=1] = get_attr[target=_param_constant242]
%unsqueeze_136 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant242, 0), kwargs = {})
%unsqueeze_137 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_136, 2), kwargs = {})
%unsqueeze_138 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_137, 3), kwargs = {})
%_param_constant243 : [#users=1] = get_attr[target=_param_constant243]
%unsqueeze_139 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant243, 0), kwargs = {})
%unsqueeze_140 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_139, 2), kwargs = {})
%unsqueeze_141 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_140, 3), kwargs = {})
%mul_86 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_197, %unsqueeze_141), kwargs = {})
%add_121 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_86, %unsqueeze_138), kwargs = {})
%_to_copy_119 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_121,), kwargs = {dtype: torch.float16})
%_to_copy_120 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_101,), kwargs = {dtype: torch.float16})
%_to_copy_121 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_38,), kwargs = {dtype: torch.float16})
%squeeze_40 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_120, [2, 3]), kwargs = {})
%squeeze_41 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_121, [2, 3]), kwargs = {})
%detach_52 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_40,), kwargs = {})
%detach_53 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_41,), kwargs = {})
%silu_22 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_119,), kwargs = {})
%_param_constant244 : [#users=1] = get_attr[target=_param_constant244]
%_param_constant245 : [#users=1] = get_attr[target=_param_constant245]
%convolution_20 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_22, %_param_constant244, %_param_constant245, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_23 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant246 : [#users=1] = get_attr[target=_param_constant246]
%t_81 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant246,), kwargs = {})
%_param_constant247 : [#users=1] = get_attr[target=_param_constant247]
%addmm_39 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant247, %silu_23, %t_81), kwargs = {})
%slice_33 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_39, 0, 0, 9223372036854775807), kwargs = {})
%slice_34 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_33, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_142 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_34, 2), kwargs = {})
%unsqueeze_143 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_142, 3), kwargs = {})
%add_122 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_20, %unsqueeze_143), kwargs = {})
%view_198 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_122, [2, 32, 40, 144]), kwargs = {})
%_to_copy_122 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_198,), kwargs = {dtype: torch.float32})
%var_mean_39 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_122, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_102 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_39, 0), kwargs = {})
%getitem_103 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_39, 1), kwargs = {})
%add_123 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_102, 1e-05), kwargs = {})
%rsqrt_39 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_123,), kwargs = {})
%sub_39 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_198, %getitem_103), kwargs = {})
%mul_87 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_39, %rsqrt_39), kwargs = {})
%view_199 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_87, [2, 1280, 12, 12]), kwargs = {})
%_param_constant248 : [#users=1] = get_attr[target=_param_constant248]
%unsqueeze_144 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant248, 0), kwargs = {})
%unsqueeze_145 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_144, 2), kwargs = {})
%unsqueeze_146 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_145, 3), kwargs = {})
%_param_constant249 : [#users=1] = get_attr[target=_param_constant249]
%unsqueeze_147 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant249, 0), kwargs = {})
%unsqueeze_148 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_147, 2), kwargs = {})
%unsqueeze_149 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_148, 3), kwargs = {})
%mul_88 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_199, %unsqueeze_149), kwargs = {})
%add_124 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_88, %unsqueeze_146), kwargs = {})
%_to_copy_123 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_124,), kwargs = {dtype: torch.float16})
%_to_copy_124 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_103,), kwargs = {dtype: torch.float16})
%_to_copy_125 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_39,), kwargs = {dtype: torch.float16})
%squeeze_42 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_124, [2, 3]), kwargs = {})
%squeeze_43 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_125, [2, 3]), kwargs = {})
%detach_54 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_42,), kwargs = {})
%detach_55 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_43,), kwargs = {})
%silu_24 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_123,), kwargs = {})
%_param_constant250 : [#users=1] = get_attr[target=_param_constant250]
%_param_constant251 : [#users=1] = get_attr[target=_param_constant251]
%convolution_21 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_24, %_param_constant250, %_param_constant251, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_125 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%div_7, %convolution_21), kwargs = {})
%div_8 : [#users=3] = call_function[target=torch.ops.aten.div](args = (%add_125, 1.0), kwargs = {})
%view_200 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_8, [2, 32, 40, 144]), kwargs = {})
%_to_copy_126 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_200,), kwargs = {dtype: torch.float32})
%var_mean_40 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_126, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_104 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_40, 0), kwargs = {})
%getitem_105 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_40, 1), kwargs = {})
%add_126 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_104, 1e-05), kwargs = {})
%rsqrt_40 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_126,), kwargs = {})
%sub_40 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_200, %getitem_105), kwargs = {})
%mul_89 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_40, %rsqrt_40), kwargs = {})
%view_201 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_89, [2, 1280, 12, 12]), kwargs = {})
%_param_constant252 : [#users=1] = get_attr[target=_param_constant252]
%unsqueeze_150 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant252, 0), kwargs = {})
%unsqueeze_151 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_150, 2), kwargs = {})
%unsqueeze_152 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_151, 3), kwargs = {})
%_param_constant253 : [#users=1] = get_attr[target=_param_constant253]
%unsqueeze_153 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant253, 0), kwargs = {})
%unsqueeze_154 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_153, 2), kwargs = {})
%unsqueeze_155 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_154, 3), kwargs = {})
%mul_90 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_201, %unsqueeze_155), kwargs = {})
%add_127 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_90, %unsqueeze_152), kwargs = {})
%_to_copy_127 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_127,), kwargs = {dtype: torch.float16})
%_to_copy_128 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_105,), kwargs = {dtype: torch.float16})
%_to_copy_129 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_40,), kwargs = {dtype: torch.float16})
%squeeze_44 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_128, [2, 3]), kwargs = {})
%squeeze_45 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_129, [2, 3]), kwargs = {})
%detach_56 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_44,), kwargs = {})
%detach_57 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_45,), kwargs = {})
%silu_25 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_127,), kwargs = {})
%_param_constant254 : [#users=1] = get_attr[target=_param_constant254]
%_param_constant255 : [#users=1] = get_attr[target=_param_constant255]
%convolution_22 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_25, %_param_constant254, %_param_constant255, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_26 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant256 : [#users=1] = get_attr[target=_param_constant256]
%t_82 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant256,), kwargs = {})
%_param_constant257 : [#users=1] = get_attr[target=_param_constant257]
%addmm_40 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant257, %silu_26, %t_82), kwargs = {})
%slice_35 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_40, 0, 0, 9223372036854775807), kwargs = {})
%slice_36 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_35, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_156 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_36, 2), kwargs = {})
%unsqueeze_157 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_156, 3), kwargs = {})
%add_128 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_22, %unsqueeze_157), kwargs = {})
%view_202 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_128, [2, 32, 40, 144]), kwargs = {})
%_to_copy_130 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_202,), kwargs = {dtype: torch.float32})
%var_mean_41 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_130, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_106 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_41, 0), kwargs = {})
%getitem_107 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_41, 1), kwargs = {})
%add_129 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_106, 1e-05), kwargs = {})
%rsqrt_41 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_129,), kwargs = {})
%sub_41 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_202, %getitem_107), kwargs = {})
%mul_91 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_41, %rsqrt_41), kwargs = {})
%view_203 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_91, [2, 1280, 12, 12]), kwargs = {})
%_param_constant258 : [#users=1] = get_attr[target=_param_constant258]
%unsqueeze_158 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant258, 0), kwargs = {})
%unsqueeze_159 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_158, 2), kwargs = {})
%unsqueeze_160 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_159, 3), kwargs = {})
%_param_constant259 : [#users=1] = get_attr[target=_param_constant259]
%unsqueeze_161 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant259, 0), kwargs = {})
%unsqueeze_162 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_161, 2), kwargs = {})
%unsqueeze_163 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_162, 3), kwargs = {})
%mul_92 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_203, %unsqueeze_163), kwargs = {})
%add_130 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_92, %unsqueeze_160), kwargs = {})
%_to_copy_131 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_130,), kwargs = {dtype: torch.float16})
%_to_copy_132 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_107,), kwargs = {dtype: torch.float16})
%_to_copy_133 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_41,), kwargs = {dtype: torch.float16})
%squeeze_46 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_132, [2, 3]), kwargs = {})
%squeeze_47 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_133, [2, 3]), kwargs = {})
%detach_58 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_46,), kwargs = {})
%detach_59 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_47,), kwargs = {})
%silu_27 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_131,), kwargs = {})
%_param_constant260 : [#users=1] = get_attr[target=_param_constant260]
%_param_constant261 : [#users=1] = get_attr[target=_param_constant261]
%convolution_23 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_27, %_param_constant260, %_param_constant261, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_131 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%div_8, %convolution_23), kwargs = {})
%div_9 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_131, 1), kwargs = {})
%view_204 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_9, [2, 32, 40, 144]), kwargs = {})
%_to_copy_134 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_204,), kwargs = {dtype: torch.float32})
%var_mean_42 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_134, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_108 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_42, 0), kwargs = {})
%getitem_109 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_42, 1), kwargs = {})
%add_132 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_108, 1e-06), kwargs = {})
%rsqrt_42 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_132,), kwargs = {})
%sub_42 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_204, %getitem_109), kwargs = {})
%mul_93 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_42, %rsqrt_42), kwargs = {})
%view_205 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_93, [2, 1280, 12, 12]), kwargs = {})
%_param_constant262 : [#users=1] = get_attr[target=_param_constant262]
%unsqueeze_164 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant262, 0), kwargs = {})
%unsqueeze_165 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_164, 2), kwargs = {})
%unsqueeze_166 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_165, 3), kwargs = {})
%_param_constant263 : [#users=1] = get_attr[target=_param_constant263]
%unsqueeze_167 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant263, 0), kwargs = {})
%unsqueeze_168 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_167, 2), kwargs = {})
%unsqueeze_169 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_168, 3), kwargs = {})
%mul_94 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_205, %unsqueeze_169), kwargs = {})
%add_133 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_94, %unsqueeze_166), kwargs = {})
%_to_copy_135 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_133,), kwargs = {dtype: torch.float16})
%_to_copy_136 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_109,), kwargs = {dtype: torch.float16})
%_to_copy_137 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_42,), kwargs = {dtype: torch.float16})
%squeeze_48 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_136, [2, 3]), kwargs = {})
%squeeze_49 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_137, [2, 3]), kwargs = {})
%detach_60 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_48,), kwargs = {})
%detach_61 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_49,), kwargs = {})
%permute_12 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_135, [0, 2, 3, 1]), kwargs = {})
%view_206 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_12, [2, 144, 1280]), kwargs = {})
%_param_constant264 : [#users=1] = get_attr[target=_param_constant264]
%t_83 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant264,), kwargs = {})
%clone_12 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_206,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_48 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_12, [288, 1280]), kwargs = {})
%mm_42 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_48, %t_83), kwargs = {})
%_unsafe_view_49 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_42, [2, 144, 1280]), kwargs = {})
%_param_constant265 : [#users=1] = get_attr[target=_param_constant265]
%add_134 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_49, %_param_constant265), kwargs = {})
%_to_copy_138 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_134,), kwargs = {dtype: torch.float32})
%var_mean_43 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_138, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_110 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_43, 0), kwargs = {})
%getitem_111 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_43, 1), kwargs = {})
%add_135 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_110, 1e-05), kwargs = {})
%rsqrt_43 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_135,), kwargs = {})
%sub_43 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_134, %getitem_111), kwargs = {})
%mul_95 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_43, %rsqrt_43), kwargs = {})
%_param_constant266 : [#users=1] = get_attr[target=_param_constant266]
%mul_96 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_95, %_param_constant266), kwargs = {})
%_param_constant267 : [#users=1] = get_attr[target=_param_constant267]
%add_136 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_96, %_param_constant267), kwargs = {})
%_to_copy_139 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_136,), kwargs = {dtype: torch.float16})
%_param_constant268 : [#users=1] = get_attr[target=_param_constant268]
%t_84 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant268,), kwargs = {})
%view_207 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_139, [288, 1280]), kwargs = {})
%mm_43 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_207, %t_84), kwargs = {})
%_unsafe_view_50 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_43, [2, 144, 1280]), kwargs = {})
%_param_constant269 : [#users=1] = get_attr[target=_param_constant269]
%t_85 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant269,), kwargs = {})
%view_208 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_139, [288, 1280]), kwargs = {})
%mm_44 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_208, %t_85), kwargs = {})
%_unsafe_view_51 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_44, [2, 144, 1280]), kwargs = {})
%_param_constant270 : [#users=1] = get_attr[target=_param_constant270]
%t_86 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant270,), kwargs = {})
%view_209 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_139, [288, 1280]), kwargs = {})
%mm_45 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_209, %t_86), kwargs = {})
%_unsafe_view_52 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_45, [2, 144, 1280]), kwargs = {})
%view_210 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_50, [2, -1, 20, 64]), kwargs = {})
%transpose_48 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_210, 1, 2), kwargs = {})
%view_211 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_51, [2, -1, 20, 64]), kwargs = {})
%transpose_49 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_211, 1, 2), kwargs = {})
%view_212 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_52, [2, -1, 20, 64]), kwargs = {})
%transpose_50 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_212, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_12 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_48, %transpose_49, %transpose_50, True), kwargs = {})
%getitem_112 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_12, 0), kwargs = {})
%getitem_113 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_12, 1), kwargs = {})
%detach_62 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_112,), kwargs = {})
%transpose_51 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_112, 1, 2), kwargs = {})
%view_213 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_51, [2, -1, 1280]), kwargs = {})
%_param_constant271 : [#users=1] = get_attr[target=_param_constant271]
%t_87 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant271,), kwargs = {})
%view_214 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_213, [288, 1280]), kwargs = {})
%_param_constant272 : [#users=1] = get_attr[target=_param_constant272]
%addmm_41 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant272, %view_214, %t_87), kwargs = {})
%view_215 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_41, [2, 144, 1280]), kwargs = {})
%add_137 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_215, %add_134), kwargs = {})
%_to_copy_140 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_137,), kwargs = {dtype: torch.float32})
%var_mean_44 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_140, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_114 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_44, 0), kwargs = {})
%getitem_115 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_44, 1), kwargs = {})
%add_138 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_114, 1e-05), kwargs = {})
%rsqrt_44 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_138,), kwargs = {})
%sub_44 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_137, %getitem_115), kwargs = {})
%mul_97 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_44, %rsqrt_44), kwargs = {})
%_param_constant273 : [#users=1] = get_attr[target=_param_constant273]
%mul_98 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_97, %_param_constant273), kwargs = {})
%_param_constant274 : [#users=1] = get_attr[target=_param_constant274]
%add_139 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_98, %_param_constant274), kwargs = {})
%_to_copy_141 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_139,), kwargs = {dtype: torch.float16})
%_param_constant275 : [#users=1] = get_attr[target=_param_constant275]
%t_88 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant275,), kwargs = {})
%view_216 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_141, [288, 1280]), kwargs = {})
%mm_46 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_216, %t_88), kwargs = {})
%_unsafe_view_53 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_46, [2, 144, 1280]), kwargs = {})
%_param_constant276 : [#users=1] = get_attr[target=_param_constant276]
%t_89 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant276,), kwargs = {})
%view_217 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_47 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_217, %t_89), kwargs = {})
%_unsafe_view_54 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_47, [2, 77, 1280]), kwargs = {})
%_param_constant277 : [#users=1] = get_attr[target=_param_constant277]
%t_90 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant277,), kwargs = {})
%view_218 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_48 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_218, %t_90), kwargs = {})
%_unsafe_view_55 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_48, [2, 77, 1280]), kwargs = {})
%view_219 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_53, [2, -1, 20, 64]), kwargs = {})
%transpose_52 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_219, 1, 2), kwargs = {})
%view_220 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_54, [2, -1, 20, 64]), kwargs = {})
%transpose_53 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_220, 1, 2), kwargs = {})
%view_221 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_55, [2, -1, 20, 64]), kwargs = {})
%transpose_54 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_221, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_13 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_52, %transpose_53, %transpose_54, True), kwargs = {})
%getitem_116 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_13, 0), kwargs = {})
%getitem_117 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_13, 1), kwargs = {})
%detach_63 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_116,), kwargs = {})
%transpose_55 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_116, 1, 2), kwargs = {})
%view_222 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_55, [2, -1, 1280]), kwargs = {})
%_param_constant278 : [#users=1] = get_attr[target=_param_constant278]
%t_91 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant278,), kwargs = {})
%view_223 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_222, [288, 1280]), kwargs = {})
%_param_constant279 : [#users=1] = get_attr[target=_param_constant279]
%addmm_42 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant279, %view_223, %t_91), kwargs = {})
%view_224 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_42, [2, 144, 1280]), kwargs = {})
%add_140 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_224, %add_137), kwargs = {})
%_to_copy_142 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_140,), kwargs = {dtype: torch.float32})
%var_mean_45 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_142, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_118 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_45, 0), kwargs = {})
%getitem_119 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_45, 1), kwargs = {})
%add_141 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_118, 1e-05), kwargs = {})
%rsqrt_45 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_141,), kwargs = {})
%sub_45 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_140, %getitem_119), kwargs = {})
%mul_99 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_45, %rsqrt_45), kwargs = {})
%_param_constant280 : [#users=1] = get_attr[target=_param_constant280]
%mul_100 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_99, %_param_constant280), kwargs = {})
%_param_constant281 : [#users=1] = get_attr[target=_param_constant281]
%add_142 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_100, %_param_constant281), kwargs = {})
%_to_copy_143 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_142,), kwargs = {dtype: torch.float16})
%_param_constant282 : [#users=1] = get_attr[target=_param_constant282]
%t_92 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant282,), kwargs = {})
%view_225 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_143, [288, 1280]), kwargs = {})
%_param_constant283 : [#users=1] = get_attr[target=_param_constant283]
%addmm_43 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant283, %view_225, %t_92), kwargs = {})
%view_226 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_43, [2, 144, 10240]), kwargs = {})
%slice_37 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_226, -1, 0, 5120), kwargs = {})
%slice_38 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_226, -1, 5120, 10240), kwargs = {})
%gelu_6 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_38,), kwargs = {})
%mul_101 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_37, %gelu_6), kwargs = {})
%_param_constant284 : [#users=1] = get_attr[target=_param_constant284]
%t_93 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant284,), kwargs = {})
%view_227 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_101, [288, 5120]), kwargs = {})
%_param_constant285 : [#users=1] = get_attr[target=_param_constant285]
%addmm_44 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant285, %view_227, %t_93), kwargs = {})
%view_228 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_44, [2, 144, 1280]), kwargs = {})
%add_143 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_228, %add_140), kwargs = {})
%_param_constant286 : [#users=1] = get_attr[target=_param_constant286]
%t_94 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant286,), kwargs = {})
%view_229 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_143, [288, 1280]), kwargs = {})
%_param_constant287 : [#users=1] = get_attr[target=_param_constant287]
%addmm_45 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant287, %view_229, %t_94), kwargs = {})
%view_230 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_45, [2, 144, 1280]), kwargs = {})
%view_231 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_230, [2, 12, 12, 1280]), kwargs = {})
%permute_13 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_231, [0, 3, 1, 2]), kwargs = {})
%clone_13 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_13,), kwargs = {memory_format: torch.contiguous_format})
%add_144 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%clone_13, %div_9), kwargs = {})
%view_232 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_144, [2, 32, 40, 144]), kwargs = {})
%_to_copy_144 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_232,), kwargs = {dtype: torch.float32})
%var_mean_46 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_144, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_120 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_46, 0), kwargs = {})
%getitem_121 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_46, 1), kwargs = {})
%add_145 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_120, 1e-05), kwargs = {})
%rsqrt_46 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_145,), kwargs = {})
%sub_46 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_232, %getitem_121), kwargs = {})
%mul_102 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_46, %rsqrt_46), kwargs = {})
%view_233 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_102, [2, 1280, 12, 12]), kwargs = {})
%_param_constant288 : [#users=1] = get_attr[target=_param_constant288]
%unsqueeze_170 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant288, 0), kwargs = {})
%unsqueeze_171 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_170, 2), kwargs = {})
%unsqueeze_172 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_171, 3), kwargs = {})
%_param_constant289 : [#users=1] = get_attr[target=_param_constant289]
%unsqueeze_173 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant289, 0), kwargs = {})
%unsqueeze_174 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_173, 2), kwargs = {})
%unsqueeze_175 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_174, 3), kwargs = {})
%mul_103 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_233, %unsqueeze_175), kwargs = {})
%add_146 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_103, %unsqueeze_172), kwargs = {})
%_to_copy_145 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_146,), kwargs = {dtype: torch.float16})
%_to_copy_146 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_121,), kwargs = {dtype: torch.float16})
%_to_copy_147 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_46,), kwargs = {dtype: torch.float16})
%squeeze_50 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_146, [2, 3]), kwargs = {})
%squeeze_51 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_147, [2, 3]), kwargs = {})
%detach_64 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_50,), kwargs = {})
%detach_65 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_51,), kwargs = {})
%silu_28 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_145,), kwargs = {})
%_param_constant290 : [#users=1] = get_attr[target=_param_constant290]
%_param_constant291 : [#users=1] = get_attr[target=_param_constant291]
%convolution_24 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_28, %_param_constant290, %_param_constant291, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_29 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant292 : [#users=1] = get_attr[target=_param_constant292]
%t_95 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant292,), kwargs = {})
%_param_constant293 : [#users=1] = get_attr[target=_param_constant293]
%addmm_46 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant293, %silu_29, %t_95), kwargs = {})
%slice_39 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_46, 0, 0, 9223372036854775807), kwargs = {})
%slice_40 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_39, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_176 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_40, 2), kwargs = {})
%unsqueeze_177 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_176, 3), kwargs = {})
%add_147 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_24, %unsqueeze_177), kwargs = {})
%view_234 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_147, [2, 32, 40, 144]), kwargs = {})
%_to_copy_148 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_234,), kwargs = {dtype: torch.float32})
%var_mean_47 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_148, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_122 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_47, 0), kwargs = {})
%getitem_123 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_47, 1), kwargs = {})
%add_148 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_122, 1e-05), kwargs = {})
%rsqrt_47 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_148,), kwargs = {})
%sub_47 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_234, %getitem_123), kwargs = {})
%mul_104 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_47, %rsqrt_47), kwargs = {})
%view_235 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_104, [2, 1280, 12, 12]), kwargs = {})
%_param_constant294 : [#users=1] = get_attr[target=_param_constant294]
%unsqueeze_178 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant294, 0), kwargs = {})
%unsqueeze_179 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_178, 2), kwargs = {})
%unsqueeze_180 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_179, 3), kwargs = {})
%_param_constant295 : [#users=1] = get_attr[target=_param_constant295]
%unsqueeze_181 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant295, 0), kwargs = {})
%unsqueeze_182 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_181, 2), kwargs = {})
%unsqueeze_183 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_182, 3), kwargs = {})
%mul_105 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_235, %unsqueeze_183), kwargs = {})
%add_149 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_105, %unsqueeze_180), kwargs = {})
%_to_copy_149 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_149,), kwargs = {dtype: torch.float16})
%_to_copy_150 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_123,), kwargs = {dtype: torch.float16})
%_to_copy_151 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_47,), kwargs = {dtype: torch.float16})
%squeeze_52 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_150, [2, 3]), kwargs = {})
%squeeze_53 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_151, [2, 3]), kwargs = {})
%detach_66 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_52,), kwargs = {})
%detach_67 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_53,), kwargs = {})
%silu_30 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_149,), kwargs = {})
%_param_constant296 : [#users=1] = get_attr[target=_param_constant296]
%_param_constant297 : [#users=1] = get_attr[target=_param_constant297]
%convolution_25 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_30, %_param_constant296, %_param_constant297, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%add_150 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_144, %convolution_25), kwargs = {})
%div_10 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_150, 1), kwargs = {})
%cat_2 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_10, %div_8], 1), kwargs = {})
%view_236 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_2, [2, 32, 80, 144]), kwargs = {})
%_to_copy_152 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_236,), kwargs = {dtype: torch.float32})
%var_mean_48 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_152, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_124 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_48, 0), kwargs = {})
%getitem_125 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_48, 1), kwargs = {})
%add_151 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_124, 1e-05), kwargs = {})
%rsqrt_48 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_151,), kwargs = {})
%sub_48 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_236, %getitem_125), kwargs = {})
%mul_106 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_48, %rsqrt_48), kwargs = {})
%view_237 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_106, [2, 2560, 12, 12]), kwargs = {})
%_param_constant298 : [#users=1] = get_attr[target=_param_constant298]
%unsqueeze_184 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant298, 0), kwargs = {})
%unsqueeze_185 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_184, 2), kwargs = {})
%unsqueeze_186 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_185, 3), kwargs = {})
%_param_constant299 : [#users=1] = get_attr[target=_param_constant299]
%unsqueeze_187 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant299, 0), kwargs = {})
%unsqueeze_188 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_187, 2), kwargs = {})
%unsqueeze_189 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_188, 3), kwargs = {})
%mul_107 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_237, %unsqueeze_189), kwargs = {})
%add_152 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_107, %unsqueeze_186), kwargs = {})
%_to_copy_153 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_152,), kwargs = {dtype: torch.float16})
%_to_copy_154 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_125,), kwargs = {dtype: torch.float16})
%_to_copy_155 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_48,), kwargs = {dtype: torch.float16})
%squeeze_54 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_154, [2, 3]), kwargs = {})
%squeeze_55 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_155, [2, 3]), kwargs = {})
%detach_68 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_54,), kwargs = {})
%detach_69 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_55,), kwargs = {})
%silu_31 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_153,), kwargs = {})
%_param_constant300 : [#users=1] = get_attr[target=_param_constant300]
%_param_constant301 : [#users=1] = get_attr[target=_param_constant301]
%convolution_26 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_31, %_param_constant300, %_param_constant301, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_32 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant302 : [#users=1] = get_attr[target=_param_constant302]
%t_96 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant302,), kwargs = {})
%_param_constant303 : [#users=1] = get_attr[target=_param_constant303]
%addmm_47 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant303, %silu_32, %t_96), kwargs = {})
%slice_41 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_47, 0, 0, 9223372036854775807), kwargs = {})
%slice_42 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_41, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_190 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_42, 2), kwargs = {})
%unsqueeze_191 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_190, 3), kwargs = {})
%add_153 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_26, %unsqueeze_191), kwargs = {})
%view_238 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_153, [2, 32, 40, 144]), kwargs = {})
%_to_copy_156 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_238,), kwargs = {dtype: torch.float32})
%var_mean_49 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_156, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_126 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_49, 0), kwargs = {})
%getitem_127 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_49, 1), kwargs = {})
%add_154 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_126, 1e-05), kwargs = {})
%rsqrt_49 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_154,), kwargs = {})
%sub_49 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_238, %getitem_127), kwargs = {})
%mul_108 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_49, %rsqrt_49), kwargs = {})
%view_239 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_108, [2, 1280, 12, 12]), kwargs = {})
%_param_constant304 : [#users=1] = get_attr[target=_param_constant304]
%unsqueeze_192 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant304, 0), kwargs = {})
%unsqueeze_193 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_192, 2), kwargs = {})
%unsqueeze_194 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_193, 3), kwargs = {})
%_param_constant305 : [#users=1] = get_attr[target=_param_constant305]
%unsqueeze_195 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant305, 0), kwargs = {})
%unsqueeze_196 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_195, 2), kwargs = {})
%unsqueeze_197 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_196, 3), kwargs = {})
%mul_109 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_239, %unsqueeze_197), kwargs = {})
%add_155 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_109, %unsqueeze_194), kwargs = {})
%_to_copy_157 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_155,), kwargs = {dtype: torch.float16})
%_to_copy_158 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_127,), kwargs = {dtype: torch.float16})
%_to_copy_159 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_49,), kwargs = {dtype: torch.float16})
%squeeze_56 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_158, [2, 3]), kwargs = {})
%squeeze_57 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_159, [2, 3]), kwargs = {})
%detach_70 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_56,), kwargs = {})
%detach_71 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_57,), kwargs = {})
%silu_33 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_157,), kwargs = {})
%_param_constant306 : [#users=1] = get_attr[target=_param_constant306]
%_param_constant307 : [#users=1] = get_attr[target=_param_constant307]
%convolution_27 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_33, %_param_constant306, %_param_constant307, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant308 : [#users=1] = get_attr[target=_param_constant308]
%_param_constant309 : [#users=1] = get_attr[target=_param_constant309]
%convolution_28 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_2, %_param_constant308, %_param_constant309, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_156 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_28, %convolution_27), kwargs = {})
%div_11 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_156, 1.0), kwargs = {})
%cat_3 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_11, %div_7], 1), kwargs = {})
%view_240 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_3, [2, 32, 80, 144]), kwargs = {})
%_to_copy_160 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_240,), kwargs = {dtype: torch.float32})
%var_mean_50 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_160, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_128 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_50, 0), kwargs = {})
%getitem_129 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_50, 1), kwargs = {})
%add_157 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_128, 1e-05), kwargs = {})
%rsqrt_50 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_157,), kwargs = {})
%sub_50 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_240, %getitem_129), kwargs = {})
%mul_110 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_50, %rsqrt_50), kwargs = {})
%view_241 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_110, [2, 2560, 12, 12]), kwargs = {})
%_param_constant310 : [#users=1] = get_attr[target=_param_constant310]
%unsqueeze_198 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant310, 0), kwargs = {})
%unsqueeze_199 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_198, 2), kwargs = {})
%unsqueeze_200 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_199, 3), kwargs = {})
%_param_constant311 : [#users=1] = get_attr[target=_param_constant311]
%unsqueeze_201 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant311, 0), kwargs = {})
%unsqueeze_202 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_201, 2), kwargs = {})
%unsqueeze_203 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_202, 3), kwargs = {})
%mul_111 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_241, %unsqueeze_203), kwargs = {})
%add_158 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_111, %unsqueeze_200), kwargs = {})
%_to_copy_161 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_158,), kwargs = {dtype: torch.float16})
%_to_copy_162 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_129,), kwargs = {dtype: torch.float16})
%_to_copy_163 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_50,), kwargs = {dtype: torch.float16})
%squeeze_58 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_162, [2, 3]), kwargs = {})
%squeeze_59 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_163, [2, 3]), kwargs = {})
%detach_72 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_58,), kwargs = {})
%detach_73 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_59,), kwargs = {})
%silu_34 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_161,), kwargs = {})
%_param_constant312 : [#users=1] = get_attr[target=_param_constant312]
%_param_constant313 : [#users=1] = get_attr[target=_param_constant313]
%convolution_29 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_34, %_param_constant312, %_param_constant313, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_35 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant314 : [#users=1] = get_attr[target=_param_constant314]
%t_97 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant314,), kwargs = {})
%_param_constant315 : [#users=1] = get_attr[target=_param_constant315]
%addmm_48 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant315, %silu_35, %t_97), kwargs = {})
%slice_43 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_48, 0, 0, 9223372036854775807), kwargs = {})
%slice_44 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_43, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_204 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_44, 2), kwargs = {})
%unsqueeze_205 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_204, 3), kwargs = {})
%add_159 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_29, %unsqueeze_205), kwargs = {})
%view_242 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_159, [2, 32, 40, 144]), kwargs = {})
%_to_copy_164 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_242,), kwargs = {dtype: torch.float32})
%var_mean_51 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_164, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_130 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_51, 0), kwargs = {})
%getitem_131 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_51, 1), kwargs = {})
%add_160 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_130, 1e-05), kwargs = {})
%rsqrt_51 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_160,), kwargs = {})
%sub_51 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_242, %getitem_131), kwargs = {})
%mul_112 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_51, %rsqrt_51), kwargs = {})
%view_243 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_112, [2, 1280, 12, 12]), kwargs = {})
%_param_constant316 : [#users=1] = get_attr[target=_param_constant316]
%unsqueeze_206 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant316, 0), kwargs = {})
%unsqueeze_207 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_206, 2), kwargs = {})
%unsqueeze_208 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_207, 3), kwargs = {})
%_param_constant317 : [#users=1] = get_attr[target=_param_constant317]
%unsqueeze_209 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant317, 0), kwargs = {})
%unsqueeze_210 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_209, 2), kwargs = {})
%unsqueeze_211 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_210, 3), kwargs = {})
%mul_113 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_243, %unsqueeze_211), kwargs = {})
%add_161 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_113, %unsqueeze_208), kwargs = {})
%_to_copy_165 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_161,), kwargs = {dtype: torch.float16})
%_to_copy_166 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_131,), kwargs = {dtype: torch.float16})
%_to_copy_167 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_51,), kwargs = {dtype: torch.float16})
%squeeze_60 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_166, [2, 3]), kwargs = {})
%squeeze_61 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_167, [2, 3]), kwargs = {})
%detach_74 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_60,), kwargs = {})
%detach_75 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_61,), kwargs = {})
%silu_36 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_165,), kwargs = {})
%_param_constant318 : [#users=1] = get_attr[target=_param_constant318]
%_param_constant319 : [#users=1] = get_attr[target=_param_constant319]
%convolution_30 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_36, %_param_constant318, %_param_constant319, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant320 : [#users=1] = get_attr[target=_param_constant320]
%_param_constant321 : [#users=1] = get_attr[target=_param_constant321]
%convolution_31 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_3, %_param_constant320, %_param_constant321, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_162 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_31, %convolution_30), kwargs = {})
%div_12 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_162, 1.0), kwargs = {})
%cat_4 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%div_12, %convolution_17], 1), kwargs = {})
%view_244 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_4, [2, 32, 80, 144]), kwargs = {})
%_to_copy_168 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_244,), kwargs = {dtype: torch.float32})
%var_mean_52 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_168, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_132 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_52, 0), kwargs = {})
%getitem_133 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_52, 1), kwargs = {})
%add_163 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_132, 1e-05), kwargs = {})
%rsqrt_52 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_163,), kwargs = {})
%sub_52 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_244, %getitem_133), kwargs = {})
%mul_114 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_52, %rsqrt_52), kwargs = {})
%view_245 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_114, [2, 2560, 12, 12]), kwargs = {})
%_param_constant322 : [#users=1] = get_attr[target=_param_constant322]
%unsqueeze_212 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant322, 0), kwargs = {})
%unsqueeze_213 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_212, 2), kwargs = {})
%unsqueeze_214 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_213, 3), kwargs = {})
%_param_constant323 : [#users=1] = get_attr[target=_param_constant323]
%unsqueeze_215 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant323, 0), kwargs = {})
%unsqueeze_216 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_215, 2), kwargs = {})
%unsqueeze_217 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_216, 3), kwargs = {})
%mul_115 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_245, %unsqueeze_217), kwargs = {})
%add_164 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_115, %unsqueeze_214), kwargs = {})
%_to_copy_169 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_164,), kwargs = {dtype: torch.float16})
%_to_copy_170 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_133,), kwargs = {dtype: torch.float16})
%_to_copy_171 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_52,), kwargs = {dtype: torch.float16})
%squeeze_62 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_170, [2, 3]), kwargs = {})
%squeeze_63 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_171, [2, 3]), kwargs = {})
%detach_76 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_62,), kwargs = {})
%detach_77 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_63,), kwargs = {})
%silu_37 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_169,), kwargs = {})
%_param_constant324 : [#users=1] = get_attr[target=_param_constant324]
%_param_constant325 : [#users=1] = get_attr[target=_param_constant325]
%convolution_32 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_37, %_param_constant324, %_param_constant325, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_38 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant326 : [#users=1] = get_attr[target=_param_constant326]
%t_98 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant326,), kwargs = {})
%_param_constant327 : [#users=1] = get_attr[target=_param_constant327]
%addmm_49 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant327, %silu_38, %t_98), kwargs = {})
%slice_45 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_49, 0, 0, 9223372036854775807), kwargs = {})
%slice_46 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_45, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_218 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_46, 2), kwargs = {})
%unsqueeze_219 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_218, 3), kwargs = {})
%add_165 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_32, %unsqueeze_219), kwargs = {})
%view_246 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_165, [2, 32, 40, 144]), kwargs = {})
%_to_copy_172 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_246,), kwargs = {dtype: torch.float32})
%var_mean_53 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_172, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_134 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_53, 0), kwargs = {})
%getitem_135 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_53, 1), kwargs = {})
%add_166 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_134, 1e-05), kwargs = {})
%rsqrt_53 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_166,), kwargs = {})
%sub_53 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_246, %getitem_135), kwargs = {})
%mul_116 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_53, %rsqrt_53), kwargs = {})
%view_247 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_116, [2, 1280, 12, 12]), kwargs = {})
%_param_constant328 : [#users=1] = get_attr[target=_param_constant328]
%unsqueeze_220 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant328, 0), kwargs = {})
%unsqueeze_221 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_220, 2), kwargs = {})
%unsqueeze_222 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_221, 3), kwargs = {})
%_param_constant329 : [#users=1] = get_attr[target=_param_constant329]
%unsqueeze_223 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant329, 0), kwargs = {})
%unsqueeze_224 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_223, 2), kwargs = {})
%unsqueeze_225 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_224, 3), kwargs = {})
%mul_117 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_247, %unsqueeze_225), kwargs = {})
%add_167 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_117, %unsqueeze_222), kwargs = {})
%_to_copy_173 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_167,), kwargs = {dtype: torch.float16})
%_to_copy_174 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_135,), kwargs = {dtype: torch.float16})
%_to_copy_175 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_53,), kwargs = {dtype: torch.float16})
%squeeze_64 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_174, [2, 3]), kwargs = {})
%squeeze_65 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_175, [2, 3]), kwargs = {})
%detach_78 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_64,), kwargs = {})
%detach_79 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_65,), kwargs = {})
%silu_39 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_173,), kwargs = {})
%_param_constant330 : [#users=1] = get_attr[target=_param_constant330]
%_param_constant331 : [#users=1] = get_attr[target=_param_constant331]
%convolution_33 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_39, %_param_constant330, %_param_constant331, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant332 : [#users=1] = get_attr[target=_param_constant332]
%_param_constant333 : [#users=1] = get_attr[target=_param_constant333]
%convolution_34 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_4, %_param_constant332, %_param_constant333, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_168 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_34, %convolution_33), kwargs = {})
%div_13 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%add_168, 1.0), kwargs = {})
%upsample_nearest2d : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%div_13, [24, 24], 2.0, 2.0), kwargs = {})
%_param_constant334 : [#users=1] = get_attr[target=_param_constant334]
%_param_constant335 : [#users=1] = get_attr[target=_param_constant335]
%convolution_35 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d, %_param_constant334, %_param_constant335, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%cat_5 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_35, %add_113], 1), kwargs = {})
%view_248 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_5, [2, 32, 80, 576]), kwargs = {})
%_to_copy_176 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_248,), kwargs = {dtype: torch.float32})
%var_mean_54 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_176, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_136 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_54, 0), kwargs = {})
%getitem_137 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_54, 1), kwargs = {})
%add_169 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_136, 1e-05), kwargs = {})
%rsqrt_54 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_169,), kwargs = {})
%sub_54 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_248, %getitem_137), kwargs = {})
%mul_118 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_54, %rsqrt_54), kwargs = {})
%view_249 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_118, [2, 2560, 24, 24]), kwargs = {})
%_param_constant336 : [#users=1] = get_attr[target=_param_constant336]
%unsqueeze_226 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant336, 0), kwargs = {})
%unsqueeze_227 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_226, 2), kwargs = {})
%unsqueeze_228 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_227, 3), kwargs = {})
%_param_constant337 : [#users=1] = get_attr[target=_param_constant337]
%unsqueeze_229 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant337, 0), kwargs = {})
%unsqueeze_230 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_229, 2), kwargs = {})
%unsqueeze_231 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_230, 3), kwargs = {})
%mul_119 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_249, %unsqueeze_231), kwargs = {})
%add_170 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_119, %unsqueeze_228), kwargs = {})
%_to_copy_177 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_170,), kwargs = {dtype: torch.float16})
%_to_copy_178 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_137,), kwargs = {dtype: torch.float16})
%_to_copy_179 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_54,), kwargs = {dtype: torch.float16})
%squeeze_66 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_178, [2, 3]), kwargs = {})
%squeeze_67 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_179, [2, 3]), kwargs = {})
%detach_80 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_66,), kwargs = {})
%detach_81 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_67,), kwargs = {})
%silu_40 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_177,), kwargs = {})
%_param_constant338 : [#users=1] = get_attr[target=_param_constant338]
%_param_constant339 : [#users=1] = get_attr[target=_param_constant339]
%convolution_36 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_40, %_param_constant338, %_param_constant339, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_41 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant340 : [#users=1] = get_attr[target=_param_constant340]
%t_99 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant340,), kwargs = {})
%_param_constant341 : [#users=1] = get_attr[target=_param_constant341]
%addmm_50 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant341, %silu_41, %t_99), kwargs = {})
%slice_47 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_50, 0, 0, 9223372036854775807), kwargs = {})
%slice_48 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_47, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_232 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_48, 2), kwargs = {})
%unsqueeze_233 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_232, 3), kwargs = {})
%add_171 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_36, %unsqueeze_233), kwargs = {})
%view_250 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_171, [2, 32, 40, 576]), kwargs = {})
%_to_copy_180 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_250,), kwargs = {dtype: torch.float32})
%var_mean_55 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_180, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_138 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_55, 0), kwargs = {})
%getitem_139 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_55, 1), kwargs = {})
%add_172 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_138, 1e-05), kwargs = {})
%rsqrt_55 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_172,), kwargs = {})
%sub_55 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_250, %getitem_139), kwargs = {})
%mul_120 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_55, %rsqrt_55), kwargs = {})
%view_251 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_120, [2, 1280, 24, 24]), kwargs = {})
%_param_constant342 : [#users=1] = get_attr[target=_param_constant342]
%unsqueeze_234 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant342, 0), kwargs = {})
%unsqueeze_235 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_234, 2), kwargs = {})
%unsqueeze_236 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_235, 3), kwargs = {})
%_param_constant343 : [#users=1] = get_attr[target=_param_constant343]
%unsqueeze_237 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant343, 0), kwargs = {})
%unsqueeze_238 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_237, 2), kwargs = {})
%unsqueeze_239 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_238, 3), kwargs = {})
%mul_121 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_251, %unsqueeze_239), kwargs = {})
%add_173 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_121, %unsqueeze_236), kwargs = {})
%_to_copy_181 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_173,), kwargs = {dtype: torch.float16})
%_to_copy_182 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_139,), kwargs = {dtype: torch.float16})
%_to_copy_183 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_55,), kwargs = {dtype: torch.float16})
%squeeze_68 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_182, [2, 3]), kwargs = {})
%squeeze_69 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_183, [2, 3]), kwargs = {})
%detach_82 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_68,), kwargs = {})
%detach_83 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_69,), kwargs = {})
%silu_42 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_181,), kwargs = {})
%_param_constant344 : [#users=1] = get_attr[target=_param_constant344]
%_param_constant345 : [#users=1] = get_attr[target=_param_constant345]
%convolution_37 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_42, %_param_constant344, %_param_constant345, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant346 : [#users=1] = get_attr[target=_param_constant346]
%_param_constant347 : [#users=1] = get_attr[target=_param_constant347]
%convolution_38 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_5, %_param_constant346, %_param_constant347, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_174 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_38, %convolution_37), kwargs = {})
%div_14 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_174, 1.0), kwargs = {})
%view_252 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_14, [2, 32, 40, 576]), kwargs = {})
%_to_copy_184 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_252,), kwargs = {dtype: torch.float32})
%var_mean_56 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_184, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_140 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_56, 0), kwargs = {})
%getitem_141 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_56, 1), kwargs = {})
%add_175 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_140, 1e-06), kwargs = {})
%rsqrt_56 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_175,), kwargs = {})
%sub_56 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_252, %getitem_141), kwargs = {})
%mul_122 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_56, %rsqrt_56), kwargs = {})
%view_253 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_122, [2, 1280, 24, 24]), kwargs = {})
%_param_constant348 : [#users=1] = get_attr[target=_param_constant348]
%unsqueeze_240 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant348, 0), kwargs = {})
%unsqueeze_241 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_240, 2), kwargs = {})
%unsqueeze_242 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_241, 3), kwargs = {})
%_param_constant349 : [#users=1] = get_attr[target=_param_constant349]
%unsqueeze_243 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant349, 0), kwargs = {})
%unsqueeze_244 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_243, 2), kwargs = {})
%unsqueeze_245 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_244, 3), kwargs = {})
%mul_123 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_253, %unsqueeze_245), kwargs = {})
%add_176 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_123, %unsqueeze_242), kwargs = {})
%_to_copy_185 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_176,), kwargs = {dtype: torch.float16})
%_to_copy_186 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_141,), kwargs = {dtype: torch.float16})
%_to_copy_187 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_56,), kwargs = {dtype: torch.float16})
%squeeze_70 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_186, [2, 3]), kwargs = {})
%squeeze_71 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_187, [2, 3]), kwargs = {})
%detach_84 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_70,), kwargs = {})
%detach_85 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_71,), kwargs = {})
%permute_14 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_185, [0, 2, 3, 1]), kwargs = {})
%view_254 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_14, [2, 576, 1280]), kwargs = {})
%_param_constant350 : [#users=1] = get_attr[target=_param_constant350]
%t_100 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant350,), kwargs = {})
%clone_14 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_254,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_56 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_14, [1152, 1280]), kwargs = {})
%mm_49 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_56, %t_100), kwargs = {})
%_unsafe_view_57 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_49, [2, 576, 1280]), kwargs = {})
%_param_constant351 : [#users=1] = get_attr[target=_param_constant351]
%add_177 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_57, %_param_constant351), kwargs = {})
%_to_copy_188 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_177,), kwargs = {dtype: torch.float32})
%var_mean_57 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_188, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_142 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_57, 0), kwargs = {})
%getitem_143 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_57, 1), kwargs = {})
%add_178 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_142, 1e-05), kwargs = {})
%rsqrt_57 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_178,), kwargs = {})
%sub_57 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_177, %getitem_143), kwargs = {})
%mul_124 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_57, %rsqrt_57), kwargs = {})
%_param_constant352 : [#users=1] = get_attr[target=_param_constant352]
%mul_125 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_124, %_param_constant352), kwargs = {})
%_param_constant353 : [#users=1] = get_attr[target=_param_constant353]
%add_179 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_125, %_param_constant353), kwargs = {})
%_to_copy_189 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_179,), kwargs = {dtype: torch.float16})
%_param_constant354 : [#users=1] = get_attr[target=_param_constant354]
%t_101 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant354,), kwargs = {})
%view_255 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_189, [1152, 1280]), kwargs = {})
%mm_50 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_255, %t_101), kwargs = {})
%_unsafe_view_58 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_50, [2, 576, 1280]), kwargs = {})
%_param_constant355 : [#users=1] = get_attr[target=_param_constant355]
%t_102 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant355,), kwargs = {})
%view_256 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_189, [1152, 1280]), kwargs = {})
%mm_51 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_256, %t_102), kwargs = {})
%_unsafe_view_59 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_51, [2, 576, 1280]), kwargs = {})
%_param_constant356 : [#users=1] = get_attr[target=_param_constant356]
%t_103 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant356,), kwargs = {})
%view_257 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_189, [1152, 1280]), kwargs = {})
%mm_52 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_257, %t_103), kwargs = {})
%_unsafe_view_60 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_52, [2, 576, 1280]), kwargs = {})
%view_258 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_58, [2, -1, 20, 64]), kwargs = {})
%transpose_56 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_258, 1, 2), kwargs = {})
%view_259 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_59, [2, -1, 20, 64]), kwargs = {})
%transpose_57 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_259, 1, 2), kwargs = {})
%view_260 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_60, [2, -1, 20, 64]), kwargs = {})
%transpose_58 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_260, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_14 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_56, %transpose_57, %transpose_58, True), kwargs = {})
%getitem_144 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_14, 0), kwargs = {})
%getitem_145 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_14, 1), kwargs = {})
%detach_86 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_144,), kwargs = {})
%transpose_59 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_144, 1, 2), kwargs = {})
%view_261 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_59, [2, -1, 1280]), kwargs = {})
%_param_constant357 : [#users=1] = get_attr[target=_param_constant357]
%t_104 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant357,), kwargs = {})
%view_262 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_261, [1152, 1280]), kwargs = {})
%_param_constant358 : [#users=1] = get_attr[target=_param_constant358]
%addmm_51 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant358, %view_262, %t_104), kwargs = {})
%view_263 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_51, [2, 576, 1280]), kwargs = {})
%add_180 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_263, %add_177), kwargs = {})
%_to_copy_190 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_180,), kwargs = {dtype: torch.float32})
%var_mean_58 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_190, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_146 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_58, 0), kwargs = {})
%getitem_147 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_58, 1), kwargs = {})
%add_181 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_146, 1e-05), kwargs = {})
%rsqrt_58 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_181,), kwargs = {})
%sub_58 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_180, %getitem_147), kwargs = {})
%mul_126 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_58, %rsqrt_58), kwargs = {})
%_param_constant359 : [#users=1] = get_attr[target=_param_constant359]
%mul_127 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_126, %_param_constant359), kwargs = {})
%_param_constant360 : [#users=1] = get_attr[target=_param_constant360]
%add_182 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_127, %_param_constant360), kwargs = {})
%_to_copy_191 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_182,), kwargs = {dtype: torch.float16})
%_param_constant361 : [#users=1] = get_attr[target=_param_constant361]
%t_105 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant361,), kwargs = {})
%view_264 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_191, [1152, 1280]), kwargs = {})
%mm_53 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_264, %t_105), kwargs = {})
%_unsafe_view_61 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_53, [2, 576, 1280]), kwargs = {})
%_param_constant362 : [#users=1] = get_attr[target=_param_constant362]
%t_106 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant362,), kwargs = {})
%view_265 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_54 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_265, %t_106), kwargs = {})
%_unsafe_view_62 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_54, [2, 77, 1280]), kwargs = {})
%_param_constant363 : [#users=1] = get_attr[target=_param_constant363]
%t_107 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant363,), kwargs = {})
%view_266 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_55 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_266, %t_107), kwargs = {})
%_unsafe_view_63 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_55, [2, 77, 1280]), kwargs = {})
%view_267 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_61, [2, -1, 20, 64]), kwargs = {})
%transpose_60 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_267, 1, 2), kwargs = {})
%view_268 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_62, [2, -1, 20, 64]), kwargs = {})
%transpose_61 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_268, 1, 2), kwargs = {})
%view_269 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_63, [2, -1, 20, 64]), kwargs = {})
%transpose_62 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_269, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_15 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_60, %transpose_61, %transpose_62, True), kwargs = {})
%getitem_148 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_15, 0), kwargs = {})
%getitem_149 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_15, 1), kwargs = {})
%detach_87 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_148,), kwargs = {})
%transpose_63 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_148, 1, 2), kwargs = {})
%view_270 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_63, [2, -1, 1280]), kwargs = {})
%_param_constant364 : [#users=1] = get_attr[target=_param_constant364]
%t_108 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant364,), kwargs = {})
%view_271 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_270, [1152, 1280]), kwargs = {})
%_param_constant365 : [#users=1] = get_attr[target=_param_constant365]
%addmm_52 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant365, %view_271, %t_108), kwargs = {})
%view_272 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_52, [2, 576, 1280]), kwargs = {})
%add_183 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_272, %add_180), kwargs = {})
%_to_copy_192 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_183,), kwargs = {dtype: torch.float32})
%var_mean_59 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_192, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_150 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_59, 0), kwargs = {})
%getitem_151 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_59, 1), kwargs = {})
%add_184 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_150, 1e-05), kwargs = {})
%rsqrt_59 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_184,), kwargs = {})
%sub_59 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_183, %getitem_151), kwargs = {})
%mul_128 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_59, %rsqrt_59), kwargs = {})
%_param_constant366 : [#users=1] = get_attr[target=_param_constant366]
%mul_129 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_128, %_param_constant366), kwargs = {})
%_param_constant367 : [#users=1] = get_attr[target=_param_constant367]
%add_185 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_129, %_param_constant367), kwargs = {})
%_to_copy_193 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_185,), kwargs = {dtype: torch.float16})
%_param_constant368 : [#users=1] = get_attr[target=_param_constant368]
%t_109 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant368,), kwargs = {})
%view_273 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_193, [1152, 1280]), kwargs = {})
%_param_constant369 : [#users=1] = get_attr[target=_param_constant369]
%addmm_53 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant369, %view_273, %t_109), kwargs = {})
%view_274 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_53, [2, 576, 10240]), kwargs = {})
%slice_49 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_274, -1, 0, 5120), kwargs = {})
%slice_50 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_274, -1, 5120, 10240), kwargs = {})
%gelu_7 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_50,), kwargs = {})
%mul_130 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_49, %gelu_7), kwargs = {})
%_param_constant370 : [#users=1] = get_attr[target=_param_constant370]
%t_110 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant370,), kwargs = {})
%view_275 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_130, [1152, 5120]), kwargs = {})
%_param_constant371 : [#users=1] = get_attr[target=_param_constant371]
%addmm_54 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant371, %view_275, %t_110), kwargs = {})
%view_276 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_54, [2, 576, 1280]), kwargs = {})
%add_186 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_276, %add_183), kwargs = {})
%_param_constant372 : [#users=1] = get_attr[target=_param_constant372]
%t_111 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant372,), kwargs = {})
%view_277 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_186, [1152, 1280]), kwargs = {})
%_param_constant373 : [#users=1] = get_attr[target=_param_constant373]
%addmm_55 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant373, %view_277, %t_111), kwargs = {})
%view_278 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_55, [2, 576, 1280]), kwargs = {})
%view_279 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_278, [2, 24, 24, 1280]), kwargs = {})
%permute_15 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_279, [0, 3, 1, 2]), kwargs = {})
%clone_15 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_15,), kwargs = {memory_format: torch.contiguous_format})
%add_187 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_15, %div_14), kwargs = {})
%cat_6 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_187, %add_94], 1), kwargs = {})
%view_280 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_6, [2, 32, 80, 576]), kwargs = {})
%_to_copy_194 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_280,), kwargs = {dtype: torch.float32})
%var_mean_60 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_194, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_152 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_60, 0), kwargs = {})
%getitem_153 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_60, 1), kwargs = {})
%add_188 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_152, 1e-05), kwargs = {})
%rsqrt_60 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_188,), kwargs = {})
%sub_60 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_280, %getitem_153), kwargs = {})
%mul_131 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_60, %rsqrt_60), kwargs = {})
%view_281 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_131, [2, 2560, 24, 24]), kwargs = {})
%_param_constant374 : [#users=1] = get_attr[target=_param_constant374]
%unsqueeze_246 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant374, 0), kwargs = {})
%unsqueeze_247 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_246, 2), kwargs = {})
%unsqueeze_248 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_247, 3), kwargs = {})
%_param_constant375 : [#users=1] = get_attr[target=_param_constant375]
%unsqueeze_249 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant375, 0), kwargs = {})
%unsqueeze_250 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_249, 2), kwargs = {})
%unsqueeze_251 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_250, 3), kwargs = {})
%mul_132 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_281, %unsqueeze_251), kwargs = {})
%add_189 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_132, %unsqueeze_248), kwargs = {})
%_to_copy_195 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_189,), kwargs = {dtype: torch.float16})
%_to_copy_196 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_153,), kwargs = {dtype: torch.float16})
%_to_copy_197 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_60,), kwargs = {dtype: torch.float16})
%squeeze_72 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_196, [2, 3]), kwargs = {})
%squeeze_73 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_197, [2, 3]), kwargs = {})
%detach_88 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_72,), kwargs = {})
%detach_89 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_73,), kwargs = {})
%silu_43 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_195,), kwargs = {})
%_param_constant376 : [#users=1] = get_attr[target=_param_constant376]
%_param_constant377 : [#users=1] = get_attr[target=_param_constant377]
%convolution_39 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_43, %_param_constant376, %_param_constant377, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_44 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant378 : [#users=1] = get_attr[target=_param_constant378]
%t_112 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant378,), kwargs = {})
%_param_constant379 : [#users=1] = get_attr[target=_param_constant379]
%addmm_56 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant379, %silu_44, %t_112), kwargs = {})
%slice_51 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_56, 0, 0, 9223372036854775807), kwargs = {})
%slice_52 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_51, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_252 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_52, 2), kwargs = {})
%unsqueeze_253 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_252, 3), kwargs = {})
%add_190 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_39, %unsqueeze_253), kwargs = {})
%view_282 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_190, [2, 32, 40, 576]), kwargs = {})
%_to_copy_198 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_282,), kwargs = {dtype: torch.float32})
%var_mean_61 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_198, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_154 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_61, 0), kwargs = {})
%getitem_155 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_61, 1), kwargs = {})
%add_191 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_154, 1e-05), kwargs = {})
%rsqrt_61 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_191,), kwargs = {})
%sub_61 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_282, %getitem_155), kwargs = {})
%mul_133 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_61, %rsqrt_61), kwargs = {})
%view_283 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_133, [2, 1280, 24, 24]), kwargs = {})
%_param_constant380 : [#users=1] = get_attr[target=_param_constant380]
%unsqueeze_254 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant380, 0), kwargs = {})
%unsqueeze_255 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_254, 2), kwargs = {})
%unsqueeze_256 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_255, 3), kwargs = {})
%_param_constant381 : [#users=1] = get_attr[target=_param_constant381]
%unsqueeze_257 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant381, 0), kwargs = {})
%unsqueeze_258 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_257, 2), kwargs = {})
%unsqueeze_259 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_258, 3), kwargs = {})
%mul_134 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_283, %unsqueeze_259), kwargs = {})
%add_192 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_134, %unsqueeze_256), kwargs = {})
%_to_copy_199 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_192,), kwargs = {dtype: torch.float16})
%_to_copy_200 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_155,), kwargs = {dtype: torch.float16})
%_to_copy_201 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_61,), kwargs = {dtype: torch.float16})
%squeeze_74 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_200, [2, 3]), kwargs = {})
%squeeze_75 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_201, [2, 3]), kwargs = {})
%detach_90 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_74,), kwargs = {})
%detach_91 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_75,), kwargs = {})
%silu_45 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_199,), kwargs = {})
%_param_constant382 : [#users=1] = get_attr[target=_param_constant382]
%_param_constant383 : [#users=1] = get_attr[target=_param_constant383]
%convolution_40 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_45, %_param_constant382, %_param_constant383, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant384 : [#users=1] = get_attr[target=_param_constant384]
%_param_constant385 : [#users=1] = get_attr[target=_param_constant385]
%convolution_41 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_6, %_param_constant384, %_param_constant385, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_193 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_41, %convolution_40), kwargs = {})
%div_15 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_193, 1.0), kwargs = {})
%view_284 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_15, [2, 32, 40, 576]), kwargs = {})
%_to_copy_202 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_284,), kwargs = {dtype: torch.float32})
%var_mean_62 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_202, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_156 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_62, 0), kwargs = {})
%getitem_157 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_62, 1), kwargs = {})
%add_194 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_156, 1e-06), kwargs = {})
%rsqrt_62 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_194,), kwargs = {})
%sub_62 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_284, %getitem_157), kwargs = {})
%mul_135 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_62, %rsqrt_62), kwargs = {})
%view_285 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_135, [2, 1280, 24, 24]), kwargs = {})
%_param_constant386 : [#users=1] = get_attr[target=_param_constant386]
%unsqueeze_260 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant386, 0), kwargs = {})
%unsqueeze_261 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_260, 2), kwargs = {})
%unsqueeze_262 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_261, 3), kwargs = {})
%_param_constant387 : [#users=1] = get_attr[target=_param_constant387]
%unsqueeze_263 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant387, 0), kwargs = {})
%unsqueeze_264 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_263, 2), kwargs = {})
%unsqueeze_265 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_264, 3), kwargs = {})
%mul_136 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_285, %unsqueeze_265), kwargs = {})
%add_195 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_136, %unsqueeze_262), kwargs = {})
%_to_copy_203 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_195,), kwargs = {dtype: torch.float16})
%_to_copy_204 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_157,), kwargs = {dtype: torch.float16})
%_to_copy_205 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_62,), kwargs = {dtype: torch.float16})
%squeeze_76 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_204, [2, 3]), kwargs = {})
%squeeze_77 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_205, [2, 3]), kwargs = {})
%detach_92 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_76,), kwargs = {})
%detach_93 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_77,), kwargs = {})
%permute_16 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_203, [0, 2, 3, 1]), kwargs = {})
%view_286 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_16, [2, 576, 1280]), kwargs = {})
%_param_constant388 : [#users=1] = get_attr[target=_param_constant388]
%t_113 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant388,), kwargs = {})
%clone_16 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_286,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_64 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_16, [1152, 1280]), kwargs = {})
%mm_56 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_64, %t_113), kwargs = {})
%_unsafe_view_65 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_56, [2, 576, 1280]), kwargs = {})
%_param_constant389 : [#users=1] = get_attr[target=_param_constant389]
%add_196 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_65, %_param_constant389), kwargs = {})
%_to_copy_206 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_196,), kwargs = {dtype: torch.float32})
%var_mean_63 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_206, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_158 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_63, 0), kwargs = {})
%getitem_159 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_63, 1), kwargs = {})
%add_197 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_158, 1e-05), kwargs = {})
%rsqrt_63 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_197,), kwargs = {})
%sub_63 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_196, %getitem_159), kwargs = {})
%mul_137 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_63, %rsqrt_63), kwargs = {})
%_param_constant390 : [#users=1] = get_attr[target=_param_constant390]
%mul_138 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_137, %_param_constant390), kwargs = {})
%_param_constant391 : [#users=1] = get_attr[target=_param_constant391]
%add_198 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_138, %_param_constant391), kwargs = {})
%_to_copy_207 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_198,), kwargs = {dtype: torch.float16})
%_param_constant392 : [#users=1] = get_attr[target=_param_constant392]
%t_114 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant392,), kwargs = {})
%view_287 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_207, [1152, 1280]), kwargs = {})
%mm_57 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_287, %t_114), kwargs = {})
%_unsafe_view_66 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_57, [2, 576, 1280]), kwargs = {})
%_param_constant393 : [#users=1] = get_attr[target=_param_constant393]
%t_115 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant393,), kwargs = {})
%view_288 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_207, [1152, 1280]), kwargs = {})
%mm_58 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_288, %t_115), kwargs = {})
%_unsafe_view_67 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_58, [2, 576, 1280]), kwargs = {})
%_param_constant394 : [#users=1] = get_attr[target=_param_constant394]
%t_116 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant394,), kwargs = {})
%view_289 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_207, [1152, 1280]), kwargs = {})
%mm_59 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_289, %t_116), kwargs = {})
%_unsafe_view_68 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_59, [2, 576, 1280]), kwargs = {})
%view_290 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_66, [2, -1, 20, 64]), kwargs = {})
%transpose_64 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_290, 1, 2), kwargs = {})
%view_291 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_67, [2, -1, 20, 64]), kwargs = {})
%transpose_65 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_291, 1, 2), kwargs = {})
%view_292 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_68, [2, -1, 20, 64]), kwargs = {})
%transpose_66 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_292, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_16 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_64, %transpose_65, %transpose_66, True), kwargs = {})
%getitem_160 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_16, 0), kwargs = {})
%getitem_161 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_16, 1), kwargs = {})
%detach_94 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_160,), kwargs = {})
%transpose_67 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_160, 1, 2), kwargs = {})
%view_293 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_67, [2, -1, 1280]), kwargs = {})
%_param_constant395 : [#users=1] = get_attr[target=_param_constant395]
%t_117 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant395,), kwargs = {})
%view_294 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_293, [1152, 1280]), kwargs = {})
%_param_constant396 : [#users=1] = get_attr[target=_param_constant396]
%addmm_57 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant396, %view_294, %t_117), kwargs = {})
%view_295 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_57, [2, 576, 1280]), kwargs = {})
%add_199 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_295, %add_196), kwargs = {})
%_to_copy_208 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_199,), kwargs = {dtype: torch.float32})
%var_mean_64 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_208, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_162 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_64, 0), kwargs = {})
%getitem_163 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_64, 1), kwargs = {})
%add_200 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_162, 1e-05), kwargs = {})
%rsqrt_64 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_200,), kwargs = {})
%sub_64 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_199, %getitem_163), kwargs = {})
%mul_139 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_64, %rsqrt_64), kwargs = {})
%_param_constant397 : [#users=1] = get_attr[target=_param_constant397]
%mul_140 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_139, %_param_constant397), kwargs = {})
%_param_constant398 : [#users=1] = get_attr[target=_param_constant398]
%add_201 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_140, %_param_constant398), kwargs = {})
%_to_copy_209 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_201,), kwargs = {dtype: torch.float16})
%_param_constant399 : [#users=1] = get_attr[target=_param_constant399]
%t_118 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant399,), kwargs = {})
%view_296 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_209, [1152, 1280]), kwargs = {})
%mm_60 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_296, %t_118), kwargs = {})
%_unsafe_view_69 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_60, [2, 576, 1280]), kwargs = {})
%_param_constant400 : [#users=1] = get_attr[target=_param_constant400]
%t_119 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant400,), kwargs = {})
%view_297 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_61 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_297, %t_119), kwargs = {})
%_unsafe_view_70 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_61, [2, 77, 1280]), kwargs = {})
%_param_constant401 : [#users=1] = get_attr[target=_param_constant401]
%t_120 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant401,), kwargs = {})
%view_298 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_62 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_298, %t_120), kwargs = {})
%_unsafe_view_71 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_62, [2, 77, 1280]), kwargs = {})
%view_299 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_69, [2, -1, 20, 64]), kwargs = {})
%transpose_68 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_299, 1, 2), kwargs = {})
%view_300 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_70, [2, -1, 20, 64]), kwargs = {})
%transpose_69 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_300, 1, 2), kwargs = {})
%view_301 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_71, [2, -1, 20, 64]), kwargs = {})
%transpose_70 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_301, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_17 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_68, %transpose_69, %transpose_70, True), kwargs = {})
%getitem_164 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_17, 0), kwargs = {})
%getitem_165 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_17, 1), kwargs = {})
%detach_95 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_164,), kwargs = {})
%transpose_71 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_164, 1, 2), kwargs = {})
%view_302 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_71, [2, -1, 1280]), kwargs = {})
%_param_constant402 : [#users=1] = get_attr[target=_param_constant402]
%t_121 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant402,), kwargs = {})
%view_303 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_302, [1152, 1280]), kwargs = {})
%_param_constant403 : [#users=1] = get_attr[target=_param_constant403]
%addmm_58 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant403, %view_303, %t_121), kwargs = {})
%view_304 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_58, [2, 576, 1280]), kwargs = {})
%add_202 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_304, %add_199), kwargs = {})
%_to_copy_210 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_202,), kwargs = {dtype: torch.float32})
%var_mean_65 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_210, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_166 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_65, 0), kwargs = {})
%getitem_167 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_65, 1), kwargs = {})
%add_203 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_166, 1e-05), kwargs = {})
%rsqrt_65 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_203,), kwargs = {})
%sub_65 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_202, %getitem_167), kwargs = {})
%mul_141 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_65, %rsqrt_65), kwargs = {})
%_param_constant404 : [#users=1] = get_attr[target=_param_constant404]
%mul_142 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_141, %_param_constant404), kwargs = {})
%_param_constant405 : [#users=1] = get_attr[target=_param_constant405]
%add_204 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_142, %_param_constant405), kwargs = {})
%_to_copy_211 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_204,), kwargs = {dtype: torch.float16})
%_param_constant406 : [#users=1] = get_attr[target=_param_constant406]
%t_122 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant406,), kwargs = {})
%view_305 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_211, [1152, 1280]), kwargs = {})
%_param_constant407 : [#users=1] = get_attr[target=_param_constant407]
%addmm_59 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant407, %view_305, %t_122), kwargs = {})
%view_306 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_59, [2, 576, 10240]), kwargs = {})
%slice_53 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_306, -1, 0, 5120), kwargs = {})
%slice_54 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_306, -1, 5120, 10240), kwargs = {})
%gelu_8 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_54,), kwargs = {})
%mul_143 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_53, %gelu_8), kwargs = {})
%_param_constant408 : [#users=1] = get_attr[target=_param_constant408]
%t_123 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant408,), kwargs = {})
%view_307 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_143, [1152, 5120]), kwargs = {})
%_param_constant409 : [#users=1] = get_attr[target=_param_constant409]
%addmm_60 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant409, %view_307, %t_123), kwargs = {})
%view_308 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_60, [2, 576, 1280]), kwargs = {})
%add_205 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_308, %add_202), kwargs = {})
%_param_constant410 : [#users=1] = get_attr[target=_param_constant410]
%t_124 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant410,), kwargs = {})
%view_309 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_205, [1152, 1280]), kwargs = {})
%_param_constant411 : [#users=1] = get_attr[target=_param_constant411]
%addmm_61 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant411, %view_309, %t_124), kwargs = {})
%view_310 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_61, [2, 576, 1280]), kwargs = {})
%view_311 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_310, [2, 24, 24, 1280]), kwargs = {})
%permute_17 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_311, [0, 3, 1, 2]), kwargs = {})
%clone_17 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_17,), kwargs = {memory_format: torch.contiguous_format})
%add_206 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_17, %div_15), kwargs = {})
%cat_7 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_206, %convolution_11], 1), kwargs = {})
%view_312 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_7, [2, 32, 60, 576]), kwargs = {})
%_to_copy_212 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_312,), kwargs = {dtype: torch.float32})
%var_mean_66 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_212, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_168 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_66, 0), kwargs = {})
%getitem_169 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_66, 1), kwargs = {})
%add_207 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_168, 1e-05), kwargs = {})
%rsqrt_66 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_207,), kwargs = {})
%sub_66 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_312, %getitem_169), kwargs = {})
%mul_144 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_66, %rsqrt_66), kwargs = {})
%view_313 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_144, [2, 1920, 24, 24]), kwargs = {})
%_param_constant412 : [#users=1] = get_attr[target=_param_constant412]
%unsqueeze_266 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant412, 0), kwargs = {})
%unsqueeze_267 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_266, 2), kwargs = {})
%unsqueeze_268 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_267, 3), kwargs = {})
%_param_constant413 : [#users=1] = get_attr[target=_param_constant413]
%unsqueeze_269 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant413, 0), kwargs = {})
%unsqueeze_270 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_269, 2), kwargs = {})
%unsqueeze_271 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_270, 3), kwargs = {})
%mul_145 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_313, %unsqueeze_271), kwargs = {})
%add_208 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_145, %unsqueeze_268), kwargs = {})
%_to_copy_213 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_208,), kwargs = {dtype: torch.float16})
%_to_copy_214 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_169,), kwargs = {dtype: torch.float16})
%_to_copy_215 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_66,), kwargs = {dtype: torch.float16})
%squeeze_78 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_214, [2, 3]), kwargs = {})
%squeeze_79 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_215, [2, 3]), kwargs = {})
%detach_96 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_78,), kwargs = {})
%detach_97 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_79,), kwargs = {})
%silu_46 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_213,), kwargs = {})
%_param_constant414 : [#users=1] = get_attr[target=_param_constant414]
%_param_constant415 : [#users=1] = get_attr[target=_param_constant415]
%convolution_42 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_46, %_param_constant414, %_param_constant415, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_47 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant416 : [#users=1] = get_attr[target=_param_constant416]
%t_125 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant416,), kwargs = {})
%_param_constant417 : [#users=1] = get_attr[target=_param_constant417]
%addmm_62 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant417, %silu_47, %t_125), kwargs = {})
%slice_55 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_62, 0, 0, 9223372036854775807), kwargs = {})
%slice_56 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_55, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_272 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_56, 2), kwargs = {})
%unsqueeze_273 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_272, 3), kwargs = {})
%add_209 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_42, %unsqueeze_273), kwargs = {})
%view_314 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_209, [2, 32, 40, 576]), kwargs = {})
%_to_copy_216 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_314,), kwargs = {dtype: torch.float32})
%var_mean_67 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_216, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_170 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_67, 0), kwargs = {})
%getitem_171 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_67, 1), kwargs = {})
%add_210 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_170, 1e-05), kwargs = {})
%rsqrt_67 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_210,), kwargs = {})
%sub_67 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_314, %getitem_171), kwargs = {})
%mul_146 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_67, %rsqrt_67), kwargs = {})
%view_315 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_146, [2, 1280, 24, 24]), kwargs = {})
%_param_constant418 : [#users=1] = get_attr[target=_param_constant418]
%unsqueeze_274 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant418, 0), kwargs = {})
%unsqueeze_275 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_274, 2), kwargs = {})
%unsqueeze_276 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_275, 3), kwargs = {})
%_param_constant419 : [#users=1] = get_attr[target=_param_constant419]
%unsqueeze_277 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant419, 0), kwargs = {})
%unsqueeze_278 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_277, 2), kwargs = {})
%unsqueeze_279 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_278, 3), kwargs = {})
%mul_147 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_315, %unsqueeze_279), kwargs = {})
%add_211 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_147, %unsqueeze_276), kwargs = {})
%_to_copy_217 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_211,), kwargs = {dtype: torch.float16})
%_to_copy_218 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_171,), kwargs = {dtype: torch.float16})
%_to_copy_219 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_67,), kwargs = {dtype: torch.float16})
%squeeze_80 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_218, [2, 3]), kwargs = {})
%squeeze_81 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_219, [2, 3]), kwargs = {})
%detach_98 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_80,), kwargs = {})
%detach_99 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_81,), kwargs = {})
%silu_48 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_217,), kwargs = {})
%_param_constant420 : [#users=1] = get_attr[target=_param_constant420]
%_param_constant421 : [#users=1] = get_attr[target=_param_constant421]
%convolution_43 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_48, %_param_constant420, %_param_constant421, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant422 : [#users=1] = get_attr[target=_param_constant422]
%_param_constant423 : [#users=1] = get_attr[target=_param_constant423]
%convolution_44 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_7, %_param_constant422, %_param_constant423, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_212 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_44, %convolution_43), kwargs = {})
%div_16 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_212, 1.0), kwargs = {})
%view_316 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_16, [2, 32, 40, 576]), kwargs = {})
%_to_copy_220 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_316,), kwargs = {dtype: torch.float32})
%var_mean_68 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_220, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_172 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_68, 0), kwargs = {})
%getitem_173 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_68, 1), kwargs = {})
%add_213 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_172, 1e-06), kwargs = {})
%rsqrt_68 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_213,), kwargs = {})
%sub_68 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_316, %getitem_173), kwargs = {})
%mul_148 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_68, %rsqrt_68), kwargs = {})
%view_317 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_148, [2, 1280, 24, 24]), kwargs = {})
%_param_constant424 : [#users=1] = get_attr[target=_param_constant424]
%unsqueeze_280 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant424, 0), kwargs = {})
%unsqueeze_281 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_280, 2), kwargs = {})
%unsqueeze_282 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_281, 3), kwargs = {})
%_param_constant425 : [#users=1] = get_attr[target=_param_constant425]
%unsqueeze_283 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant425, 0), kwargs = {})
%unsqueeze_284 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_283, 2), kwargs = {})
%unsqueeze_285 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_284, 3), kwargs = {})
%mul_149 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_317, %unsqueeze_285), kwargs = {})
%add_214 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_149, %unsqueeze_282), kwargs = {})
%_to_copy_221 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_214,), kwargs = {dtype: torch.float16})
%_to_copy_222 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_173,), kwargs = {dtype: torch.float16})
%_to_copy_223 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_68,), kwargs = {dtype: torch.float16})
%squeeze_82 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_222, [2, 3]), kwargs = {})
%squeeze_83 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_223, [2, 3]), kwargs = {})
%detach_100 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_82,), kwargs = {})
%detach_101 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_83,), kwargs = {})
%permute_18 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_221, [0, 2, 3, 1]), kwargs = {})
%view_318 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_18, [2, 576, 1280]), kwargs = {})
%_param_constant426 : [#users=1] = get_attr[target=_param_constant426]
%t_126 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant426,), kwargs = {})
%clone_18 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_318,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_72 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_18, [1152, 1280]), kwargs = {})
%mm_63 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_72, %t_126), kwargs = {})
%_unsafe_view_73 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_63, [2, 576, 1280]), kwargs = {})
%_param_constant427 : [#users=1] = get_attr[target=_param_constant427]
%add_215 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_73, %_param_constant427), kwargs = {})
%_to_copy_224 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_215,), kwargs = {dtype: torch.float32})
%var_mean_69 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_224, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_174 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_69, 0), kwargs = {})
%getitem_175 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_69, 1), kwargs = {})
%add_216 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_174, 1e-05), kwargs = {})
%rsqrt_69 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_216,), kwargs = {})
%sub_69 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_215, %getitem_175), kwargs = {})
%mul_150 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_69, %rsqrt_69), kwargs = {})
%_param_constant428 : [#users=1] = get_attr[target=_param_constant428]
%mul_151 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_150, %_param_constant428), kwargs = {})
%_param_constant429 : [#users=1] = get_attr[target=_param_constant429]
%add_217 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_151, %_param_constant429), kwargs = {})
%_to_copy_225 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_217,), kwargs = {dtype: torch.float16})
%_param_constant430 : [#users=1] = get_attr[target=_param_constant430]
%t_127 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant430,), kwargs = {})
%view_319 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_225, [1152, 1280]), kwargs = {})
%mm_64 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_319, %t_127), kwargs = {})
%_unsafe_view_74 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_64, [2, 576, 1280]), kwargs = {})
%_param_constant431 : [#users=1] = get_attr[target=_param_constant431]
%t_128 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant431,), kwargs = {})
%view_320 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_225, [1152, 1280]), kwargs = {})
%mm_65 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_320, %t_128), kwargs = {})
%_unsafe_view_75 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_65, [2, 576, 1280]), kwargs = {})
%_param_constant432 : [#users=1] = get_attr[target=_param_constant432]
%t_129 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant432,), kwargs = {})
%view_321 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_225, [1152, 1280]), kwargs = {})
%mm_66 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_321, %t_129), kwargs = {})
%_unsafe_view_76 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_66, [2, 576, 1280]), kwargs = {})
%view_322 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_74, [2, -1, 20, 64]), kwargs = {})
%transpose_72 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_322, 1, 2), kwargs = {})
%view_323 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_75, [2, -1, 20, 64]), kwargs = {})
%transpose_73 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_323, 1, 2), kwargs = {})
%view_324 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_76, [2, -1, 20, 64]), kwargs = {})
%transpose_74 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_324, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_18 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_72, %transpose_73, %transpose_74, True), kwargs = {})
%getitem_176 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_18, 0), kwargs = {})
%getitem_177 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_18, 1), kwargs = {})
%detach_102 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_176,), kwargs = {})
%transpose_75 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_176, 1, 2), kwargs = {})
%view_325 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_75, [2, -1, 1280]), kwargs = {})
%_param_constant433 : [#users=1] = get_attr[target=_param_constant433]
%t_130 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant433,), kwargs = {})
%view_326 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_325, [1152, 1280]), kwargs = {})
%_param_constant434 : [#users=1] = get_attr[target=_param_constant434]
%addmm_63 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant434, %view_326, %t_130), kwargs = {})
%view_327 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_63, [2, 576, 1280]), kwargs = {})
%add_218 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_327, %add_215), kwargs = {})
%_to_copy_226 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_218,), kwargs = {dtype: torch.float32})
%var_mean_70 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_226, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_178 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_70, 0), kwargs = {})
%getitem_179 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_70, 1), kwargs = {})
%add_219 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_178, 1e-05), kwargs = {})
%rsqrt_70 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_219,), kwargs = {})
%sub_70 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_218, %getitem_179), kwargs = {})
%mul_152 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_70, %rsqrt_70), kwargs = {})
%_param_constant435 : [#users=1] = get_attr[target=_param_constant435]
%mul_153 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_152, %_param_constant435), kwargs = {})
%_param_constant436 : [#users=1] = get_attr[target=_param_constant436]
%add_220 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_153, %_param_constant436), kwargs = {})
%_to_copy_227 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_220,), kwargs = {dtype: torch.float16})
%_param_constant437 : [#users=1] = get_attr[target=_param_constant437]
%t_131 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant437,), kwargs = {})
%view_328 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_227, [1152, 1280]), kwargs = {})
%mm_67 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_328, %t_131), kwargs = {})
%_unsafe_view_77 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_67, [2, 576, 1280]), kwargs = {})
%_param_constant438 : [#users=1] = get_attr[target=_param_constant438]
%t_132 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant438,), kwargs = {})
%view_329 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_68 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_329, %t_132), kwargs = {})
%_unsafe_view_78 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_68, [2, 77, 1280]), kwargs = {})
%_param_constant439 : [#users=1] = get_attr[target=_param_constant439]
%t_133 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant439,), kwargs = {})
%view_330 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_69 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_330, %t_133), kwargs = {})
%_unsafe_view_79 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_69, [2, 77, 1280]), kwargs = {})
%view_331 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_77, [2, -1, 20, 64]), kwargs = {})
%transpose_76 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_331, 1, 2), kwargs = {})
%view_332 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_78, [2, -1, 20, 64]), kwargs = {})
%transpose_77 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_332, 1, 2), kwargs = {})
%view_333 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_79, [2, -1, 20, 64]), kwargs = {})
%transpose_78 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_333, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_19 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_76, %transpose_77, %transpose_78, True), kwargs = {})
%getitem_180 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_19, 0), kwargs = {})
%getitem_181 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_19, 1), kwargs = {})
%detach_103 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_180,), kwargs = {})
%transpose_79 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_180, 1, 2), kwargs = {})
%view_334 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_79, [2, -1, 1280]), kwargs = {})
%_param_constant440 : [#users=1] = get_attr[target=_param_constant440]
%t_134 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant440,), kwargs = {})
%view_335 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_334, [1152, 1280]), kwargs = {})
%_param_constant441 : [#users=1] = get_attr[target=_param_constant441]
%addmm_64 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant441, %view_335, %t_134), kwargs = {})
%view_336 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_64, [2, 576, 1280]), kwargs = {})
%add_221 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_336, %add_218), kwargs = {})
%_to_copy_228 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_221,), kwargs = {dtype: torch.float32})
%var_mean_71 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_228, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_182 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_71, 0), kwargs = {})
%getitem_183 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_71, 1), kwargs = {})
%add_222 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_182, 1e-05), kwargs = {})
%rsqrt_71 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_222,), kwargs = {})
%sub_71 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_221, %getitem_183), kwargs = {})
%mul_154 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_71, %rsqrt_71), kwargs = {})
%_param_constant442 : [#users=1] = get_attr[target=_param_constant442]
%mul_155 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_154, %_param_constant442), kwargs = {})
%_param_constant443 : [#users=1] = get_attr[target=_param_constant443]
%add_223 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_155, %_param_constant443), kwargs = {})
%_to_copy_229 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_223,), kwargs = {dtype: torch.float16})
%_param_constant444 : [#users=1] = get_attr[target=_param_constant444]
%t_135 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant444,), kwargs = {})
%view_337 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_229, [1152, 1280]), kwargs = {})
%_param_constant445 : [#users=1] = get_attr[target=_param_constant445]
%addmm_65 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant445, %view_337, %t_135), kwargs = {})
%view_338 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_65, [2, 576, 10240]), kwargs = {})
%slice_57 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_338, -1, 0, 5120), kwargs = {})
%slice_58 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_338, -1, 5120, 10240), kwargs = {})
%gelu_9 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_58,), kwargs = {})
%mul_156 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_57, %gelu_9), kwargs = {})
%_param_constant446 : [#users=1] = get_attr[target=_param_constant446]
%t_136 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant446,), kwargs = {})
%view_339 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_156, [1152, 5120]), kwargs = {})
%_param_constant447 : [#users=1] = get_attr[target=_param_constant447]
%addmm_66 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant447, %view_339, %t_136), kwargs = {})
%view_340 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_66, [2, 576, 1280]), kwargs = {})
%add_224 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_340, %add_221), kwargs = {})
%_param_constant448 : [#users=1] = get_attr[target=_param_constant448]
%t_137 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant448,), kwargs = {})
%view_341 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_224, [1152, 1280]), kwargs = {})
%_param_constant449 : [#users=1] = get_attr[target=_param_constant449]
%addmm_67 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant449, %view_341, %t_137), kwargs = {})
%view_342 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_67, [2, 576, 1280]), kwargs = {})
%view_343 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_342, [2, 24, 24, 1280]), kwargs = {})
%permute_19 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_343, [0, 3, 1, 2]), kwargs = {})
%clone_19 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_19,), kwargs = {memory_format: torch.contiguous_format})
%add_225 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_19, %div_16), kwargs = {})
%upsample_nearest2d_1 : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%add_225, [48, 48], 2.0, 2.0), kwargs = {})
%_param_constant450 : [#users=1] = get_attr[target=_param_constant450]
%_param_constant451 : [#users=1] = get_attr[target=_param_constant451]
%convolution_45 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d_1, %_param_constant450, %_param_constant451, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%cat_8 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_45, %add_75], 1), kwargs = {})
%view_344 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_8, [2, 32, 60, 2304]), kwargs = {})
%_to_copy_230 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_344,), kwargs = {dtype: torch.float32})
%var_mean_72 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_230, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_184 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_72, 0), kwargs = {})
%getitem_185 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_72, 1), kwargs = {})
%add_226 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_184, 1e-05), kwargs = {})
%rsqrt_72 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_226,), kwargs = {})
%sub_72 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_344, %getitem_185), kwargs = {})
%mul_157 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_72, %rsqrt_72), kwargs = {})
%view_345 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_157, [2, 1920, 48, 48]), kwargs = {})
%_param_constant452 : [#users=1] = get_attr[target=_param_constant452]
%unsqueeze_286 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant452, 0), kwargs = {})
%unsqueeze_287 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_286, 2), kwargs = {})
%unsqueeze_288 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_287, 3), kwargs = {})
%_param_constant453 : [#users=1] = get_attr[target=_param_constant453]
%unsqueeze_289 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant453, 0), kwargs = {})
%unsqueeze_290 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_289, 2), kwargs = {})
%unsqueeze_291 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_290, 3), kwargs = {})
%mul_158 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_345, %unsqueeze_291), kwargs = {})
%add_227 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_158, %unsqueeze_288), kwargs = {})
%_to_copy_231 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_227,), kwargs = {dtype: torch.float16})
%_to_copy_232 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_185,), kwargs = {dtype: torch.float16})
%_to_copy_233 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_72,), kwargs = {dtype: torch.float16})
%squeeze_84 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_232, [2, 3]), kwargs = {})
%squeeze_85 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_233, [2, 3]), kwargs = {})
%detach_104 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_84,), kwargs = {})
%detach_105 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_85,), kwargs = {})
%silu_49 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_231,), kwargs = {})
%_param_constant454 : [#users=1] = get_attr[target=_param_constant454]
%_param_constant455 : [#users=1] = get_attr[target=_param_constant455]
%convolution_46 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_49, %_param_constant454, %_param_constant455, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_50 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant456 : [#users=1] = get_attr[target=_param_constant456]
%t_138 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant456,), kwargs = {})
%_param_constant457 : [#users=1] = get_attr[target=_param_constant457]
%addmm_68 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant457, %silu_50, %t_138), kwargs = {})
%slice_59 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_68, 0, 0, 9223372036854775807), kwargs = {})
%slice_60 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_59, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_292 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_60, 2), kwargs = {})
%unsqueeze_293 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_292, 3), kwargs = {})
%add_228 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_46, %unsqueeze_293), kwargs = {})
%view_346 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_228, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_234 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_346,), kwargs = {dtype: torch.float32})
%var_mean_73 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_234, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_186 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_73, 0), kwargs = {})
%getitem_187 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_73, 1), kwargs = {})
%add_229 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_186, 1e-05), kwargs = {})
%rsqrt_73 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_229,), kwargs = {})
%sub_73 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_346, %getitem_187), kwargs = {})
%mul_159 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_73, %rsqrt_73), kwargs = {})
%view_347 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_159, [2, 640, 48, 48]), kwargs = {})
%_param_constant458 : [#users=1] = get_attr[target=_param_constant458]
%unsqueeze_294 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant458, 0), kwargs = {})
%unsqueeze_295 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_294, 2), kwargs = {})
%unsqueeze_296 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_295, 3), kwargs = {})
%_param_constant459 : [#users=1] = get_attr[target=_param_constant459]
%unsqueeze_297 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant459, 0), kwargs = {})
%unsqueeze_298 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_297, 2), kwargs = {})
%unsqueeze_299 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_298, 3), kwargs = {})
%mul_160 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_347, %unsqueeze_299), kwargs = {})
%add_230 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_160, %unsqueeze_296), kwargs = {})
%_to_copy_235 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_230,), kwargs = {dtype: torch.float16})
%_to_copy_236 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_187,), kwargs = {dtype: torch.float16})
%_to_copy_237 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_73,), kwargs = {dtype: torch.float16})
%squeeze_86 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_236, [2, 3]), kwargs = {})
%squeeze_87 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_237, [2, 3]), kwargs = {})
%detach_106 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_86,), kwargs = {})
%detach_107 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_87,), kwargs = {})
%silu_51 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_235,), kwargs = {})
%_param_constant460 : [#users=1] = get_attr[target=_param_constant460]
%_param_constant461 : [#users=1] = get_attr[target=_param_constant461]
%convolution_47 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_51, %_param_constant460, %_param_constant461, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant462 : [#users=1] = get_attr[target=_param_constant462]
%_param_constant463 : [#users=1] = get_attr[target=_param_constant463]
%convolution_48 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_8, %_param_constant462, %_param_constant463, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_231 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_48, %convolution_47), kwargs = {})
%div_17 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_231, 1.0), kwargs = {})
%view_348 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_17, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_238 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_348,), kwargs = {dtype: torch.float32})
%var_mean_74 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_238, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_188 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_74, 0), kwargs = {})
%getitem_189 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_74, 1), kwargs = {})
%add_232 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_188, 1e-06), kwargs = {})
%rsqrt_74 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_232,), kwargs = {})
%sub_74 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_348, %getitem_189), kwargs = {})
%mul_161 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_74, %rsqrt_74), kwargs = {})
%view_349 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_161, [2, 640, 48, 48]), kwargs = {})
%_param_constant464 : [#users=1] = get_attr[target=_param_constant464]
%unsqueeze_300 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant464, 0), kwargs = {})
%unsqueeze_301 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_300, 2), kwargs = {})
%unsqueeze_302 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_301, 3), kwargs = {})
%_param_constant465 : [#users=1] = get_attr[target=_param_constant465]
%unsqueeze_303 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant465, 0), kwargs = {})
%unsqueeze_304 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_303, 2), kwargs = {})
%unsqueeze_305 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_304, 3), kwargs = {})
%mul_162 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_349, %unsqueeze_305), kwargs = {})
%add_233 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_162, %unsqueeze_302), kwargs = {})
%_to_copy_239 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_233,), kwargs = {dtype: torch.float16})
%_to_copy_240 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_189,), kwargs = {dtype: torch.float16})
%_to_copy_241 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_74,), kwargs = {dtype: torch.float16})
%squeeze_88 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_240, [2, 3]), kwargs = {})
%squeeze_89 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_241, [2, 3]), kwargs = {})
%detach_108 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_88,), kwargs = {})
%detach_109 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_89,), kwargs = {})
%permute_20 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_239, [0, 2, 3, 1]), kwargs = {})
%view_350 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_20, [2, 2304, 640]), kwargs = {})
%_param_constant466 : [#users=1] = get_attr[target=_param_constant466]
%t_139 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant466,), kwargs = {})
%clone_20 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_350,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_80 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_20, [4608, 640]), kwargs = {})
%mm_70 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_80, %t_139), kwargs = {})
%_unsafe_view_81 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_70, [2, 2304, 640]), kwargs = {})
%_param_constant467 : [#users=1] = get_attr[target=_param_constant467]
%add_234 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_81, %_param_constant467), kwargs = {})
%_to_copy_242 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_234,), kwargs = {dtype: torch.float32})
%var_mean_75 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_242, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_190 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_75, 0), kwargs = {})
%getitem_191 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_75, 1), kwargs = {})
%add_235 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_190, 1e-05), kwargs = {})
%rsqrt_75 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_235,), kwargs = {})
%sub_75 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_234, %getitem_191), kwargs = {})
%mul_163 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_75, %rsqrt_75), kwargs = {})
%_param_constant468 : [#users=1] = get_attr[target=_param_constant468]
%mul_164 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_163, %_param_constant468), kwargs = {})
%_param_constant469 : [#users=1] = get_attr[target=_param_constant469]
%add_236 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_164, %_param_constant469), kwargs = {})
%_to_copy_243 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_236,), kwargs = {dtype: torch.float16})
%_param_constant470 : [#users=1] = get_attr[target=_param_constant470]
%t_140 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant470,), kwargs = {})
%view_351 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_243, [4608, 640]), kwargs = {})
%mm_71 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_351, %t_140), kwargs = {})
%_unsafe_view_82 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_71, [2, 2304, 640]), kwargs = {})
%_param_constant471 : [#users=1] = get_attr[target=_param_constant471]
%t_141 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant471,), kwargs = {})
%view_352 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_243, [4608, 640]), kwargs = {})
%mm_72 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_352, %t_141), kwargs = {})
%_unsafe_view_83 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_72, [2, 2304, 640]), kwargs = {})
%_param_constant472 : [#users=1] = get_attr[target=_param_constant472]
%t_142 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant472,), kwargs = {})
%view_353 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_243, [4608, 640]), kwargs = {})
%mm_73 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_353, %t_142), kwargs = {})
%_unsafe_view_84 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_73, [2, 2304, 640]), kwargs = {})
%view_354 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_82, [2, -1, 10, 64]), kwargs = {})
%transpose_80 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_354, 1, 2), kwargs = {})
%view_355 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_83, [2, -1, 10, 64]), kwargs = {})
%transpose_81 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_355, 1, 2), kwargs = {})
%view_356 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_84, [2, -1, 10, 64]), kwargs = {})
%transpose_82 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_356, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_20 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_80, %transpose_81, %transpose_82, True), kwargs = {})
%getitem_192 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_20, 0), kwargs = {})
%getitem_193 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_20, 1), kwargs = {})
%detach_110 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_192,), kwargs = {})
%transpose_83 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_192, 1, 2), kwargs = {})
%view_357 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_83, [2, -1, 640]), kwargs = {})
%_param_constant473 : [#users=1] = get_attr[target=_param_constant473]
%t_143 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant473,), kwargs = {})
%view_358 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_357, [4608, 640]), kwargs = {})
%_param_constant474 : [#users=1] = get_attr[target=_param_constant474]
%addmm_69 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant474, %view_358, %t_143), kwargs = {})
%view_359 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_69, [2, 2304, 640]), kwargs = {})
%add_237 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_359, %add_234), kwargs = {})
%_to_copy_244 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_237,), kwargs = {dtype: torch.float32})
%var_mean_76 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_244, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_194 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_76, 0), kwargs = {})
%getitem_195 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_76, 1), kwargs = {})
%add_238 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_194, 1e-05), kwargs = {})
%rsqrt_76 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_238,), kwargs = {})
%sub_76 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_237, %getitem_195), kwargs = {})
%mul_165 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_76, %rsqrt_76), kwargs = {})
%_param_constant475 : [#users=1] = get_attr[target=_param_constant475]
%mul_166 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_165, %_param_constant475), kwargs = {})
%_param_constant476 : [#users=1] = get_attr[target=_param_constant476]
%add_239 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_166, %_param_constant476), kwargs = {})
%_to_copy_245 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_239,), kwargs = {dtype: torch.float16})
%_param_constant477 : [#users=1] = get_attr[target=_param_constant477]
%t_144 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant477,), kwargs = {})
%view_360 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_245, [4608, 640]), kwargs = {})
%mm_74 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_360, %t_144), kwargs = {})
%_unsafe_view_85 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_74, [2, 2304, 640]), kwargs = {})
%_param_constant478 : [#users=1] = get_attr[target=_param_constant478]
%t_145 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant478,), kwargs = {})
%view_361 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_75 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_361, %t_145), kwargs = {})
%_unsafe_view_86 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_75, [2, 77, 640]), kwargs = {})
%_param_constant479 : [#users=1] = get_attr[target=_param_constant479]
%t_146 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant479,), kwargs = {})
%view_362 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_76 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_362, %t_146), kwargs = {})
%_unsafe_view_87 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_76, [2, 77, 640]), kwargs = {})
%view_363 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_85, [2, -1, 10, 64]), kwargs = {})
%transpose_84 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_363, 1, 2), kwargs = {})
%view_364 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_86, [2, -1, 10, 64]), kwargs = {})
%transpose_85 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_364, 1, 2), kwargs = {})
%view_365 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_87, [2, -1, 10, 64]), kwargs = {})
%transpose_86 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_365, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_21 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_84, %transpose_85, %transpose_86, True), kwargs = {})
%getitem_196 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_21, 0), kwargs = {})
%getitem_197 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_21, 1), kwargs = {})
%detach_111 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_196,), kwargs = {})
%transpose_87 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_196, 1, 2), kwargs = {})
%view_366 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_87, [2, -1, 640]), kwargs = {})
%_param_constant480 : [#users=1] = get_attr[target=_param_constant480]
%t_147 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant480,), kwargs = {})
%view_367 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_366, [4608, 640]), kwargs = {})
%_param_constant481 : [#users=1] = get_attr[target=_param_constant481]
%addmm_70 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant481, %view_367, %t_147), kwargs = {})
%view_368 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_70, [2, 2304, 640]), kwargs = {})
%add_240 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_368, %add_237), kwargs = {})
%_to_copy_246 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_240,), kwargs = {dtype: torch.float32})
%var_mean_77 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_246, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_198 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_77, 0), kwargs = {})
%getitem_199 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_77, 1), kwargs = {})
%add_241 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_198, 1e-05), kwargs = {})
%rsqrt_77 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_241,), kwargs = {})
%sub_77 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_240, %getitem_199), kwargs = {})
%mul_167 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_77, %rsqrt_77), kwargs = {})
%_param_constant482 : [#users=1] = get_attr[target=_param_constant482]
%mul_168 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_167, %_param_constant482), kwargs = {})
%_param_constant483 : [#users=1] = get_attr[target=_param_constant483]
%add_242 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_168, %_param_constant483), kwargs = {})
%_to_copy_247 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_242,), kwargs = {dtype: torch.float16})
%_param_constant484 : [#users=1] = get_attr[target=_param_constant484]
%t_148 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant484,), kwargs = {})
%view_369 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_247, [4608, 640]), kwargs = {})
%_param_constant485 : [#users=1] = get_attr[target=_param_constant485]
%addmm_71 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant485, %view_369, %t_148), kwargs = {})
%view_370 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_71, [2, 2304, 5120]), kwargs = {})
%slice_61 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_370, -1, 0, 2560), kwargs = {})
%slice_62 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_370, -1, 2560, 5120), kwargs = {})
%gelu_10 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_62,), kwargs = {})
%mul_169 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_61, %gelu_10), kwargs = {})
%_param_constant486 : [#users=1] = get_attr[target=_param_constant486]
%t_149 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant486,), kwargs = {})
%view_371 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_169, [4608, 2560]), kwargs = {})
%_param_constant487 : [#users=1] = get_attr[target=_param_constant487]
%addmm_72 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant487, %view_371, %t_149), kwargs = {})
%view_372 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_72, [2, 2304, 640]), kwargs = {})
%add_243 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_372, %add_240), kwargs = {})
%_param_constant488 : [#users=1] = get_attr[target=_param_constant488]
%t_150 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant488,), kwargs = {})
%view_373 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_243, [4608, 640]), kwargs = {})
%_param_constant489 : [#users=1] = get_attr[target=_param_constant489]
%addmm_73 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant489, %view_373, %t_150), kwargs = {})
%view_374 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_73, [2, 2304, 640]), kwargs = {})
%view_375 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_374, [2, 48, 48, 640]), kwargs = {})
%permute_21 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_375, [0, 3, 1, 2]), kwargs = {})
%clone_21 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_21,), kwargs = {memory_format: torch.contiguous_format})
%add_244 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_21, %div_17), kwargs = {})
%cat_9 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_244, %add_56], 1), kwargs = {})
%view_376 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_9, [2, 32, 40, 2304]), kwargs = {})
%_to_copy_248 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_376,), kwargs = {dtype: torch.float32})
%var_mean_78 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_248, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_200 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_78, 0), kwargs = {})
%getitem_201 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_78, 1), kwargs = {})
%add_245 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_200, 1e-05), kwargs = {})
%rsqrt_78 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_245,), kwargs = {})
%sub_78 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_376, %getitem_201), kwargs = {})
%mul_170 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_78, %rsqrt_78), kwargs = {})
%view_377 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_170, [2, 1280, 48, 48]), kwargs = {})
%_param_constant490 : [#users=1] = get_attr[target=_param_constant490]
%unsqueeze_306 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant490, 0), kwargs = {})
%unsqueeze_307 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_306, 2), kwargs = {})
%unsqueeze_308 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_307, 3), kwargs = {})
%_param_constant491 : [#users=1] = get_attr[target=_param_constant491]
%unsqueeze_309 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant491, 0), kwargs = {})
%unsqueeze_310 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_309, 2), kwargs = {})
%unsqueeze_311 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_310, 3), kwargs = {})
%mul_171 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_377, %unsqueeze_311), kwargs = {})
%add_246 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_171, %unsqueeze_308), kwargs = {})
%_to_copy_249 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_246,), kwargs = {dtype: torch.float16})
%_to_copy_250 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_201,), kwargs = {dtype: torch.float16})
%_to_copy_251 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_78,), kwargs = {dtype: torch.float16})
%squeeze_90 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_250, [2, 3]), kwargs = {})
%squeeze_91 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_251, [2, 3]), kwargs = {})
%detach_112 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_90,), kwargs = {})
%detach_113 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_91,), kwargs = {})
%silu_52 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_249,), kwargs = {})
%_param_constant492 : [#users=1] = get_attr[target=_param_constant492]
%_param_constant493 : [#users=1] = get_attr[target=_param_constant493]
%convolution_49 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_52, %_param_constant492, %_param_constant493, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_53 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant494 : [#users=1] = get_attr[target=_param_constant494]
%t_151 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant494,), kwargs = {})
%_param_constant495 : [#users=1] = get_attr[target=_param_constant495]
%addmm_74 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant495, %silu_53, %t_151), kwargs = {})
%slice_63 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_74, 0, 0, 9223372036854775807), kwargs = {})
%slice_64 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_63, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_312 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_64, 2), kwargs = {})
%unsqueeze_313 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_312, 3), kwargs = {})
%add_247 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_49, %unsqueeze_313), kwargs = {})
%view_378 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_247, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_252 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_378,), kwargs = {dtype: torch.float32})
%var_mean_79 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_252, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_202 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_79, 0), kwargs = {})
%getitem_203 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_79, 1), kwargs = {})
%add_248 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_202, 1e-05), kwargs = {})
%rsqrt_79 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_248,), kwargs = {})
%sub_79 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_378, %getitem_203), kwargs = {})
%mul_172 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_79, %rsqrt_79), kwargs = {})
%view_379 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_172, [2, 640, 48, 48]), kwargs = {})
%_param_constant496 : [#users=1] = get_attr[target=_param_constant496]
%unsqueeze_314 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant496, 0), kwargs = {})
%unsqueeze_315 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_314, 2), kwargs = {})
%unsqueeze_316 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_315, 3), kwargs = {})
%_param_constant497 : [#users=1] = get_attr[target=_param_constant497]
%unsqueeze_317 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant497, 0), kwargs = {})
%unsqueeze_318 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_317, 2), kwargs = {})
%unsqueeze_319 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_318, 3), kwargs = {})
%mul_173 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_379, %unsqueeze_319), kwargs = {})
%add_249 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_173, %unsqueeze_316), kwargs = {})
%_to_copy_253 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_249,), kwargs = {dtype: torch.float16})
%_to_copy_254 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_203,), kwargs = {dtype: torch.float16})
%_to_copy_255 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_79,), kwargs = {dtype: torch.float16})
%squeeze_92 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_254, [2, 3]), kwargs = {})
%squeeze_93 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_255, [2, 3]), kwargs = {})
%detach_114 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_92,), kwargs = {})
%detach_115 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_93,), kwargs = {})
%silu_54 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_253,), kwargs = {})
%_param_constant498 : [#users=1] = get_attr[target=_param_constant498]
%_param_constant499 : [#users=1] = get_attr[target=_param_constant499]
%convolution_50 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_54, %_param_constant498, %_param_constant499, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant500 : [#users=1] = get_attr[target=_param_constant500]
%_param_constant501 : [#users=1] = get_attr[target=_param_constant501]
%convolution_51 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_9, %_param_constant500, %_param_constant501, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_250 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_51, %convolution_50), kwargs = {})
%div_18 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_250, 1.0), kwargs = {})
%view_380 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_18, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_256 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_380,), kwargs = {dtype: torch.float32})
%var_mean_80 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_256, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_204 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_80, 0), kwargs = {})
%getitem_205 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_80, 1), kwargs = {})
%add_251 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_204, 1e-06), kwargs = {})
%rsqrt_80 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_251,), kwargs = {})
%sub_80 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_380, %getitem_205), kwargs = {})
%mul_174 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_80, %rsqrt_80), kwargs = {})
%view_381 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_174, [2, 640, 48, 48]), kwargs = {})
%_param_constant502 : [#users=1] = get_attr[target=_param_constant502]
%unsqueeze_320 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant502, 0), kwargs = {})
%unsqueeze_321 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_320, 2), kwargs = {})
%unsqueeze_322 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_321, 3), kwargs = {})
%_param_constant503 : [#users=1] = get_attr[target=_param_constant503]
%unsqueeze_323 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant503, 0), kwargs = {})
%unsqueeze_324 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_323, 2), kwargs = {})
%unsqueeze_325 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_324, 3), kwargs = {})
%mul_175 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_381, %unsqueeze_325), kwargs = {})
%add_252 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_175, %unsqueeze_322), kwargs = {})
%_to_copy_257 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_252,), kwargs = {dtype: torch.float16})
%_to_copy_258 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_205,), kwargs = {dtype: torch.float16})
%_to_copy_259 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_80,), kwargs = {dtype: torch.float16})
%squeeze_94 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_258, [2, 3]), kwargs = {})
%squeeze_95 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_259, [2, 3]), kwargs = {})
%detach_116 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_94,), kwargs = {})
%detach_117 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_95,), kwargs = {})
%permute_22 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_257, [0, 2, 3, 1]), kwargs = {})
%view_382 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_22, [2, 2304, 640]), kwargs = {})
%_param_constant504 : [#users=1] = get_attr[target=_param_constant504]
%t_152 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant504,), kwargs = {})
%clone_22 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_382,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_88 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_22, [4608, 640]), kwargs = {})
%mm_77 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_88, %t_152), kwargs = {})
%_unsafe_view_89 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_77, [2, 2304, 640]), kwargs = {})
%_param_constant505 : [#users=1] = get_attr[target=_param_constant505]
%add_253 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_89, %_param_constant505), kwargs = {})
%_to_copy_260 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_253,), kwargs = {dtype: torch.float32})
%var_mean_81 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_260, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_206 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_81, 0), kwargs = {})
%getitem_207 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_81, 1), kwargs = {})
%add_254 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_206, 1e-05), kwargs = {})
%rsqrt_81 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_254,), kwargs = {})
%sub_81 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_253, %getitem_207), kwargs = {})
%mul_176 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_81, %rsqrt_81), kwargs = {})
%_param_constant506 : [#users=1] = get_attr[target=_param_constant506]
%mul_177 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_176, %_param_constant506), kwargs = {})
%_param_constant507 : [#users=1] = get_attr[target=_param_constant507]
%add_255 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_177, %_param_constant507), kwargs = {})
%_to_copy_261 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_255,), kwargs = {dtype: torch.float16})
%_param_constant508 : [#users=1] = get_attr[target=_param_constant508]
%t_153 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant508,), kwargs = {})
%view_383 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_261, [4608, 640]), kwargs = {})
%mm_78 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_383, %t_153), kwargs = {})
%_unsafe_view_90 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_78, [2, 2304, 640]), kwargs = {})
%_param_constant509 : [#users=1] = get_attr[target=_param_constant509]
%t_154 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant509,), kwargs = {})
%view_384 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_261, [4608, 640]), kwargs = {})
%mm_79 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_384, %t_154), kwargs = {})
%_unsafe_view_91 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_79, [2, 2304, 640]), kwargs = {})
%_param_constant510 : [#users=1] = get_attr[target=_param_constant510]
%t_155 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant510,), kwargs = {})
%view_385 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_261, [4608, 640]), kwargs = {})
%mm_80 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_385, %t_155), kwargs = {})
%_unsafe_view_92 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_80, [2, 2304, 640]), kwargs = {})
%view_386 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_90, [2, -1, 10, 64]), kwargs = {})
%transpose_88 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_386, 1, 2), kwargs = {})
%view_387 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_91, [2, -1, 10, 64]), kwargs = {})
%transpose_89 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_387, 1, 2), kwargs = {})
%view_388 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_92, [2, -1, 10, 64]), kwargs = {})
%transpose_90 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_388, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_22 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_88, %transpose_89, %transpose_90, True), kwargs = {})
%getitem_208 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_22, 0), kwargs = {})
%getitem_209 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_22, 1), kwargs = {})
%detach_118 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_208,), kwargs = {})
%transpose_91 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_208, 1, 2), kwargs = {})
%view_389 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_91, [2, -1, 640]), kwargs = {})
%_param_constant511 : [#users=1] = get_attr[target=_param_constant511]
%t_156 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant511,), kwargs = {})
%view_390 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_389, [4608, 640]), kwargs = {})
%_param_constant512 : [#users=1] = get_attr[target=_param_constant512]
%addmm_75 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant512, %view_390, %t_156), kwargs = {})
%view_391 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_75, [2, 2304, 640]), kwargs = {})
%add_256 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_391, %add_253), kwargs = {})
%_to_copy_262 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_256,), kwargs = {dtype: torch.float32})
%var_mean_82 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_262, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_210 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_82, 0), kwargs = {})
%getitem_211 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_82, 1), kwargs = {})
%add_257 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_210, 1e-05), kwargs = {})
%rsqrt_82 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_257,), kwargs = {})
%sub_82 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_256, %getitem_211), kwargs = {})
%mul_178 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_82, %rsqrt_82), kwargs = {})
%_param_constant513 : [#users=1] = get_attr[target=_param_constant513]
%mul_179 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_178, %_param_constant513), kwargs = {})
%_param_constant514 : [#users=1] = get_attr[target=_param_constant514]
%add_258 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_179, %_param_constant514), kwargs = {})
%_to_copy_263 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_258,), kwargs = {dtype: torch.float16})
%_param_constant515 : [#users=1] = get_attr[target=_param_constant515]
%t_157 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant515,), kwargs = {})
%view_392 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_263, [4608, 640]), kwargs = {})
%mm_81 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_392, %t_157), kwargs = {})
%_unsafe_view_93 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_81, [2, 2304, 640]), kwargs = {})
%_param_constant516 : [#users=1] = get_attr[target=_param_constant516]
%t_158 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant516,), kwargs = {})
%view_393 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_82 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_393, %t_158), kwargs = {})
%_unsafe_view_94 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_82, [2, 77, 640]), kwargs = {})
%_param_constant517 : [#users=1] = get_attr[target=_param_constant517]
%t_159 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant517,), kwargs = {})
%view_394 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_83 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_394, %t_159), kwargs = {})
%_unsafe_view_95 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_83, [2, 77, 640]), kwargs = {})
%view_395 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_93, [2, -1, 10, 64]), kwargs = {})
%transpose_92 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_395, 1, 2), kwargs = {})
%view_396 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_94, [2, -1, 10, 64]), kwargs = {})
%transpose_93 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_396, 1, 2), kwargs = {})
%view_397 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_95, [2, -1, 10, 64]), kwargs = {})
%transpose_94 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_397, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_23 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_92, %transpose_93, %transpose_94, True), kwargs = {})
%getitem_212 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_23, 0), kwargs = {})
%getitem_213 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_23, 1), kwargs = {})
%detach_119 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_212,), kwargs = {})
%transpose_95 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_212, 1, 2), kwargs = {})
%view_398 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_95, [2, -1, 640]), kwargs = {})
%_param_constant518 : [#users=1] = get_attr[target=_param_constant518]
%t_160 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant518,), kwargs = {})
%view_399 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_398, [4608, 640]), kwargs = {})
%_param_constant519 : [#users=1] = get_attr[target=_param_constant519]
%addmm_76 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant519, %view_399, %t_160), kwargs = {})
%view_400 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_76, [2, 2304, 640]), kwargs = {})
%add_259 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_400, %add_256), kwargs = {})
%_to_copy_264 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_259,), kwargs = {dtype: torch.float32})
%var_mean_83 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_264, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_214 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_83, 0), kwargs = {})
%getitem_215 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_83, 1), kwargs = {})
%add_260 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_214, 1e-05), kwargs = {})
%rsqrt_83 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_260,), kwargs = {})
%sub_83 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_259, %getitem_215), kwargs = {})
%mul_180 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_83, %rsqrt_83), kwargs = {})
%_param_constant520 : [#users=1] = get_attr[target=_param_constant520]
%mul_181 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_180, %_param_constant520), kwargs = {})
%_param_constant521 : [#users=1] = get_attr[target=_param_constant521]
%add_261 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_181, %_param_constant521), kwargs = {})
%_to_copy_265 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_261,), kwargs = {dtype: torch.float16})
%_param_constant522 : [#users=1] = get_attr[target=_param_constant522]
%t_161 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant522,), kwargs = {})
%view_401 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_265, [4608, 640]), kwargs = {})
%_param_constant523 : [#users=1] = get_attr[target=_param_constant523]
%addmm_77 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant523, %view_401, %t_161), kwargs = {})
%view_402 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_77, [2, 2304, 5120]), kwargs = {})
%slice_65 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_402, -1, 0, 2560), kwargs = {})
%slice_66 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_402, -1, 2560, 5120), kwargs = {})
%gelu_11 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_66,), kwargs = {})
%mul_182 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_65, %gelu_11), kwargs = {})
%_param_constant524 : [#users=1] = get_attr[target=_param_constant524]
%t_162 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant524,), kwargs = {})
%view_403 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_182, [4608, 2560]), kwargs = {})
%_param_constant525 : [#users=1] = get_attr[target=_param_constant525]
%addmm_78 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant525, %view_403, %t_162), kwargs = {})
%view_404 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_78, [2, 2304, 640]), kwargs = {})
%add_262 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_404, %add_259), kwargs = {})
%_param_constant526 : [#users=1] = get_attr[target=_param_constant526]
%t_163 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant526,), kwargs = {})
%view_405 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_262, [4608, 640]), kwargs = {})
%_param_constant527 : [#users=1] = get_attr[target=_param_constant527]
%addmm_79 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant527, %view_405, %t_163), kwargs = {})
%view_406 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_79, [2, 2304, 640]), kwargs = {})
%view_407 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_406, [2, 48, 48, 640]), kwargs = {})
%permute_23 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_407, [0, 3, 1, 2]), kwargs = {})
%clone_23 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_23,), kwargs = {memory_format: torch.contiguous_format})
%add_263 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_23, %div_18), kwargs = {})
%cat_10 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_263, %convolution_5], 1), kwargs = {})
%view_408 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_10, [2, 32, 30, 2304]), kwargs = {})
%_to_copy_266 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_408,), kwargs = {dtype: torch.float32})
%var_mean_84 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_266, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_216 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_84, 0), kwargs = {})
%getitem_217 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_84, 1), kwargs = {})
%add_264 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_216, 1e-05), kwargs = {})
%rsqrt_84 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_264,), kwargs = {})
%sub_84 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_408, %getitem_217), kwargs = {})
%mul_183 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_84, %rsqrt_84), kwargs = {})
%view_409 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_183, [2, 960, 48, 48]), kwargs = {})
%_param_constant528 : [#users=1] = get_attr[target=_param_constant528]
%unsqueeze_326 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant528, 0), kwargs = {})
%unsqueeze_327 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_326, 2), kwargs = {})
%unsqueeze_328 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_327, 3), kwargs = {})
%_param_constant529 : [#users=1] = get_attr[target=_param_constant529]
%unsqueeze_329 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant529, 0), kwargs = {})
%unsqueeze_330 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_329, 2), kwargs = {})
%unsqueeze_331 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_330, 3), kwargs = {})
%mul_184 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_409, %unsqueeze_331), kwargs = {})
%add_265 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_184, %unsqueeze_328), kwargs = {})
%_to_copy_267 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_265,), kwargs = {dtype: torch.float16})
%_to_copy_268 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_217,), kwargs = {dtype: torch.float16})
%_to_copy_269 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_84,), kwargs = {dtype: torch.float16})
%squeeze_96 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_268, [2, 3]), kwargs = {})
%squeeze_97 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_269, [2, 3]), kwargs = {})
%detach_120 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_96,), kwargs = {})
%detach_121 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_97,), kwargs = {})
%silu_55 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_267,), kwargs = {})
%_param_constant530 : [#users=1] = get_attr[target=_param_constant530]
%_param_constant531 : [#users=1] = get_attr[target=_param_constant531]
%convolution_52 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_55, %_param_constant530, %_param_constant531, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_56 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant532 : [#users=1] = get_attr[target=_param_constant532]
%t_164 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant532,), kwargs = {})
%_param_constant533 : [#users=1] = get_attr[target=_param_constant533]
%addmm_80 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant533, %silu_56, %t_164), kwargs = {})
%slice_67 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_80, 0, 0, 9223372036854775807), kwargs = {})
%slice_68 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_67, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_332 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_68, 2), kwargs = {})
%unsqueeze_333 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_332, 3), kwargs = {})
%add_266 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_52, %unsqueeze_333), kwargs = {})
%view_410 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_266, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_270 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_410,), kwargs = {dtype: torch.float32})
%var_mean_85 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_270, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_218 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_85, 0), kwargs = {})
%getitem_219 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_85, 1), kwargs = {})
%add_267 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_218, 1e-05), kwargs = {})
%rsqrt_85 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_267,), kwargs = {})
%sub_85 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_410, %getitem_219), kwargs = {})
%mul_185 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_85, %rsqrt_85), kwargs = {})
%view_411 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_185, [2, 640, 48, 48]), kwargs = {})
%_param_constant534 : [#users=1] = get_attr[target=_param_constant534]
%unsqueeze_334 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant534, 0), kwargs = {})
%unsqueeze_335 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_334, 2), kwargs = {})
%unsqueeze_336 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_335, 3), kwargs = {})
%_param_constant535 : [#users=1] = get_attr[target=_param_constant535]
%unsqueeze_337 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant535, 0), kwargs = {})
%unsqueeze_338 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_337, 2), kwargs = {})
%unsqueeze_339 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_338, 3), kwargs = {})
%mul_186 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_411, %unsqueeze_339), kwargs = {})
%add_268 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_186, %unsqueeze_336), kwargs = {})
%_to_copy_271 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_268,), kwargs = {dtype: torch.float16})
%_to_copy_272 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_219,), kwargs = {dtype: torch.float16})
%_to_copy_273 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_85,), kwargs = {dtype: torch.float16})
%squeeze_98 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_272, [2, 3]), kwargs = {})
%squeeze_99 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_273, [2, 3]), kwargs = {})
%detach_122 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_98,), kwargs = {})
%detach_123 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_99,), kwargs = {})
%silu_57 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_271,), kwargs = {})
%_param_constant536 : [#users=1] = get_attr[target=_param_constant536]
%_param_constant537 : [#users=1] = get_attr[target=_param_constant537]
%convolution_53 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_57, %_param_constant536, %_param_constant537, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant538 : [#users=1] = get_attr[target=_param_constant538]
%_param_constant539 : [#users=1] = get_attr[target=_param_constant539]
%convolution_54 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_10, %_param_constant538, %_param_constant539, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_269 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_54, %convolution_53), kwargs = {})
%div_19 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_269, 1.0), kwargs = {})
%view_412 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_19, [2, 32, 20, 2304]), kwargs = {})
%_to_copy_274 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_412,), kwargs = {dtype: torch.float32})
%var_mean_86 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_274, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_220 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_86, 0), kwargs = {})
%getitem_221 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_86, 1), kwargs = {})
%add_270 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_220, 1e-06), kwargs = {})
%rsqrt_86 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_270,), kwargs = {})
%sub_86 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_412, %getitem_221), kwargs = {})
%mul_187 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_86, %rsqrt_86), kwargs = {})
%view_413 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_187, [2, 640, 48, 48]), kwargs = {})
%_param_constant540 : [#users=1] = get_attr[target=_param_constant540]
%unsqueeze_340 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant540, 0), kwargs = {})
%unsqueeze_341 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_340, 2), kwargs = {})
%unsqueeze_342 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_341, 3), kwargs = {})
%_param_constant541 : [#users=1] = get_attr[target=_param_constant541]
%unsqueeze_343 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant541, 0), kwargs = {})
%unsqueeze_344 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_343, 2), kwargs = {})
%unsqueeze_345 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_344, 3), kwargs = {})
%mul_188 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_413, %unsqueeze_345), kwargs = {})
%add_271 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_188, %unsqueeze_342), kwargs = {})
%_to_copy_275 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_271,), kwargs = {dtype: torch.float16})
%_to_copy_276 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_221,), kwargs = {dtype: torch.float16})
%_to_copy_277 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_86,), kwargs = {dtype: torch.float16})
%squeeze_100 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_276, [2, 3]), kwargs = {})
%squeeze_101 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_277, [2, 3]), kwargs = {})
%detach_124 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_100,), kwargs = {})
%detach_125 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_101,), kwargs = {})
%permute_24 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_275, [0, 2, 3, 1]), kwargs = {})
%view_414 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_24, [2, 2304, 640]), kwargs = {})
%_param_constant542 : [#users=1] = get_attr[target=_param_constant542]
%t_165 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant542,), kwargs = {})
%clone_24 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_414,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_96 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_24, [4608, 640]), kwargs = {})
%mm_84 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_96, %t_165), kwargs = {})
%_unsafe_view_97 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_84, [2, 2304, 640]), kwargs = {})
%_param_constant543 : [#users=1] = get_attr[target=_param_constant543]
%add_272 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_97, %_param_constant543), kwargs = {})
%_to_copy_278 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_272,), kwargs = {dtype: torch.float32})
%var_mean_87 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_278, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_222 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_87, 0), kwargs = {})
%getitem_223 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_87, 1), kwargs = {})
%add_273 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_222, 1e-05), kwargs = {})
%rsqrt_87 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_273,), kwargs = {})
%sub_87 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_272, %getitem_223), kwargs = {})
%mul_189 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_87, %rsqrt_87), kwargs = {})
%_param_constant544 : [#users=1] = get_attr[target=_param_constant544]
%mul_190 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_189, %_param_constant544), kwargs = {})
%_param_constant545 : [#users=1] = get_attr[target=_param_constant545]
%add_274 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_190, %_param_constant545), kwargs = {})
%_to_copy_279 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_274,), kwargs = {dtype: torch.float16})
%_param_constant546 : [#users=1] = get_attr[target=_param_constant546]
%t_166 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant546,), kwargs = {})
%view_415 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_279, [4608, 640]), kwargs = {})
%mm_85 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_415, %t_166), kwargs = {})
%_unsafe_view_98 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_85, [2, 2304, 640]), kwargs = {})
%_param_constant547 : [#users=1] = get_attr[target=_param_constant547]
%t_167 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant547,), kwargs = {})
%view_416 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_279, [4608, 640]), kwargs = {})
%mm_86 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_416, %t_167), kwargs = {})
%_unsafe_view_99 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_86, [2, 2304, 640]), kwargs = {})
%_param_constant548 : [#users=1] = get_attr[target=_param_constant548]
%t_168 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant548,), kwargs = {})
%view_417 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_279, [4608, 640]), kwargs = {})
%mm_87 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_417, %t_168), kwargs = {})
%_unsafe_view_100 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_87, [2, 2304, 640]), kwargs = {})
%view_418 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_98, [2, -1, 10, 64]), kwargs = {})
%transpose_96 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_418, 1, 2), kwargs = {})
%view_419 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_99, [2, -1, 10, 64]), kwargs = {})
%transpose_97 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_419, 1, 2), kwargs = {})
%view_420 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_100, [2, -1, 10, 64]), kwargs = {})
%transpose_98 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_420, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_24 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_96, %transpose_97, %transpose_98, True), kwargs = {})
%getitem_224 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_24, 0), kwargs = {})
%getitem_225 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_24, 1), kwargs = {})
%detach_126 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_224,), kwargs = {})
%transpose_99 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_224, 1, 2), kwargs = {})
%view_421 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_99, [2, -1, 640]), kwargs = {})
%_param_constant549 : [#users=1] = get_attr[target=_param_constant549]
%t_169 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant549,), kwargs = {})
%view_422 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_421, [4608, 640]), kwargs = {})
%_param_constant550 : [#users=1] = get_attr[target=_param_constant550]
%addmm_81 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant550, %view_422, %t_169), kwargs = {})
%view_423 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_81, [2, 2304, 640]), kwargs = {})
%add_275 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_423, %add_272), kwargs = {})
%_to_copy_280 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_275,), kwargs = {dtype: torch.float32})
%var_mean_88 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_280, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_226 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_88, 0), kwargs = {})
%getitem_227 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_88, 1), kwargs = {})
%add_276 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_226, 1e-05), kwargs = {})
%rsqrt_88 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_276,), kwargs = {})
%sub_88 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_275, %getitem_227), kwargs = {})
%mul_191 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_88, %rsqrt_88), kwargs = {})
%_param_constant551 : [#users=1] = get_attr[target=_param_constant551]
%mul_192 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_191, %_param_constant551), kwargs = {})
%_param_constant552 : [#users=1] = get_attr[target=_param_constant552]
%add_277 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_192, %_param_constant552), kwargs = {})
%_to_copy_281 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_277,), kwargs = {dtype: torch.float16})
%_param_constant553 : [#users=1] = get_attr[target=_param_constant553]
%t_170 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant553,), kwargs = {})
%view_424 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_281, [4608, 640]), kwargs = {})
%mm_88 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_424, %t_170), kwargs = {})
%_unsafe_view_101 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_88, [2, 2304, 640]), kwargs = {})
%_param_constant554 : [#users=1] = get_attr[target=_param_constant554]
%t_171 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant554,), kwargs = {})
%view_425 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_89 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_425, %t_171), kwargs = {})
%_unsafe_view_102 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_89, [2, 77, 640]), kwargs = {})
%_param_constant555 : [#users=1] = get_attr[target=_param_constant555]
%t_172 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant555,), kwargs = {})
%view_426 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_90 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_426, %t_172), kwargs = {})
%_unsafe_view_103 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_90, [2, 77, 640]), kwargs = {})
%view_427 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_101, [2, -1, 10, 64]), kwargs = {})
%transpose_100 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_427, 1, 2), kwargs = {})
%view_428 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_102, [2, -1, 10, 64]), kwargs = {})
%transpose_101 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_428, 1, 2), kwargs = {})
%view_429 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_103, [2, -1, 10, 64]), kwargs = {})
%transpose_102 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_429, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_25 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_100, %transpose_101, %transpose_102, True), kwargs = {})
%getitem_228 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_25, 0), kwargs = {})
%getitem_229 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_25, 1), kwargs = {})
%detach_127 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_228,), kwargs = {})
%transpose_103 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_228, 1, 2), kwargs = {})
%view_430 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_103, [2, -1, 640]), kwargs = {})
%_param_constant556 : [#users=1] = get_attr[target=_param_constant556]
%t_173 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant556,), kwargs = {})
%view_431 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_430, [4608, 640]), kwargs = {})
%_param_constant557 : [#users=1] = get_attr[target=_param_constant557]
%addmm_82 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant557, %view_431, %t_173), kwargs = {})
%view_432 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_82, [2, 2304, 640]), kwargs = {})
%add_278 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_432, %add_275), kwargs = {})
%_to_copy_282 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_278,), kwargs = {dtype: torch.float32})
%var_mean_89 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_282, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_230 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_89, 0), kwargs = {})
%getitem_231 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_89, 1), kwargs = {})
%add_279 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_230, 1e-05), kwargs = {})
%rsqrt_89 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_279,), kwargs = {})
%sub_89 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_278, %getitem_231), kwargs = {})
%mul_193 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_89, %rsqrt_89), kwargs = {})
%_param_constant558 : [#users=1] = get_attr[target=_param_constant558]
%mul_194 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_193, %_param_constant558), kwargs = {})
%_param_constant559 : [#users=1] = get_attr[target=_param_constant559]
%add_280 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_194, %_param_constant559), kwargs = {})
%_to_copy_283 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_280,), kwargs = {dtype: torch.float16})
%_param_constant560 : [#users=1] = get_attr[target=_param_constant560]
%t_174 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant560,), kwargs = {})
%view_433 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_283, [4608, 640]), kwargs = {})
%_param_constant561 : [#users=1] = get_attr[target=_param_constant561]
%addmm_83 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant561, %view_433, %t_174), kwargs = {})
%view_434 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_83, [2, 2304, 5120]), kwargs = {})
%slice_69 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_434, -1, 0, 2560), kwargs = {})
%slice_70 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_434, -1, 2560, 5120), kwargs = {})
%gelu_12 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_70,), kwargs = {})
%mul_195 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_69, %gelu_12), kwargs = {})
%_param_constant562 : [#users=1] = get_attr[target=_param_constant562]
%t_175 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant562,), kwargs = {})
%view_435 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_195, [4608, 2560]), kwargs = {})
%_param_constant563 : [#users=1] = get_attr[target=_param_constant563]
%addmm_84 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant563, %view_435, %t_175), kwargs = {})
%view_436 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_84, [2, 2304, 640]), kwargs = {})
%add_281 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_436, %add_278), kwargs = {})
%_param_constant564 : [#users=1] = get_attr[target=_param_constant564]
%t_176 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant564,), kwargs = {})
%view_437 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_281, [4608, 640]), kwargs = {})
%_param_constant565 : [#users=1] = get_attr[target=_param_constant565]
%addmm_85 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant565, %view_437, %t_176), kwargs = {})
%view_438 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_85, [2, 2304, 640]), kwargs = {})
%view_439 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_438, [2, 48, 48, 640]), kwargs = {})
%permute_25 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_439, [0, 3, 1, 2]), kwargs = {})
%clone_25 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_25,), kwargs = {memory_format: torch.contiguous_format})
%add_282 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_25, %div_19), kwargs = {})
%upsample_nearest2d_2 : [#users=1] = call_function[target=torch.ops.aten.upsample_nearest2d](args = (%add_282, [96, 96], 2.0, 2.0), kwargs = {})
%_param_constant566 : [#users=1] = get_attr[target=_param_constant566]
%_param_constant567 : [#users=1] = get_attr[target=_param_constant567]
%convolution_55 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%upsample_nearest2d_2, %_param_constant566, %_param_constant567, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%cat_11 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%convolution_55, %add_37], 1), kwargs = {})
%view_440 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_11, [2, 32, 30, 9216]), kwargs = {})
%_to_copy_284 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_440,), kwargs = {dtype: torch.float32})
%var_mean_90 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_284, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_232 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_90, 0), kwargs = {})
%getitem_233 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_90, 1), kwargs = {})
%add_283 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_232, 1e-05), kwargs = {})
%rsqrt_90 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_283,), kwargs = {})
%sub_90 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_440, %getitem_233), kwargs = {})
%mul_196 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_90, %rsqrt_90), kwargs = {})
%view_441 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_196, [2, 960, 96, 96]), kwargs = {})
%_param_constant568 : [#users=1] = get_attr[target=_param_constant568]
%unsqueeze_346 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant568, 0), kwargs = {})
%unsqueeze_347 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_346, 2), kwargs = {})
%unsqueeze_348 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_347, 3), kwargs = {})
%_param_constant569 : [#users=1] = get_attr[target=_param_constant569]
%unsqueeze_349 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant569, 0), kwargs = {})
%unsqueeze_350 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_349, 2), kwargs = {})
%unsqueeze_351 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_350, 3), kwargs = {})
%mul_197 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_441, %unsqueeze_351), kwargs = {})
%add_284 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_197, %unsqueeze_348), kwargs = {})
%_to_copy_285 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_284,), kwargs = {dtype: torch.float16})
%_to_copy_286 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_233,), kwargs = {dtype: torch.float16})
%_to_copy_287 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_90,), kwargs = {dtype: torch.float16})
%squeeze_102 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_286, [2, 3]), kwargs = {})
%squeeze_103 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_287, [2, 3]), kwargs = {})
%detach_128 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_102,), kwargs = {})
%detach_129 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_103,), kwargs = {})
%silu_58 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_285,), kwargs = {})
%_param_constant570 : [#users=1] = get_attr[target=_param_constant570]
%_param_constant571 : [#users=1] = get_attr[target=_param_constant571]
%convolution_56 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_58, %_param_constant570, %_param_constant571, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_59 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant572 : [#users=1] = get_attr[target=_param_constant572]
%t_177 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant572,), kwargs = {})
%_param_constant573 : [#users=1] = get_attr[target=_param_constant573]
%addmm_86 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant573, %silu_59, %t_177), kwargs = {})
%slice_71 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_86, 0, 0, 9223372036854775807), kwargs = {})
%slice_72 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_71, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_352 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_72, 2), kwargs = {})
%unsqueeze_353 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_352, 3), kwargs = {})
%add_285 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_56, %unsqueeze_353), kwargs = {})
%view_442 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_285, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_288 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_442,), kwargs = {dtype: torch.float32})
%var_mean_91 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_288, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_234 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_91, 0), kwargs = {})
%getitem_235 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_91, 1), kwargs = {})
%add_286 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_234, 1e-05), kwargs = {})
%rsqrt_91 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_286,), kwargs = {})
%sub_91 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_442, %getitem_235), kwargs = {})
%mul_198 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_91, %rsqrt_91), kwargs = {})
%view_443 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_198, [2, 320, 96, 96]), kwargs = {})
%_param_constant574 : [#users=1] = get_attr[target=_param_constant574]
%unsqueeze_354 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant574, 0), kwargs = {})
%unsqueeze_355 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_354, 2), kwargs = {})
%unsqueeze_356 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_355, 3), kwargs = {})
%_param_constant575 : [#users=1] = get_attr[target=_param_constant575]
%unsqueeze_357 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant575, 0), kwargs = {})
%unsqueeze_358 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_357, 2), kwargs = {})
%unsqueeze_359 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_358, 3), kwargs = {})
%mul_199 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_443, %unsqueeze_359), kwargs = {})
%add_287 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_199, %unsqueeze_356), kwargs = {})
%_to_copy_289 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_287,), kwargs = {dtype: torch.float16})
%_to_copy_290 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_235,), kwargs = {dtype: torch.float16})
%_to_copy_291 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_91,), kwargs = {dtype: torch.float16})
%squeeze_104 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_290, [2, 3]), kwargs = {})
%squeeze_105 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_291, [2, 3]), kwargs = {})
%detach_130 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_104,), kwargs = {})
%detach_131 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_105,), kwargs = {})
%silu_60 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_289,), kwargs = {})
%_param_constant576 : [#users=1] = get_attr[target=_param_constant576]
%_param_constant577 : [#users=1] = get_attr[target=_param_constant577]
%convolution_57 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_60, %_param_constant576, %_param_constant577, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant578 : [#users=1] = get_attr[target=_param_constant578]
%_param_constant579 : [#users=1] = get_attr[target=_param_constant579]
%convolution_58 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_11, %_param_constant578, %_param_constant579, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_288 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_58, %convolution_57), kwargs = {})
%div_20 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_288, 1.0), kwargs = {})
%view_444 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_20, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_292 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_444,), kwargs = {dtype: torch.float32})
%var_mean_92 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_292, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_236 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_92, 0), kwargs = {})
%getitem_237 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_92, 1), kwargs = {})
%add_289 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_236, 1e-06), kwargs = {})
%rsqrt_92 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_289,), kwargs = {})
%sub_92 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_444, %getitem_237), kwargs = {})
%mul_200 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_92, %rsqrt_92), kwargs = {})
%view_445 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_200, [2, 320, 96, 96]), kwargs = {})
%_param_constant580 : [#users=1] = get_attr[target=_param_constant580]
%unsqueeze_360 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant580, 0), kwargs = {})
%unsqueeze_361 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_360, 2), kwargs = {})
%unsqueeze_362 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_361, 3), kwargs = {})
%_param_constant581 : [#users=1] = get_attr[target=_param_constant581]
%unsqueeze_363 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant581, 0), kwargs = {})
%unsqueeze_364 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_363, 2), kwargs = {})
%unsqueeze_365 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_364, 3), kwargs = {})
%mul_201 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_445, %unsqueeze_365), kwargs = {})
%add_290 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_201, %unsqueeze_362), kwargs = {})
%_to_copy_293 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_290,), kwargs = {dtype: torch.float16})
%_to_copy_294 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_237,), kwargs = {dtype: torch.float16})
%_to_copy_295 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_92,), kwargs = {dtype: torch.float16})
%squeeze_106 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_294, [2, 3]), kwargs = {})
%squeeze_107 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_295, [2, 3]), kwargs = {})
%detach_132 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_106,), kwargs = {})
%detach_133 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_107,), kwargs = {})
%permute_26 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_293, [0, 2, 3, 1]), kwargs = {})
%view_446 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_26, [2, 9216, 320]), kwargs = {})
%_param_constant582 : [#users=1] = get_attr[target=_param_constant582]
%t_178 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant582,), kwargs = {})
%clone_26 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_446,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_104 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_26, [18432, 320]), kwargs = {})
%mm_91 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_104, %t_178), kwargs = {})
%_unsafe_view_105 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_91, [2, 9216, 320]), kwargs = {})
%_param_constant583 : [#users=1] = get_attr[target=_param_constant583]
%add_291 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_105, %_param_constant583), kwargs = {})
%_to_copy_296 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_291,), kwargs = {dtype: torch.float32})
%var_mean_93 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_296, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_238 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_93, 0), kwargs = {})
%getitem_239 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_93, 1), kwargs = {})
%add_292 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_238, 1e-05), kwargs = {})
%rsqrt_93 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_292,), kwargs = {})
%sub_93 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_291, %getitem_239), kwargs = {})
%mul_202 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_93, %rsqrt_93), kwargs = {})
%_param_constant584 : [#users=1] = get_attr[target=_param_constant584]
%mul_203 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_202, %_param_constant584), kwargs = {})
%_param_constant585 : [#users=1] = get_attr[target=_param_constant585]
%add_293 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_203, %_param_constant585), kwargs = {})
%_to_copy_297 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_293,), kwargs = {dtype: torch.float16})
%_param_constant586 : [#users=1] = get_attr[target=_param_constant586]
%t_179 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant586,), kwargs = {})
%view_447 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_297, [18432, 320]), kwargs = {})
%mm_92 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_447, %t_179), kwargs = {})
%_unsafe_view_106 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_92, [2, 9216, 320]), kwargs = {})
%_param_constant587 : [#users=1] = get_attr[target=_param_constant587]
%t_180 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant587,), kwargs = {})
%view_448 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_297, [18432, 320]), kwargs = {})
%mm_93 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_448, %t_180), kwargs = {})
%_unsafe_view_107 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_93, [2, 9216, 320]), kwargs = {})
%_param_constant588 : [#users=1] = get_attr[target=_param_constant588]
%t_181 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant588,), kwargs = {})
%view_449 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_297, [18432, 320]), kwargs = {})
%mm_94 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_449, %t_181), kwargs = {})
%_unsafe_view_108 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_94, [2, 9216, 320]), kwargs = {})
%view_450 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_106, [2, -1, 5, 64]), kwargs = {})
%transpose_104 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_450, 1, 2), kwargs = {})
%view_451 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_107, [2, -1, 5, 64]), kwargs = {})
%transpose_105 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_451, 1, 2), kwargs = {})
%view_452 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_108, [2, -1, 5, 64]), kwargs = {})
%transpose_106 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_452, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_26 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_104, %transpose_105, %transpose_106, True), kwargs = {})
%getitem_240 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_26, 0), kwargs = {})
%getitem_241 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_26, 1), kwargs = {})
%detach_134 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_240,), kwargs = {})
%transpose_107 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_240, 1, 2), kwargs = {})
%view_453 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_107, [2, -1, 320]), kwargs = {})
%_param_constant589 : [#users=1] = get_attr[target=_param_constant589]
%t_182 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant589,), kwargs = {})
%view_454 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_453, [18432, 320]), kwargs = {})
%_param_constant590 : [#users=1] = get_attr[target=_param_constant590]
%addmm_87 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant590, %view_454, %t_182), kwargs = {})
%view_455 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_87, [2, 9216, 320]), kwargs = {})
%add_294 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_455, %add_291), kwargs = {})
%_to_copy_298 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_294,), kwargs = {dtype: torch.float32})
%var_mean_94 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_298, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_242 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_94, 0), kwargs = {})
%getitem_243 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_94, 1), kwargs = {})
%add_295 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_242, 1e-05), kwargs = {})
%rsqrt_94 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_295,), kwargs = {})
%sub_94 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_294, %getitem_243), kwargs = {})
%mul_204 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_94, %rsqrt_94), kwargs = {})
%_param_constant591 : [#users=1] = get_attr[target=_param_constant591]
%mul_205 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_204, %_param_constant591), kwargs = {})
%_param_constant592 : [#users=1] = get_attr[target=_param_constant592]
%add_296 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_205, %_param_constant592), kwargs = {})
%_to_copy_299 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_296,), kwargs = {dtype: torch.float16})
%_param_constant593 : [#users=1] = get_attr[target=_param_constant593]
%t_183 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant593,), kwargs = {})
%view_456 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_299, [18432, 320]), kwargs = {})
%mm_95 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_456, %t_183), kwargs = {})
%_unsafe_view_109 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_95, [2, 9216, 320]), kwargs = {})
%_param_constant594 : [#users=1] = get_attr[target=_param_constant594]
%t_184 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant594,), kwargs = {})
%view_457 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_96 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_457, %t_184), kwargs = {})
%_unsafe_view_110 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_96, [2, 77, 320]), kwargs = {})
%_param_constant595 : [#users=1] = get_attr[target=_param_constant595]
%t_185 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant595,), kwargs = {})
%view_458 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_97 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_458, %t_185), kwargs = {})
%_unsafe_view_111 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_97, [2, 77, 320]), kwargs = {})
%view_459 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_109, [2, -1, 5, 64]), kwargs = {})
%transpose_108 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_459, 1, 2), kwargs = {})
%view_460 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_110, [2, -1, 5, 64]), kwargs = {})
%transpose_109 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_460, 1, 2), kwargs = {})
%view_461 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_111, [2, -1, 5, 64]), kwargs = {})
%transpose_110 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_461, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_27 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_108, %transpose_109, %transpose_110, True), kwargs = {})
%getitem_244 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_27, 0), kwargs = {})
%getitem_245 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_27, 1), kwargs = {})
%detach_135 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_244,), kwargs = {})
%transpose_111 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_244, 1, 2), kwargs = {})
%view_462 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_111, [2, -1, 320]), kwargs = {})
%_param_constant596 : [#users=1] = get_attr[target=_param_constant596]
%t_186 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant596,), kwargs = {})
%view_463 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_462, [18432, 320]), kwargs = {})
%_param_constant597 : [#users=1] = get_attr[target=_param_constant597]
%addmm_88 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant597, %view_463, %t_186), kwargs = {})
%view_464 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_88, [2, 9216, 320]), kwargs = {})
%add_297 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_464, %add_294), kwargs = {})
%_to_copy_300 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_297,), kwargs = {dtype: torch.float32})
%var_mean_95 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_300, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_246 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_95, 0), kwargs = {})
%getitem_247 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_95, 1), kwargs = {})
%add_298 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_246, 1e-05), kwargs = {})
%rsqrt_95 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_298,), kwargs = {})
%sub_95 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_297, %getitem_247), kwargs = {})
%mul_206 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_95, %rsqrt_95), kwargs = {})
%_param_constant598 : [#users=1] = get_attr[target=_param_constant598]
%mul_207 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_206, %_param_constant598), kwargs = {})
%_param_constant599 : [#users=1] = get_attr[target=_param_constant599]
%add_299 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_207, %_param_constant599), kwargs = {})
%_to_copy_301 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_299,), kwargs = {dtype: torch.float16})
%_param_constant600 : [#users=1] = get_attr[target=_param_constant600]
%t_187 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant600,), kwargs = {})
%view_465 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_301, [18432, 320]), kwargs = {})
%_param_constant601 : [#users=1] = get_attr[target=_param_constant601]
%addmm_89 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant601, %view_465, %t_187), kwargs = {})
%view_466 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_89, [2, 9216, 2560]), kwargs = {})
%slice_73 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_466, -1, 0, 1280), kwargs = {})
%slice_74 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_466, -1, 1280, 2560), kwargs = {})
%gelu_13 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_74,), kwargs = {})
%mul_208 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_73, %gelu_13), kwargs = {})
%_param_constant602 : [#users=1] = get_attr[target=_param_constant602]
%t_188 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant602,), kwargs = {})
%view_467 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_208, [18432, 1280]), kwargs = {})
%_param_constant603 : [#users=1] = get_attr[target=_param_constant603]
%addmm_90 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant603, %view_467, %t_188), kwargs = {})
%view_468 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_90, [2, 9216, 320]), kwargs = {})
%add_300 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_468, %add_297), kwargs = {})
%_param_constant604 : [#users=1] = get_attr[target=_param_constant604]
%t_189 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant604,), kwargs = {})
%view_469 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_300, [18432, 320]), kwargs = {})
%_param_constant605 : [#users=1] = get_attr[target=_param_constant605]
%addmm_91 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant605, %view_469, %t_189), kwargs = {})
%view_470 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_91, [2, 9216, 320]), kwargs = {})
%view_471 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_470, [2, 96, 96, 320]), kwargs = {})
%permute_27 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_471, [0, 3, 1, 2]), kwargs = {})
%clone_27 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_27,), kwargs = {memory_format: torch.contiguous_format})
%add_301 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_27, %div_20), kwargs = {})
%cat_12 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_301, %add_18], 1), kwargs = {})
%view_472 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_12, [2, 32, 20, 9216]), kwargs = {})
%_to_copy_302 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_472,), kwargs = {dtype: torch.float32})
%var_mean_96 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_302, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_248 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_96, 0), kwargs = {})
%getitem_249 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_96, 1), kwargs = {})
%add_302 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_248, 1e-05), kwargs = {})
%rsqrt_96 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_302,), kwargs = {})
%sub_96 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_472, %getitem_249), kwargs = {})
%mul_209 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_96, %rsqrt_96), kwargs = {})
%view_473 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_209, [2, 640, 96, 96]), kwargs = {})
%_param_constant606 : [#users=1] = get_attr[target=_param_constant606]
%unsqueeze_366 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant606, 0), kwargs = {})
%unsqueeze_367 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_366, 2), kwargs = {})
%unsqueeze_368 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_367, 3), kwargs = {})
%_param_constant607 : [#users=1] = get_attr[target=_param_constant607]
%unsqueeze_369 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant607, 0), kwargs = {})
%unsqueeze_370 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_369, 2), kwargs = {})
%unsqueeze_371 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_370, 3), kwargs = {})
%mul_210 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_473, %unsqueeze_371), kwargs = {})
%add_303 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_210, %unsqueeze_368), kwargs = {})
%_to_copy_303 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_303,), kwargs = {dtype: torch.float16})
%_to_copy_304 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_249,), kwargs = {dtype: torch.float16})
%_to_copy_305 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_96,), kwargs = {dtype: torch.float16})
%squeeze_108 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_304, [2, 3]), kwargs = {})
%squeeze_109 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_305, [2, 3]), kwargs = {})
%detach_136 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_108,), kwargs = {})
%detach_137 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_109,), kwargs = {})
%silu_61 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_303,), kwargs = {})
%_param_constant608 : [#users=1] = get_attr[target=_param_constant608]
%_param_constant609 : [#users=1] = get_attr[target=_param_constant609]
%convolution_59 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_61, %_param_constant608, %_param_constant609, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_62 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant610 : [#users=1] = get_attr[target=_param_constant610]
%t_190 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant610,), kwargs = {})
%_param_constant611 : [#users=1] = get_attr[target=_param_constant611]
%addmm_92 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant611, %silu_62, %t_190), kwargs = {})
%slice_75 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_92, 0, 0, 9223372036854775807), kwargs = {})
%slice_76 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_75, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_372 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_76, 2), kwargs = {})
%unsqueeze_373 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_372, 3), kwargs = {})
%add_304 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_59, %unsqueeze_373), kwargs = {})
%view_474 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_304, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_306 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_474,), kwargs = {dtype: torch.float32})
%var_mean_97 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_306, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_250 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_97, 0), kwargs = {})
%getitem_251 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_97, 1), kwargs = {})
%add_305 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_250, 1e-05), kwargs = {})
%rsqrt_97 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_305,), kwargs = {})
%sub_97 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_474, %getitem_251), kwargs = {})
%mul_211 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_97, %rsqrt_97), kwargs = {})
%view_475 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_211, [2, 320, 96, 96]), kwargs = {})
%_param_constant612 : [#users=1] = get_attr[target=_param_constant612]
%unsqueeze_374 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant612, 0), kwargs = {})
%unsqueeze_375 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_374, 2), kwargs = {})
%unsqueeze_376 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_375, 3), kwargs = {})
%_param_constant613 : [#users=1] = get_attr[target=_param_constant613]
%unsqueeze_377 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant613, 0), kwargs = {})
%unsqueeze_378 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_377, 2), kwargs = {})
%unsqueeze_379 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_378, 3), kwargs = {})
%mul_212 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_475, %unsqueeze_379), kwargs = {})
%add_306 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_212, %unsqueeze_376), kwargs = {})
%_to_copy_307 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_306,), kwargs = {dtype: torch.float16})
%_to_copy_308 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_251,), kwargs = {dtype: torch.float16})
%_to_copy_309 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_97,), kwargs = {dtype: torch.float16})
%squeeze_110 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_308, [2, 3]), kwargs = {})
%squeeze_111 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_309, [2, 3]), kwargs = {})
%detach_138 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_110,), kwargs = {})
%detach_139 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_111,), kwargs = {})
%silu_63 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_307,), kwargs = {})
%_param_constant614 : [#users=1] = get_attr[target=_param_constant614]
%_param_constant615 : [#users=1] = get_attr[target=_param_constant615]
%convolution_60 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_63, %_param_constant614, %_param_constant615, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant616 : [#users=1] = get_attr[target=_param_constant616]
%_param_constant617 : [#users=1] = get_attr[target=_param_constant617]
%convolution_61 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_12, %_param_constant616, %_param_constant617, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_307 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_61, %convolution_60), kwargs = {})
%div_21 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_307, 1.0), kwargs = {})
%view_476 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_21, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_310 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_476,), kwargs = {dtype: torch.float32})
%var_mean_98 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_310, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_252 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_98, 0), kwargs = {})
%getitem_253 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_98, 1), kwargs = {})
%add_308 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_252, 1e-06), kwargs = {})
%rsqrt_98 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_308,), kwargs = {})
%sub_98 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_476, %getitem_253), kwargs = {})
%mul_213 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_98, %rsqrt_98), kwargs = {})
%view_477 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_213, [2, 320, 96, 96]), kwargs = {})
%_param_constant618 : [#users=1] = get_attr[target=_param_constant618]
%unsqueeze_380 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant618, 0), kwargs = {})
%unsqueeze_381 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_380, 2), kwargs = {})
%unsqueeze_382 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_381, 3), kwargs = {})
%_param_constant619 : [#users=1] = get_attr[target=_param_constant619]
%unsqueeze_383 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant619, 0), kwargs = {})
%unsqueeze_384 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_383, 2), kwargs = {})
%unsqueeze_385 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_384, 3), kwargs = {})
%mul_214 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_477, %unsqueeze_385), kwargs = {})
%add_309 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_214, %unsqueeze_382), kwargs = {})
%_to_copy_311 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_309,), kwargs = {dtype: torch.float16})
%_to_copy_312 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_253,), kwargs = {dtype: torch.float16})
%_to_copy_313 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_98,), kwargs = {dtype: torch.float16})
%squeeze_112 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_312, [2, 3]), kwargs = {})
%squeeze_113 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_313, [2, 3]), kwargs = {})
%detach_140 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_112,), kwargs = {})
%detach_141 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_113,), kwargs = {})
%permute_28 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_311, [0, 2, 3, 1]), kwargs = {})
%view_478 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_28, [2, 9216, 320]), kwargs = {})
%_param_constant620 : [#users=1] = get_attr[target=_param_constant620]
%t_191 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant620,), kwargs = {})
%clone_28 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_478,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_112 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_28, [18432, 320]), kwargs = {})
%mm_98 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_112, %t_191), kwargs = {})
%_unsafe_view_113 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_98, [2, 9216, 320]), kwargs = {})
%_param_constant621 : [#users=1] = get_attr[target=_param_constant621]
%add_310 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_113, %_param_constant621), kwargs = {})
%_to_copy_314 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_310,), kwargs = {dtype: torch.float32})
%var_mean_99 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_314, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_254 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_99, 0), kwargs = {})
%getitem_255 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_99, 1), kwargs = {})
%add_311 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_254, 1e-05), kwargs = {})
%rsqrt_99 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_311,), kwargs = {})
%sub_99 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_310, %getitem_255), kwargs = {})
%mul_215 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_99, %rsqrt_99), kwargs = {})
%_param_constant622 : [#users=1] = get_attr[target=_param_constant622]
%mul_216 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_215, %_param_constant622), kwargs = {})
%_param_constant623 : [#users=1] = get_attr[target=_param_constant623]
%add_312 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_216, %_param_constant623), kwargs = {})
%_to_copy_315 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_312,), kwargs = {dtype: torch.float16})
%_param_constant624 : [#users=1] = get_attr[target=_param_constant624]
%t_192 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant624,), kwargs = {})
%view_479 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_315, [18432, 320]), kwargs = {})
%mm_99 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_479, %t_192), kwargs = {})
%_unsafe_view_114 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_99, [2, 9216, 320]), kwargs = {})
%_param_constant625 : [#users=1] = get_attr[target=_param_constant625]
%t_193 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant625,), kwargs = {})
%view_480 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_315, [18432, 320]), kwargs = {})
%mm_100 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_480, %t_193), kwargs = {})
%_unsafe_view_115 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_100, [2, 9216, 320]), kwargs = {})
%_param_constant626 : [#users=1] = get_attr[target=_param_constant626]
%t_194 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant626,), kwargs = {})
%view_481 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_315, [18432, 320]), kwargs = {})
%mm_101 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_481, %t_194), kwargs = {})
%_unsafe_view_116 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_101, [2, 9216, 320]), kwargs = {})
%view_482 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_114, [2, -1, 5, 64]), kwargs = {})
%transpose_112 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_482, 1, 2), kwargs = {})
%view_483 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_115, [2, -1, 5, 64]), kwargs = {})
%transpose_113 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_483, 1, 2), kwargs = {})
%view_484 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_116, [2, -1, 5, 64]), kwargs = {})
%transpose_114 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_484, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_28 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_112, %transpose_113, %transpose_114, True), kwargs = {})
%getitem_256 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_28, 0), kwargs = {})
%getitem_257 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_28, 1), kwargs = {})
%detach_142 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_256,), kwargs = {})
%transpose_115 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_256, 1, 2), kwargs = {})
%view_485 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_115, [2, -1, 320]), kwargs = {})
%_param_constant627 : [#users=1] = get_attr[target=_param_constant627]
%t_195 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant627,), kwargs = {})
%view_486 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_485, [18432, 320]), kwargs = {})
%_param_constant628 : [#users=1] = get_attr[target=_param_constant628]
%addmm_93 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant628, %view_486, %t_195), kwargs = {})
%view_487 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_93, [2, 9216, 320]), kwargs = {})
%add_313 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_487, %add_310), kwargs = {})
%_to_copy_316 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_313,), kwargs = {dtype: torch.float32})
%var_mean_100 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_316, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_258 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_100, 0), kwargs = {})
%getitem_259 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_100, 1), kwargs = {})
%add_314 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_258, 1e-05), kwargs = {})
%rsqrt_100 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_314,), kwargs = {})
%sub_100 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_313, %getitem_259), kwargs = {})
%mul_217 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_100, %rsqrt_100), kwargs = {})
%_param_constant629 : [#users=1] = get_attr[target=_param_constant629]
%mul_218 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_217, %_param_constant629), kwargs = {})
%_param_constant630 : [#users=1] = get_attr[target=_param_constant630]
%add_315 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_218, %_param_constant630), kwargs = {})
%_to_copy_317 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_315,), kwargs = {dtype: torch.float16})
%_param_constant631 : [#users=1] = get_attr[target=_param_constant631]
%t_196 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant631,), kwargs = {})
%view_488 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_317, [18432, 320]), kwargs = {})
%mm_102 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_488, %t_196), kwargs = {})
%_unsafe_view_117 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_102, [2, 9216, 320]), kwargs = {})
%_param_constant632 : [#users=1] = get_attr[target=_param_constant632]
%t_197 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant632,), kwargs = {})
%view_489 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_103 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_489, %t_197), kwargs = {})
%_unsafe_view_118 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_103, [2, 77, 320]), kwargs = {})
%_param_constant633 : [#users=1] = get_attr[target=_param_constant633]
%t_198 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant633,), kwargs = {})
%view_490 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_104 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_490, %t_198), kwargs = {})
%_unsafe_view_119 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_104, [2, 77, 320]), kwargs = {})
%view_491 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_117, [2, -1, 5, 64]), kwargs = {})
%transpose_116 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_491, 1, 2), kwargs = {})
%view_492 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_118, [2, -1, 5, 64]), kwargs = {})
%transpose_117 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_492, 1, 2), kwargs = {})
%view_493 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_119, [2, -1, 5, 64]), kwargs = {})
%transpose_118 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_493, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_29 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_116, %transpose_117, %transpose_118, True), kwargs = {})
%getitem_260 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_29, 0), kwargs = {})
%getitem_261 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_29, 1), kwargs = {})
%detach_143 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_260,), kwargs = {})
%transpose_119 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_260, 1, 2), kwargs = {})
%view_494 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_119, [2, -1, 320]), kwargs = {})
%_param_constant634 : [#users=1] = get_attr[target=_param_constant634]
%t_199 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant634,), kwargs = {})
%view_495 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_494, [18432, 320]), kwargs = {})
%_param_constant635 : [#users=1] = get_attr[target=_param_constant635]
%addmm_94 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant635, %view_495, %t_199), kwargs = {})
%view_496 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_94, [2, 9216, 320]), kwargs = {})
%add_316 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_496, %add_313), kwargs = {})
%_to_copy_318 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_316,), kwargs = {dtype: torch.float32})
%var_mean_101 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_318, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_262 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_101, 0), kwargs = {})
%getitem_263 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_101, 1), kwargs = {})
%add_317 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_262, 1e-05), kwargs = {})
%rsqrt_101 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_317,), kwargs = {})
%sub_101 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_316, %getitem_263), kwargs = {})
%mul_219 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_101, %rsqrt_101), kwargs = {})
%_param_constant636 : [#users=1] = get_attr[target=_param_constant636]
%mul_220 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_219, %_param_constant636), kwargs = {})
%_param_constant637 : [#users=1] = get_attr[target=_param_constant637]
%add_318 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_220, %_param_constant637), kwargs = {})
%_to_copy_319 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_318,), kwargs = {dtype: torch.float16})
%_param_constant638 : [#users=1] = get_attr[target=_param_constant638]
%t_200 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant638,), kwargs = {})
%view_497 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_319, [18432, 320]), kwargs = {})
%_param_constant639 : [#users=1] = get_attr[target=_param_constant639]
%addmm_95 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant639, %view_497, %t_200), kwargs = {})
%view_498 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_95, [2, 9216, 2560]), kwargs = {})
%slice_77 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_498, -1, 0, 1280), kwargs = {})
%slice_78 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_498, -1, 1280, 2560), kwargs = {})
%gelu_14 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_78,), kwargs = {})
%mul_221 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_77, %gelu_14), kwargs = {})
%_param_constant640 : [#users=1] = get_attr[target=_param_constant640]
%t_201 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant640,), kwargs = {})
%view_499 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_221, [18432, 1280]), kwargs = {})
%_param_constant641 : [#users=1] = get_attr[target=_param_constant641]
%addmm_96 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant641, %view_499, %t_201), kwargs = {})
%view_500 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_96, [2, 9216, 320]), kwargs = {})
%add_319 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_500, %add_316), kwargs = {})
%_param_constant642 : [#users=1] = get_attr[target=_param_constant642]
%t_202 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant642,), kwargs = {})
%view_501 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_319, [18432, 320]), kwargs = {})
%_param_constant643 : [#users=1] = get_attr[target=_param_constant643]
%addmm_97 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant643, %view_501, %t_202), kwargs = {})
%view_502 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_97, [2, 9216, 320]), kwargs = {})
%view_503 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_502, [2, 96, 96, 320]), kwargs = {})
%permute_29 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_503, [0, 3, 1, 2]), kwargs = {})
%clone_29 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_29,), kwargs = {memory_format: torch.contiguous_format})
%add_320 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_29, %div_21), kwargs = {})
%cat_13 : [#users=2] = call_function[target=torch.ops.aten.cat](args = ([%add_320, %convolution], 1), kwargs = {})
%view_504 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%cat_13, [2, 32, 20, 9216]), kwargs = {})
%_to_copy_320 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_504,), kwargs = {dtype: torch.float32})
%var_mean_102 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_320, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_264 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_102, 0), kwargs = {})
%getitem_265 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_102, 1), kwargs = {})
%add_321 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_264, 1e-05), kwargs = {})
%rsqrt_102 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_321,), kwargs = {})
%sub_102 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_504, %getitem_265), kwargs = {})
%mul_222 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_102, %rsqrt_102), kwargs = {})
%view_505 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_222, [2, 640, 96, 96]), kwargs = {})
%_param_constant644 : [#users=1] = get_attr[target=_param_constant644]
%unsqueeze_386 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant644, 0), kwargs = {})
%unsqueeze_387 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_386, 2), kwargs = {})
%unsqueeze_388 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_387, 3), kwargs = {})
%_param_constant645 : [#users=1] = get_attr[target=_param_constant645]
%unsqueeze_389 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant645, 0), kwargs = {})
%unsqueeze_390 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_389, 2), kwargs = {})
%unsqueeze_391 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_390, 3), kwargs = {})
%mul_223 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_505, %unsqueeze_391), kwargs = {})
%add_322 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_223, %unsqueeze_388), kwargs = {})
%_to_copy_321 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_322,), kwargs = {dtype: torch.float16})
%_to_copy_322 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_265,), kwargs = {dtype: torch.float16})
%_to_copy_323 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_102,), kwargs = {dtype: torch.float16})
%squeeze_114 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_322, [2, 3]), kwargs = {})
%squeeze_115 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_323, [2, 3]), kwargs = {})
%detach_144 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_114,), kwargs = {})
%detach_145 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_115,), kwargs = {})
%silu_64 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_321,), kwargs = {})
%_param_constant646 : [#users=1] = get_attr[target=_param_constant646]
%_param_constant647 : [#users=1] = get_attr[target=_param_constant647]
%convolution_62 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_64, %_param_constant646, %_param_constant647, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%silu_65 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%addmm_1,), kwargs = {})
%_param_constant648 : [#users=1] = get_attr[target=_param_constant648]
%t_203 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant648,), kwargs = {})
%_param_constant649 : [#users=1] = get_attr[target=_param_constant649]
%addmm_98 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant649, %silu_65, %t_203), kwargs = {})
%slice_79 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%addmm_98, 0, 0, 9223372036854775807), kwargs = {})
%slice_80 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%slice_79, 1, 0, 9223372036854775807), kwargs = {})
%unsqueeze_392 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%slice_80, 2), kwargs = {})
%unsqueeze_393 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_392, 3), kwargs = {})
%add_323 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_62, %unsqueeze_393), kwargs = {})
%view_506 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_323, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_324 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_506,), kwargs = {dtype: torch.float32})
%var_mean_103 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_324, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_266 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_103, 0), kwargs = {})
%getitem_267 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_103, 1), kwargs = {})
%add_324 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_266, 1e-05), kwargs = {})
%rsqrt_103 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_324,), kwargs = {})
%sub_103 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_506, %getitem_267), kwargs = {})
%mul_224 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_103, %rsqrt_103), kwargs = {})
%view_507 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_224, [2, 320, 96, 96]), kwargs = {})
%_param_constant650 : [#users=1] = get_attr[target=_param_constant650]
%unsqueeze_394 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant650, 0), kwargs = {})
%unsqueeze_395 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_394, 2), kwargs = {})
%unsqueeze_396 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_395, 3), kwargs = {})
%_param_constant651 : [#users=1] = get_attr[target=_param_constant651]
%unsqueeze_397 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant651, 0), kwargs = {})
%unsqueeze_398 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_397, 2), kwargs = {})
%unsqueeze_399 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_398, 3), kwargs = {})
%mul_225 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_507, %unsqueeze_399), kwargs = {})
%add_325 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_225, %unsqueeze_396), kwargs = {})
%_to_copy_325 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_325,), kwargs = {dtype: torch.float16})
%_to_copy_326 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_267,), kwargs = {dtype: torch.float16})
%_to_copy_327 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_103,), kwargs = {dtype: torch.float16})
%squeeze_116 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_326, [2, 3]), kwargs = {})
%squeeze_117 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_327, [2, 3]), kwargs = {})
%detach_146 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_116,), kwargs = {})
%detach_147 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_117,), kwargs = {})
%silu_66 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_325,), kwargs = {})
%_param_constant652 : [#users=1] = get_attr[target=_param_constant652]
%_param_constant653 : [#users=1] = get_attr[target=_param_constant653]
%convolution_63 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_66, %_param_constant652, %_param_constant653, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
%_param_constant654 : [#users=1] = get_attr[target=_param_constant654]
%_param_constant655 : [#users=1] = get_attr[target=_param_constant655]
%convolution_64 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%cat_13, %_param_constant654, %_param_constant655, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%add_326 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%convolution_64, %convolution_63), kwargs = {})
%div_22 : [#users=2] = call_function[target=torch.ops.aten.div](args = (%add_326, 1.0), kwargs = {})
%view_508 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%div_22, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_328 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_508,), kwargs = {dtype: torch.float32})
%var_mean_104 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_328, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_268 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_104, 0), kwargs = {})
%getitem_269 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_104, 1), kwargs = {})
%add_327 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_268, 1e-06), kwargs = {})
%rsqrt_104 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_327,), kwargs = {})
%sub_104 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_508, %getitem_269), kwargs = {})
%mul_226 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_104, %rsqrt_104), kwargs = {})
%view_509 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_226, [2, 320, 96, 96]), kwargs = {})
%_param_constant656 : [#users=1] = get_attr[target=_param_constant656]
%unsqueeze_400 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant656, 0), kwargs = {})
%unsqueeze_401 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_400, 2), kwargs = {})
%unsqueeze_402 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_401, 3), kwargs = {})
%_param_constant657 : [#users=1] = get_attr[target=_param_constant657]
%unsqueeze_403 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant657, 0), kwargs = {})
%unsqueeze_404 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_403, 2), kwargs = {})
%unsqueeze_405 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_404, 3), kwargs = {})
%mul_227 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_509, %unsqueeze_405), kwargs = {})
%add_328 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_227, %unsqueeze_402), kwargs = {})
%_to_copy_329 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_328,), kwargs = {dtype: torch.float16})
%_to_copy_330 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_269,), kwargs = {dtype: torch.float16})
%_to_copy_331 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_104,), kwargs = {dtype: torch.float16})
%squeeze_118 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_330, [2, 3]), kwargs = {})
%squeeze_119 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_331, [2, 3]), kwargs = {})
%detach_148 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_118,), kwargs = {})
%detach_149 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_119,), kwargs = {})
%permute_30 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%_to_copy_329, [0, 2, 3, 1]), kwargs = {})
%view_510 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%permute_30, [2, 9216, 320]), kwargs = {})
%_param_constant658 : [#users=1] = get_attr[target=_param_constant658]
%t_204 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant658,), kwargs = {})
%clone_30 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%view_510,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_120 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_30, [18432, 320]), kwargs = {})
%mm_105 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_unsafe_view_120, %t_204), kwargs = {})
%_unsafe_view_121 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_105, [2, 9216, 320]), kwargs = {})
%_param_constant659 : [#users=1] = get_attr[target=_param_constant659]
%add_329 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%_unsafe_view_121, %_param_constant659), kwargs = {})
%_to_copy_332 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_329,), kwargs = {dtype: torch.float32})
%var_mean_105 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_332, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_270 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_105, 0), kwargs = {})
%getitem_271 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_105, 1), kwargs = {})
%add_330 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_270, 1e-05), kwargs = {})
%rsqrt_105 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_330,), kwargs = {})
%sub_105 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_329, %getitem_271), kwargs = {})
%mul_228 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_105, %rsqrt_105), kwargs = {})
%_param_constant660 : [#users=1] = get_attr[target=_param_constant660]
%mul_229 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_228, %_param_constant660), kwargs = {})
%_param_constant661 : [#users=1] = get_attr[target=_param_constant661]
%add_331 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_229, %_param_constant661), kwargs = {})
%_to_copy_333 : [#users=3] = call_function[target=torch.ops.aten._to_copy](args = (%add_331,), kwargs = {dtype: torch.float16})
%_param_constant662 : [#users=1] = get_attr[target=_param_constant662]
%t_205 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant662,), kwargs = {})
%view_511 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_333, [18432, 320]), kwargs = {})
%mm_106 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_511, %t_205), kwargs = {})
%_unsafe_view_122 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_106, [2, 9216, 320]), kwargs = {})
%_param_constant663 : [#users=1] = get_attr[target=_param_constant663]
%t_206 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant663,), kwargs = {})
%view_512 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_333, [18432, 320]), kwargs = {})
%mm_107 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_512, %t_206), kwargs = {})
%_unsafe_view_123 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_107, [2, 9216, 320]), kwargs = {})
%_param_constant664 : [#users=1] = get_attr[target=_param_constant664]
%t_207 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant664,), kwargs = {})
%view_513 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_333, [18432, 320]), kwargs = {})
%mm_108 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_513, %t_207), kwargs = {})
%_unsafe_view_124 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_108, [2, 9216, 320]), kwargs = {})
%view_514 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_122, [2, -1, 5, 64]), kwargs = {})
%transpose_120 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_514, 1, 2), kwargs = {})
%view_515 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_123, [2, -1, 5, 64]), kwargs = {})
%transpose_121 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_515, 1, 2), kwargs = {})
%view_516 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_124, [2, -1, 5, 64]), kwargs = {})
%transpose_122 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_516, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_30 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_120, %transpose_121, %transpose_122, True), kwargs = {})
%getitem_272 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_30, 0), kwargs = {})
%getitem_273 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_30, 1), kwargs = {})
%detach_150 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_272,), kwargs = {})
%transpose_123 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_272, 1, 2), kwargs = {})
%view_517 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_123, [2, -1, 320]), kwargs = {})
%_param_constant665 : [#users=1] = get_attr[target=_param_constant665]
%t_208 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant665,), kwargs = {})
%view_518 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_517, [18432, 320]), kwargs = {})
%_param_constant666 : [#users=1] = get_attr[target=_param_constant666]
%addmm_99 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant666, %view_518, %t_208), kwargs = {})
%view_519 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_99, [2, 9216, 320]), kwargs = {})
%add_332 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_519, %add_329), kwargs = {})
%_to_copy_334 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_332,), kwargs = {dtype: torch.float32})
%var_mean_106 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_334, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_274 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_106, 0), kwargs = {})
%getitem_275 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_106, 1), kwargs = {})
%add_333 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_274, 1e-05), kwargs = {})
%rsqrt_106 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_333,), kwargs = {})
%sub_106 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_332, %getitem_275), kwargs = {})
%mul_230 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_106, %rsqrt_106), kwargs = {})
%_param_constant667 : [#users=1] = get_attr[target=_param_constant667]
%mul_231 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_230, %_param_constant667), kwargs = {})
%_param_constant668 : [#users=1] = get_attr[target=_param_constant668]
%add_334 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_231, %_param_constant668), kwargs = {})
%_to_copy_335 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_334,), kwargs = {dtype: torch.float16})
%_param_constant669 : [#users=1] = get_attr[target=_param_constant669]
%t_209 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant669,), kwargs = {})
%view_520 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_335, [18432, 320]), kwargs = {})
%mm_109 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_520, %t_209), kwargs = {})
%_unsafe_view_125 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_109, [2, 9216, 320]), kwargs = {})
%_param_constant670 : [#users=1] = get_attr[target=_param_constant670]
%t_210 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant670,), kwargs = {})
%view_521 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_110 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_521, %t_210), kwargs = {})
%_unsafe_view_126 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_110, [2, 77, 320]), kwargs = {})
%_param_constant671 : [#users=1] = get_attr[target=_param_constant671]
%t_211 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant671,), kwargs = {})
%view_522 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg2_1, [154, 1024]), kwargs = {})
%mm_111 : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%view_522, %t_211), kwargs = {})
%_unsafe_view_127 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm_111, [2, 77, 320]), kwargs = {})
%view_523 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_125, [2, -1, 5, 64]), kwargs = {})
%transpose_124 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_523, 1, 2), kwargs = {})
%view_524 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_126, [2, -1, 5, 64]), kwargs = {})
%transpose_125 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_524, 1, 2), kwargs = {})
%view_525 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_127, [2, -1, 5, 64]), kwargs = {})
%transpose_126 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%view_525, 1, 2), kwargs = {})
%_scaled_dot_product_efficient_attention_31 : [#users=2] = call_function[target=torch.ops.aten._scaled_dot_product_efficient_attention](args = (%transpose_124, %transpose_125, %transpose_126, True), kwargs = {})
%getitem_276 : [#users=2] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_31, 0), kwargs = {})
%getitem_277 : [#users=0] = call_function[target=operator.getitem](args = (%_scaled_dot_product_efficient_attention_31, 1), kwargs = {})
%detach_151 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%getitem_276,), kwargs = {})
%transpose_127 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%getitem_276, 1, 2), kwargs = {})
%view_526 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%transpose_127, [2, -1, 320]), kwargs = {})
%_param_constant672 : [#users=1] = get_attr[target=_param_constant672]
%t_212 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant672,), kwargs = {})
%view_527 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_526, [18432, 320]), kwargs = {})
%_param_constant673 : [#users=1] = get_attr[target=_param_constant673]
%addmm_100 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant673, %view_527, %t_212), kwargs = {})
%view_528 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_100, [2, 9216, 320]), kwargs = {})
%add_335 : [#users=3] = call_function[target=torch.ops.aten.add](args = (%view_528, %add_332), kwargs = {})
%_to_copy_336 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_335,), kwargs = {dtype: torch.float32})
%var_mean_107 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_336, [2]), kwargs = {correction: 0, keepdim: True})
%getitem_278 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_107, 0), kwargs = {})
%getitem_279 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_107, 1), kwargs = {})
%add_336 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_278, 1e-05), kwargs = {})
%rsqrt_107 : [#users=1] = call_function[target=torch.ops.aten.rsqrt](args = (%add_336,), kwargs = {})
%sub_107 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%add_335, %getitem_279), kwargs = {})
%mul_232 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_107, %rsqrt_107), kwargs = {})
%_param_constant674 : [#users=1] = get_attr[target=_param_constant674]
%mul_233 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_232, %_param_constant674), kwargs = {})
%_param_constant675 : [#users=1] = get_attr[target=_param_constant675]
%add_337 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_233, %_param_constant675), kwargs = {})
%_to_copy_337 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_337,), kwargs = {dtype: torch.float16})
%_param_constant676 : [#users=1] = get_attr[target=_param_constant676]
%t_213 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant676,), kwargs = {})
%view_529 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_to_copy_337, [18432, 320]), kwargs = {})
%_param_constant677 : [#users=1] = get_attr[target=_param_constant677]
%addmm_101 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant677, %view_529, %t_213), kwargs = {})
%view_530 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%addmm_101, [2, 9216, 2560]), kwargs = {})
%slice_81 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_530, -1, 0, 1280), kwargs = {})
%slice_82 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%view_530, -1, 1280, 2560), kwargs = {})
%gelu_15 : [#users=1] = call_function[target=torch.ops.aten.gelu](args = (%slice_82,), kwargs = {})
%mul_234 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%slice_81, %gelu_15), kwargs = {})
%_param_constant678 : [#users=1] = get_attr[target=_param_constant678]
%t_214 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant678,), kwargs = {})
%view_531 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_234, [18432, 1280]), kwargs = {})
%_param_constant679 : [#users=1] = get_attr[target=_param_constant679]
%addmm_102 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant679, %view_531, %t_214), kwargs = {})
%view_532 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_102, [2, 9216, 320]), kwargs = {})
%add_338 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%view_532, %add_335), kwargs = {})
%_param_constant680 : [#users=1] = get_attr[target=_param_constant680]
%t_215 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant680,), kwargs = {})
%view_533 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_338, [18432, 320]), kwargs = {})
%_param_constant681 : [#users=1] = get_attr[target=_param_constant681]
%addmm_103 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant681, %view_533, %t_215), kwargs = {})
%view_534 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_103, [2, 9216, 320]), kwargs = {})
%view_535 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%view_534, [2, 96, 96, 320]), kwargs = {})
%permute_31 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_535, [0, 3, 1, 2]), kwargs = {})
%clone_31 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_31,), kwargs = {memory_format: torch.contiguous_format})
%add_339 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%clone_31, %div_22), kwargs = {})
%view_536 : [#users=2] = call_function[target=torch.ops.aten.view](args = (%add_339, [2, 32, 10, 9216]), kwargs = {})
%_to_copy_338 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%view_536,), kwargs = {dtype: torch.float32})
%var_mean_108 : [#users=2] = call_function[target=torch.ops.aten.var_mean](args = (%_to_copy_338, [2, 3]), kwargs = {correction: 0, keepdim: True})
%getitem_280 : [#users=1] = call_function[target=operator.getitem](args = (%var_mean_108, 0), kwargs = {})
%getitem_281 : [#users=2] = call_function[target=operator.getitem](args = (%var_mean_108, 1), kwargs = {})
%add_340 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%getitem_280, 1e-05), kwargs = {})
%rsqrt_108 : [#users=2] = call_function[target=torch.ops.aten.rsqrt](args = (%add_340,), kwargs = {})
%sub_108 : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%view_536, %getitem_281), kwargs = {})
%mul_235 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub_108, %rsqrt_108), kwargs = {})
%view_537 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_235, [2, 320, 96, 96]), kwargs = {})
%_param_constant682 : [#users=1] = get_attr[target=_param_constant682]
%unsqueeze_406 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant682, 0), kwargs = {})
%unsqueeze_407 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_406, 2), kwargs = {})
%unsqueeze_408 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_407, 3), kwargs = {})
%_param_constant683 : [#users=1] = get_attr[target=_param_constant683]
%unsqueeze_409 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%_param_constant683, 0), kwargs = {})
%unsqueeze_410 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_409, 2), kwargs = {})
%unsqueeze_411 : [#users=1] = call_function[target=torch.ops.aten.unsqueeze](args = (%unsqueeze_410, 3), kwargs = {})
%mul_236 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_537, %unsqueeze_411), kwargs = {})
%add_341 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_236, %unsqueeze_408), kwargs = {})
%_to_copy_339 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%add_341,), kwargs = {dtype: torch.float16})
%_to_copy_340 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%getitem_281,), kwargs = {dtype: torch.float16})
%_to_copy_341 : [#users=1] = call_function[target=torch.ops.aten._to_copy](args = (%rsqrt_108,), kwargs = {dtype: torch.float16})
%squeeze_120 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_340, [2, 3]), kwargs = {})
%squeeze_121 : [#users=1] = call_function[target=torch.ops.aten.squeeze](args = (%_to_copy_341, [2, 3]), kwargs = {})
%detach_152 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_120,), kwargs = {})
%detach_153 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%squeeze_121,), kwargs = {})
%silu_67 : [#users=1] = call_function[target=torch.ops.aten.silu](args = (%_to_copy_339,), kwargs = {})
%_param_constant684 : [#users=1] = get_attr[target=_param_constant684]
%_param_constant685 : [#users=1] = get_attr[target=_param_constant685]
%convolution_65 : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%silu_67, %_param_constant684, %_param_constant685, [1, 1], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {})
return convolution_65
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment