Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active September 26, 2022 16:34
Show Gist options
  • Save AmosLewis/88d7d29e7ddfe9319a1aaa9bb8bb24cc to your computer and use it in GitHub Desktop.
Save AmosLewis/88d7d29e7ddfe9319a1aaa9bb8bb24cc to your computer and use it in GitHub Desktop.
bloomsymbolic
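# The graph dump below appears to be the output of the following snippet
# (`fx_g` is the module being traced; it is not defined in this snippet):
#
#   from torch.fx import symbolic_trace
#   # Symbolic tracing frontend - captures the semantics of the module.
#   symbolic_traced: torch.fx.GraphModule = symbolic_trace(fx_g)
#   # High-level intermediate representation (IR) - the graph representation.
#   print(symbolic_traced.graph)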
graph():
%arg0_1 : [#users=2] = placeholder[target=arg0_1]
%view : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg0_1, [-1, 128]), kwargs = {})
%_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
%embedding : [#users=1] = call_function[target=torch.ops.aten.embedding](args = (%_param_constant0, %view), kwargs = {})
%_param_constant1 : [#users=1] = get_attr[target=_param_constant1]
%_param_constant2 : [#users=1] = get_attr[target=_param_constant2]
%native_layer_norm : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%embedding, [1024], %_param_constant1, %_param_constant2, 1e-05), kwargs = {})
%getitem : [#users=2] = call_function[target=operator.getitem](args = (%native_layer_norm, 0), kwargs = {})
%getitem_1 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm, 1), kwargs = {})
%getitem_2 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm, 2), kwargs = {})
%_param_constant3 : [#users=1] = get_attr[target=_param_constant3]
%_param_constant4 : [#users=1] = get_attr[target=_param_constant4]
%native_layer_norm_1 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%getitem, [1024], %_param_constant3, %_param_constant4, 1e-05), kwargs = {})
%getitem_3 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_1, 0), kwargs = {})
%getitem_4 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_1, 1), kwargs = {})
%getitem_5 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_1, 2), kwargs = {})
%_param_constant5 : [#users=1] = get_attr[target=_param_constant5]
%t : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant5,), kwargs = {})
%view_1 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_3, [128, 1024]), kwargs = {})
%_param_constant6 : [#users=1] = get_attr[target=_param_constant6]
%addmm : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant6, %view_1, %t), kwargs = {})
%view_2 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm, [1, 128, 3072]), kwargs = {})
%_reshape_alias : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_2, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_1 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias, -1, 0, 64), kwargs = {})
%slice_2 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias, -1, 64, 128), kwargs = {})
%slice_3 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias, -1, 128, 192), kwargs = {})
%transpose : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_1, 1, 2), kwargs = {})
%_reshape_alias_1 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_2, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_2 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_1, %_reshape_alias_2), kwargs = {})
%mul : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm, 0.125), kwargs = {})
%_tensor_constant25 : [#users=1] = get_attr[target=_tensor_constant25]
%add : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul, %_tensor_constant25), kwargs = {})
%view_3 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add, [-1, 16, 128, 128]), kwargs = {})
%mul_1 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_3, 1), kwargs = {})
%_tensor_constant26 : [#users=1] = get_attr[target=_tensor_constant26]
%add_1 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_1, %_tensor_constant26), kwargs = {})
%_tensor_constant27 : [#users=1] = get_attr[target=_tensor_constant27]
%maximum : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_1, %_tensor_constant27), kwargs = {})
%_softmax : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum, -1, False), kwargs = {})
%detach : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax,), kwargs = {})
%_tensor_constant28 : [#users=1] = get_attr[target=_tensor_constant28]
%mul_2 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax, %_tensor_constant28), kwargs = {})
%view_4 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_2, [16, 128, 128]), kwargs = {})
%transpose_1 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_3, 1, 2), kwargs = {})
%_reshape_alias_3 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_1, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_1 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_4, %_reshape_alias_3), kwargs = {})
%view_5 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_1, [1, 16, 128, 64]), kwargs = {})
%permute_1 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_5, [0, 2, 1, 3]), kwargs = {})
%clone : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone, [1, 128, 1024]), kwargs = {})
%_param_constant7 : [#users=1] = get_attr[target=_param_constant7]
%t_1 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant7,), kwargs = {})
%view_6 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view, [128, 1024]), kwargs = {})
%_param_constant8 : [#users=1] = get_attr[target=_param_constant8]
%addmm_1 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant8, %view_6, %t_1), kwargs = {})
%view_7 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_1, [1, 128, 1024]), kwargs = {})
%add_2 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%getitem, %view_7), kwargs = {})
%_param_constant9 : [#users=1] = get_attr[target=_param_constant9]
%_param_constant10 : [#users=1] = get_attr[target=_param_constant10]
%native_layer_norm_2 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_2, [1024], %_param_constant9, %_param_constant10, 1e-05), kwargs = {})
%getitem_6 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_2, 0), kwargs = {})
%getitem_7 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_2, 1), kwargs = {})
%getitem_8 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_2, 2), kwargs = {})
%_param_constant11 : [#users=1] = get_attr[target=_param_constant11]
%t_2 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant11,), kwargs = {})
%view_8 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_6, [128, 1024]), kwargs = {})
%_param_constant12 : [#users=1] = get_attr[target=_param_constant12]
%addmm_2 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant12, %view_8, %t_2), kwargs = {})
%view_9 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_2, [1, 128, 4096]), kwargs = {})
%mul_3 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_9, 0.5), kwargs = {})
%mul_4 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_9, 0.79788456), kwargs = {})
%mul_5 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_9, 0.044715), kwargs = {})
%mul_6 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_5, %view_9), kwargs = {})
%add_3 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_6, 1), kwargs = {})
%mul_7 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_4, %add_3), kwargs = {})
%tanh : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_7,), kwargs = {})
%detach_1 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh,), kwargs = {})
%add_4 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh, 1.0), kwargs = {})
%mul_8 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_3, %add_4), kwargs = {})
%_param_constant13 : [#users=1] = get_attr[target=_param_constant13]
%t_3 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant13,), kwargs = {})
%view_10 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_8, [128, 4096]), kwargs = {})
%_param_constant14 : [#users=1] = get_attr[target=_param_constant14]
%addmm_3 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant14, %view_10, %t_3), kwargs = {})
%view_11 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_3, [1, 128, 1024]), kwargs = {})
%add_5 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_2, %view_11), kwargs = {})
%_param_constant15 : [#users=1] = get_attr[target=_param_constant15]
%_param_constant16 : [#users=1] = get_attr[target=_param_constant16]
%native_layer_norm_3 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_5, [1024], %_param_constant15, %_param_constant16, 1e-05), kwargs = {})
%getitem_9 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_3, 0), kwargs = {})
%getitem_10 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_3, 1), kwargs = {})
%getitem_11 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_3, 2), kwargs = {})
%_param_constant17 : [#users=1] = get_attr[target=_param_constant17]
%t_4 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant17,), kwargs = {})
%view_12 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_9, [128, 1024]), kwargs = {})
%_param_constant18 : [#users=1] = get_attr[target=_param_constant18]
%addmm_4 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant18, %view_12, %t_4), kwargs = {})
%view_13 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_4, [1, 128, 3072]), kwargs = {})
%_reshape_alias_4 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_13, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_4 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_4, -1, 0, 64), kwargs = {})
%slice_5 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_4, -1, 64, 128), kwargs = {})
%slice_6 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_4, -1, 128, 192), kwargs = {})
%transpose_2 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_4, 1, 2), kwargs = {})
%_reshape_alias_5 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_2, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_2 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_5, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_6 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_2, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_2 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_5, %_reshape_alias_6), kwargs = {})
%mul_9 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_2, 0.125), kwargs = {})
%_tensor_constant29 : [#users=1] = get_attr[target=_tensor_constant29]
%add_6 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_9, %_tensor_constant29), kwargs = {})
%view_14 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_6, [-1, 16, 128, 128]), kwargs = {})
%mul_10 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_14, 1), kwargs = {})
%_tensor_constant26_1 : [#users=1] = get_attr[target=_tensor_constant26]
%add_7 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_10, %_tensor_constant26_1), kwargs = {})
%_tensor_constant30 : [#users=1] = get_attr[target=_tensor_constant30]
%maximum_1 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_7, %_tensor_constant30), kwargs = {})
%_softmax_1 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_1, -1, False), kwargs = {})
%detach_2 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_1,), kwargs = {})
%_tensor_constant31 : [#users=1] = get_attr[target=_tensor_constant31]
%mul_11 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_1, %_tensor_constant31), kwargs = {})
%view_15 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_11, [16, 128, 128]), kwargs = {})
%transpose_3 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_6, 1, 2), kwargs = {})
%_reshape_alias_7 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_3, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_3 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_15, %_reshape_alias_7), kwargs = {})
%view_16 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_3, [1, 16, 128, 64]), kwargs = {})
%permute_3 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_16, [0, 2, 1, 3]), kwargs = {})
%clone_1 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_1 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_1, [1, 128, 1024]), kwargs = {})
%_param_constant19 : [#users=1] = get_attr[target=_param_constant19]
%t_5 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant19,), kwargs = {})
%view_17 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_1, [128, 1024]), kwargs = {})
%_param_constant20 : [#users=1] = get_attr[target=_param_constant20]
%addmm_5 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant20, %view_17, %t_5), kwargs = {})
%view_18 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_5, [1, 128, 1024]), kwargs = {})
%add_8 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_5, %view_18), kwargs = {})
%_param_constant21 : [#users=1] = get_attr[target=_param_constant21]
%_param_constant22 : [#users=1] = get_attr[target=_param_constant22]
%native_layer_norm_4 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_8, [1024], %_param_constant21, %_param_constant22, 1e-05), kwargs = {})
%getitem_12 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_4, 0), kwargs = {})
%getitem_13 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_4, 1), kwargs = {})
%getitem_14 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_4, 2), kwargs = {})
%_param_constant23 : [#users=1] = get_attr[target=_param_constant23]
%t_6 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant23,), kwargs = {})
%view_19 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_12, [128, 1024]), kwargs = {})
%_param_constant24 : [#users=1] = get_attr[target=_param_constant24]
%addmm_6 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant24, %view_19, %t_6), kwargs = {})
%view_20 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_6, [1, 128, 4096]), kwargs = {})
%mul_12 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_20, 0.5), kwargs = {})
%mul_13 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_20, 0.79788456), kwargs = {})
%mul_14 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_20, 0.044715), kwargs = {})
%mul_15 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_14, %view_20), kwargs = {})
%add_9 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_15, 1), kwargs = {})
%mul_16 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_13, %add_9), kwargs = {})
%tanh_1 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_16,), kwargs = {})
%detach_3 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_1,), kwargs = {})
%add_10 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_1, 1.0), kwargs = {})
%mul_17 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_12, %add_10), kwargs = {})
%_param_constant25 : [#users=1] = get_attr[target=_param_constant25]
%t_7 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant25,), kwargs = {})
%view_21 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_17, [128, 4096]), kwargs = {})
%_param_constant26 : [#users=1] = get_attr[target=_param_constant26]
%addmm_7 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant26, %view_21, %t_7), kwargs = {})
%view_22 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_7, [1, 128, 1024]), kwargs = {})
%add_11 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_8, %view_22), kwargs = {})
%_param_constant27 : [#users=1] = get_attr[target=_param_constant27]
%_param_constant28 : [#users=1] = get_attr[target=_param_constant28]
%native_layer_norm_5 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_11, [1024], %_param_constant27, %_param_constant28, 1e-05), kwargs = {})
%getitem_15 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_5, 0), kwargs = {})
%getitem_16 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_5, 1), kwargs = {})
%getitem_17 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_5, 2), kwargs = {})
%_param_constant29 : [#users=1] = get_attr[target=_param_constant29]
%t_8 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant29,), kwargs = {})
%view_23 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_15, [128, 1024]), kwargs = {})
%_param_constant30 : [#users=1] = get_attr[target=_param_constant30]
%addmm_8 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant30, %view_23, %t_8), kwargs = {})
%view_24 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_8, [1, 128, 3072]), kwargs = {})
%_reshape_alias_8 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_24, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_7 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_8, -1, 0, 64), kwargs = {})
%slice_8 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_8, -1, 64, 128), kwargs = {})
%slice_9 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_8, -1, 128, 192), kwargs = {})
%transpose_4 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_7, 1, 2), kwargs = {})
%_reshape_alias_9 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_4, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_4 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_8, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_10 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_4, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_4 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_9, %_reshape_alias_10), kwargs = {})
%mul_18 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_4, 0.0625), kwargs = {})
%_tensor_constant32 : [#users=1] = get_attr[target=_tensor_constant32]
%add_12 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_18, %_tensor_constant32), kwargs = {})
%view_25 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_12, [-1, 16, 128, 128]), kwargs = {})
%mul_19 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_25, 2), kwargs = {})
%_tensor_constant26_2 : [#users=1] = get_attr[target=_tensor_constant26]
%add_13 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_19, %_tensor_constant26_2), kwargs = {})
%_tensor_constant33 : [#users=1] = get_attr[target=_tensor_constant33]
%maximum_2 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_13, %_tensor_constant33), kwargs = {})
%_softmax_2 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_2, -1, False), kwargs = {})
%detach_4 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_2,), kwargs = {})
%_tensor_constant34 : [#users=1] = get_attr[target=_tensor_constant34]
%mul_20 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_2, %_tensor_constant34), kwargs = {})
%view_26 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_20, [16, 128, 128]), kwargs = {})
%transpose_5 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_9, 1, 2), kwargs = {})
%_reshape_alias_11 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_5, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_5 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_26, %_reshape_alias_11), kwargs = {})
%view_27 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_5, [1, 16, 128, 64]), kwargs = {})
%permute_5 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_27, [0, 2, 1, 3]), kwargs = {})
%clone_2 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_5,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_2 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_2, [1, 128, 1024]), kwargs = {})
%_param_constant31 : [#users=1] = get_attr[target=_param_constant31]
%t_9 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant31,), kwargs = {})
%view_28 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_2, [128, 1024]), kwargs = {})
%_param_constant32 : [#users=1] = get_attr[target=_param_constant32]
%addmm_9 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant32, %view_28, %t_9), kwargs = {})
%view_29 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_9, [1, 128, 1024]), kwargs = {})
%add_14 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_11, %view_29), kwargs = {})
%_param_constant33 : [#users=1] = get_attr[target=_param_constant33]
%_param_constant34 : [#users=1] = get_attr[target=_param_constant34]
%native_layer_norm_6 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_14, [1024], %_param_constant33, %_param_constant34, 1e-05), kwargs = {})
%getitem_18 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_6, 0), kwargs = {})
%getitem_19 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_6, 1), kwargs = {})
%getitem_20 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_6, 2), kwargs = {})
%_param_constant35 : [#users=1] = get_attr[target=_param_constant35]
%t_10 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant35,), kwargs = {})
%view_30 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_18, [128, 1024]), kwargs = {})
%_param_constant36 : [#users=1] = get_attr[target=_param_constant36]
%addmm_10 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant36, %view_30, %t_10), kwargs = {})
%view_31 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_10, [1, 128, 4096]), kwargs = {})
%mul_21 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_31, 0.5), kwargs = {})
%mul_22 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_31, 0.79788456), kwargs = {})
%mul_23 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_31, 0.044715), kwargs = {})
%mul_24 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_23, %view_31), kwargs = {})
%add_15 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_24, 1), kwargs = {})
%mul_25 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_22, %add_15), kwargs = {})
%tanh_2 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_25,), kwargs = {})
%detach_5 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_2,), kwargs = {})
%add_16 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_2, 1.0), kwargs = {})
%mul_26 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_21, %add_16), kwargs = {})
%_param_constant37 : [#users=1] = get_attr[target=_param_constant37]
%t_11 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant37,), kwargs = {})
%view_32 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_26, [128, 4096]), kwargs = {})
%_param_constant38 : [#users=1] = get_attr[target=_param_constant38]
%addmm_11 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant38, %view_32, %t_11), kwargs = {})
%view_33 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_11, [1, 128, 1024]), kwargs = {})
%add_17 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_14, %view_33), kwargs = {})
%_param_constant39 : [#users=1] = get_attr[target=_param_constant39]
%_param_constant40 : [#users=1] = get_attr[target=_param_constant40]
%native_layer_norm_7 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_17, [1024], %_param_constant39, %_param_constant40, 1e-05), kwargs = {})
%getitem_21 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_7, 0), kwargs = {})
%getitem_22 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_7, 1), kwargs = {})
%getitem_23 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_7, 2), kwargs = {})
%_param_constant41 : [#users=1] = get_attr[target=_param_constant41]
%t_12 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant41,), kwargs = {})
%view_34 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_21, [128, 1024]), kwargs = {})
%_param_constant42 : [#users=1] = get_attr[target=_param_constant42]
%addmm_12 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant42, %view_34, %t_12), kwargs = {})
%view_35 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_12, [1, 128, 3072]), kwargs = {})
%_reshape_alias_12 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_35, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_10 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_12, -1, 0, 64), kwargs = {})
%slice_11 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_12, -1, 64, 128), kwargs = {})
%slice_12 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_12, -1, 128, 192), kwargs = {})
%transpose_6 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_10, 1, 2), kwargs = {})
%_reshape_alias_13 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_6, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_6 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_11, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_14 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_6, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_6 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_13, %_reshape_alias_14), kwargs = {})
%mul_27 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_6, 0.041666666666666664), kwargs = {})
%_tensor_constant35 : [#users=1] = get_attr[target=_tensor_constant35]
%add_18 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_27, %_tensor_constant35), kwargs = {})
%view_36 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_18, [-1, 16, 128, 128]), kwargs = {})
%mul_28 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_36, 3), kwargs = {})
%_tensor_constant26_3 : [#users=1] = get_attr[target=_tensor_constant26]
%add_19 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_28, %_tensor_constant26_3), kwargs = {})
%_tensor_constant36 : [#users=1] = get_attr[target=_tensor_constant36]
%maximum_3 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_19, %_tensor_constant36), kwargs = {})
%_softmax_3 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_3, -1, False), kwargs = {})
%detach_6 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_3,), kwargs = {})
%_tensor_constant37 : [#users=1] = get_attr[target=_tensor_constant37]
%mul_29 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_3, %_tensor_constant37), kwargs = {})
%view_37 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_29, [16, 128, 128]), kwargs = {})
%transpose_7 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_12, 1, 2), kwargs = {})
%_reshape_alias_15 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_7, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_7 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_37, %_reshape_alias_15), kwargs = {})
%view_38 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_7, [1, 16, 128, 64]), kwargs = {})
%permute_7 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_38, [0, 2, 1, 3]), kwargs = {})
%clone_3 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_7,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_3 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_3, [1, 128, 1024]), kwargs = {})
%_param_constant43 : [#users=1] = get_attr[target=_param_constant43]
%t_13 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant43,), kwargs = {})
%view_39 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_3, [128, 1024]), kwargs = {})
%_param_constant44 : [#users=1] = get_attr[target=_param_constant44]
%addmm_13 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant44, %view_39, %t_13), kwargs = {})
%view_40 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_13, [1, 128, 1024]), kwargs = {})
%add_20 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_17, %view_40), kwargs = {})
%_param_constant45 : [#users=1] = get_attr[target=_param_constant45]
%_param_constant46 : [#users=1] = get_attr[target=_param_constant46]
%native_layer_norm_8 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_20, [1024], %_param_constant45, %_param_constant46, 1e-05), kwargs = {})
%getitem_24 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_8, 0), kwargs = {})
%getitem_25 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_8, 1), kwargs = {})
%getitem_26 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_8, 2), kwargs = {})
%_param_constant47 : [#users=1] = get_attr[target=_param_constant47]
%t_14 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant47,), kwargs = {})
%view_41 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_24, [128, 1024]), kwargs = {})
%_param_constant48 : [#users=1] = get_attr[target=_param_constant48]
%addmm_14 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant48, %view_41, %t_14), kwargs = {})
%view_42 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_14, [1, 128, 4096]), kwargs = {})
%mul_30 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_42, 0.5), kwargs = {})
%mul_31 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_42, 0.79788456), kwargs = {})
%mul_32 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_42, 0.044715), kwargs = {})
%mul_33 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_32, %view_42), kwargs = {})
%add_21 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_33, 1), kwargs = {})
%mul_34 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_31, %add_21), kwargs = {})
%tanh_3 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_34,), kwargs = {})
%detach_7 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_3,), kwargs = {})
%add_22 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_3, 1.0), kwargs = {})
%mul_35 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_30, %add_22), kwargs = {})
%_param_constant49 : [#users=1] = get_attr[target=_param_constant49]
%t_15 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant49,), kwargs = {})
%view_43 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_35, [128, 4096]), kwargs = {})
%_param_constant50 : [#users=1] = get_attr[target=_param_constant50]
%addmm_15 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant50, %view_43, %t_15), kwargs = {})
%view_44 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_15, [1, 128, 1024]), kwargs = {})
%add_23 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_20, %view_44), kwargs = {})
%_param_constant51 : [#users=1] = get_attr[target=_param_constant51]
%_param_constant52 : [#users=1] = get_attr[target=_param_constant52]
%native_layer_norm_9 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_23, [1024], %_param_constant51, %_param_constant52, 1e-05), kwargs = {})
%getitem_27 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_9, 0), kwargs = {})
%getitem_28 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_9, 1), kwargs = {})
%getitem_29 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_9, 2), kwargs = {})
%_param_constant53 : [#users=1] = get_attr[target=_param_constant53]
%t_16 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant53,), kwargs = {})
%view_45 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_27, [128, 1024]), kwargs = {})
%_param_constant54 : [#users=1] = get_attr[target=_param_constant54]
%addmm_16 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant54, %view_45, %t_16), kwargs = {})
%view_46 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_16, [1, 128, 3072]), kwargs = {})
%_reshape_alias_16 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_46, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_13 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_16, -1, 0, 64), kwargs = {})
%slice_14 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_16, -1, 64, 128), kwargs = {})
%slice_15 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_16, -1, 128, 192), kwargs = {})
%transpose_8 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_13, 1, 2), kwargs = {})
%_reshape_alias_17 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_8, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_8 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_14, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_18 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_8, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_8 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_17, %_reshape_alias_18), kwargs = {})
%mul_36 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_8, 0.03125), kwargs = {})
%_tensor_constant38 : [#users=1] = get_attr[target=_tensor_constant38]
%add_24 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_36, %_tensor_constant38), kwargs = {})
%view_47 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_24, [-1, 16, 128, 128]), kwargs = {})
%mul_37 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_47, 4), kwargs = {})
%_tensor_constant26_4 : [#users=1] = get_attr[target=_tensor_constant26]
%add_25 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_37, %_tensor_constant26_4), kwargs = {})
%_tensor_constant39 : [#users=1] = get_attr[target=_tensor_constant39]
%maximum_4 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_25, %_tensor_constant39), kwargs = {})
%_softmax_4 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_4, -1, False), kwargs = {})
%detach_8 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_4,), kwargs = {})
%_tensor_constant40 : [#users=1] = get_attr[target=_tensor_constant40]
%mul_38 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_4, %_tensor_constant40), kwargs = {})
%view_48 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_38, [16, 128, 128]), kwargs = {})
%transpose_9 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_15, 1, 2), kwargs = {})
%_reshape_alias_19 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_9, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_9 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_48, %_reshape_alias_19), kwargs = {})
%view_49 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_9, [1, 16, 128, 64]), kwargs = {})
%permute_9 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_49, [0, 2, 1, 3]), kwargs = {})
%clone_4 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_9,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_4 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_4, [1, 128, 1024]), kwargs = {})
%_param_constant55 : [#users=1] = get_attr[target=_param_constant55]
%t_17 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant55,), kwargs = {})
%view_50 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_4, [128, 1024]), kwargs = {})
%_param_constant56 : [#users=1] = get_attr[target=_param_constant56]
%addmm_17 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant56, %view_50, %t_17), kwargs = {})
%view_51 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_17, [1, 128, 1024]), kwargs = {})
%add_26 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_23, %view_51), kwargs = {})
%_param_constant57 : [#users=1] = get_attr[target=_param_constant57]
%_param_constant58 : [#users=1] = get_attr[target=_param_constant58]
%native_layer_norm_10 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_26, [1024], %_param_constant57, %_param_constant58, 1e-05), kwargs = {})
%getitem_30 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_10, 0), kwargs = {})
%getitem_31 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_10, 1), kwargs = {})
%getitem_32 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_10, 2), kwargs = {})
%_param_constant59 : [#users=1] = get_attr[target=_param_constant59]
%t_18 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant59,), kwargs = {})
%view_52 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_30, [128, 1024]), kwargs = {})
%_param_constant60 : [#users=1] = get_attr[target=_param_constant60]
%addmm_18 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant60, %view_52, %t_18), kwargs = {})
%view_53 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_18, [1, 128, 4096]), kwargs = {})
%mul_39 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_53, 0.5), kwargs = {})
%mul_40 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_53, 0.79788456), kwargs = {})
%mul_41 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_53, 0.044715), kwargs = {})
%mul_42 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_41, %view_53), kwargs = {})
%add_27 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_42, 1), kwargs = {})
%mul_43 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_40, %add_27), kwargs = {})
%tanh_4 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_43,), kwargs = {})
%detach_9 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_4,), kwargs = {})
%add_28 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_4, 1.0), kwargs = {})
%mul_44 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_39, %add_28), kwargs = {})
%_param_constant61 : [#users=1] = get_attr[target=_param_constant61]
%t_19 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant61,), kwargs = {})
%view_54 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_44, [128, 4096]), kwargs = {})
%_param_constant62 : [#users=1] = get_attr[target=_param_constant62]
%addmm_19 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant62, %view_54, %t_19), kwargs = {})
%view_55 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_19, [1, 128, 1024]), kwargs = {})
%add_29 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_26, %view_55), kwargs = {})
%_param_constant63 : [#users=1] = get_attr[target=_param_constant63]
%_param_constant64 : [#users=1] = get_attr[target=_param_constant64]
%native_layer_norm_11 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_29, [1024], %_param_constant63, %_param_constant64, 1e-05), kwargs = {})
%getitem_33 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_11, 0), kwargs = {})
%getitem_34 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_11, 1), kwargs = {})
%getitem_35 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_11, 2), kwargs = {})
%_param_constant65 : [#users=1] = get_attr[target=_param_constant65]
%t_20 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant65,), kwargs = {})
%view_56 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_33, [128, 1024]), kwargs = {})
%_param_constant66 : [#users=1] = get_attr[target=_param_constant66]
%addmm_20 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant66, %view_56, %t_20), kwargs = {})
%view_57 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_20, [1, 128, 3072]), kwargs = {})
%_reshape_alias_20 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_57, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_16 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_20, -1, 0, 64), kwargs = {})
%slice_17 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_20, -1, 64, 128), kwargs = {})
%slice_18 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_20, -1, 128, 192), kwargs = {})
%transpose_10 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_16, 1, 2), kwargs = {})
%_reshape_alias_21 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_10, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_10 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_17, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_22 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_10, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_10 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_21, %_reshape_alias_22), kwargs = {})
%mul_45 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_10, 0.025), kwargs = {})
%_tensor_constant41 : [#users=1] = get_attr[target=_tensor_constant41]
%add_30 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_45, %_tensor_constant41), kwargs = {})
%view_58 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_30, [-1, 16, 128, 128]), kwargs = {})
%mul_46 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_58, 5), kwargs = {})
%_tensor_constant26_5 : [#users=1] = get_attr[target=_tensor_constant26]
%add_31 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_46, %_tensor_constant26_5), kwargs = {})
%_tensor_constant42 : [#users=1] = get_attr[target=_tensor_constant42]
%maximum_5 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_31, %_tensor_constant42), kwargs = {})
%_softmax_5 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_5, -1, False), kwargs = {})
%detach_10 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_5,), kwargs = {})
%_tensor_constant43 : [#users=1] = get_attr[target=_tensor_constant43]
%mul_47 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_5, %_tensor_constant43), kwargs = {})
%view_59 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_47, [16, 128, 128]), kwargs = {})
%transpose_11 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_18, 1, 2), kwargs = {})
%_reshape_alias_23 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_11, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_11 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_59, %_reshape_alias_23), kwargs = {})
%view_60 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_11, [1, 16, 128, 64]), kwargs = {})
%permute_11 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_60, [0, 2, 1, 3]), kwargs = {})
%clone_5 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_11,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_5 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_5, [1, 128, 1024]), kwargs = {})
%_param_constant67 : [#users=1] = get_attr[target=_param_constant67]
%t_21 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant67,), kwargs = {})
%view_61 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_5, [128, 1024]), kwargs = {})
%_param_constant68 : [#users=1] = get_attr[target=_param_constant68]
%addmm_21 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant68, %view_61, %t_21), kwargs = {})
%view_62 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_21, [1, 128, 1024]), kwargs = {})
%add_32 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_29, %view_62), kwargs = {})
%_param_constant69 : [#users=1] = get_attr[target=_param_constant69]
%_param_constant70 : [#users=1] = get_attr[target=_param_constant70]
%native_layer_norm_12 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_32, [1024], %_param_constant69, %_param_constant70, 1e-05), kwargs = {})
%getitem_36 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_12, 0), kwargs = {})
%getitem_37 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_12, 1), kwargs = {})
%getitem_38 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_12, 2), kwargs = {})
%_param_constant71 : [#users=1] = get_attr[target=_param_constant71]
%t_22 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant71,), kwargs = {})
%view_63 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_36, [128, 1024]), kwargs = {})
%_param_constant72 : [#users=1] = get_attr[target=_param_constant72]
%addmm_22 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant72, %view_63, %t_22), kwargs = {})
%view_64 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_22, [1, 128, 4096]), kwargs = {})
%mul_48 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_64, 0.5), kwargs = {})
%mul_49 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_64, 0.79788456), kwargs = {})
%mul_50 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_64, 0.044715), kwargs = {})
%mul_51 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_50, %view_64), kwargs = {})
%add_33 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_51, 1), kwargs = {})
%mul_52 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_49, %add_33), kwargs = {})
%tanh_5 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_52,), kwargs = {})
%detach_11 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_5,), kwargs = {})
%add_34 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_5, 1.0), kwargs = {})
%mul_53 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_48, %add_34), kwargs = {})
%_param_constant73 : [#users=1] = get_attr[target=_param_constant73]
%t_23 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant73,), kwargs = {})
%view_65 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_53, [128, 4096]), kwargs = {})
%_param_constant74 : [#users=1] = get_attr[target=_param_constant74]
%addmm_23 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant74, %view_65, %t_23), kwargs = {})
%view_66 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_23, [1, 128, 1024]), kwargs = {})
%add_35 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_32, %view_66), kwargs = {})
%_param_constant75 : [#users=1] = get_attr[target=_param_constant75]
%_param_constant76 : [#users=1] = get_attr[target=_param_constant76]
%native_layer_norm_13 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_35, [1024], %_param_constant75, %_param_constant76, 1e-05), kwargs = {})
%getitem_39 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_13, 0), kwargs = {})
%getitem_40 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_13, 1), kwargs = {})
%getitem_41 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_13, 2), kwargs = {})
%_param_constant77 : [#users=1] = get_attr[target=_param_constant77]
%t_24 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant77,), kwargs = {})
%view_67 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_39, [128, 1024]), kwargs = {})
%_param_constant78 : [#users=1] = get_attr[target=_param_constant78]
%addmm_24 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant78, %view_67, %t_24), kwargs = {})
%view_68 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_24, [1, 128, 3072]), kwargs = {})
%_reshape_alias_24 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_68, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_19 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_24, -1, 0, 64), kwargs = {})
%slice_20 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_24, -1, 64, 128), kwargs = {})
%slice_21 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_24, -1, 128, 192), kwargs = {})
%transpose_12 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_19, 1, 2), kwargs = {})
%_reshape_alias_25 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_12, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_12 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_20, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_26 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_12, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_12 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_25, %_reshape_alias_26), kwargs = {})
%mul_54 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_12, 0.020833333333333332), kwargs = {})
%_tensor_constant44 : [#users=1] = get_attr[target=_tensor_constant44]
%add_36 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_54, %_tensor_constant44), kwargs = {})
%view_69 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_36, [-1, 16, 128, 128]), kwargs = {})
%mul_55 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_69, 6), kwargs = {})
%_tensor_constant26_6 : [#users=1] = get_attr[target=_tensor_constant26]
%add_37 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_55, %_tensor_constant26_6), kwargs = {})
%_tensor_constant45 : [#users=1] = get_attr[target=_tensor_constant45]
%maximum_6 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_37, %_tensor_constant45), kwargs = {})
%_softmax_6 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_6, -1, False), kwargs = {})
%detach_12 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_6,), kwargs = {})
%_tensor_constant46 : [#users=1] = get_attr[target=_tensor_constant46]
%mul_56 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_6, %_tensor_constant46), kwargs = {})
%view_70 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_56, [16, 128, 128]), kwargs = {})
%transpose_13 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_21, 1, 2), kwargs = {})
%_reshape_alias_27 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_13, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_13 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_70, %_reshape_alias_27), kwargs = {})
%view_71 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_13, [1, 16, 128, 64]), kwargs = {})
%permute_13 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_71, [0, 2, 1, 3]), kwargs = {})
%clone_6 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_13,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_6 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_6, [1, 128, 1024]), kwargs = {})
%_param_constant79 : [#users=1] = get_attr[target=_param_constant79]
%t_25 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant79,), kwargs = {})
%view_72 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_6, [128, 1024]), kwargs = {})
%_param_constant80 : [#users=1] = get_attr[target=_param_constant80]
%addmm_25 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant80, %view_72, %t_25), kwargs = {})
%view_73 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_25, [1, 128, 1024]), kwargs = {})
%add_38 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_35, %view_73), kwargs = {})
%_param_constant81 : [#users=1] = get_attr[target=_param_constant81]
%_param_constant82 : [#users=1] = get_attr[target=_param_constant82]
%native_layer_norm_14 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_38, [1024], %_param_constant81, %_param_constant82, 1e-05), kwargs = {})
%getitem_42 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_14, 0), kwargs = {})
%getitem_43 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_14, 1), kwargs = {})
%getitem_44 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_14, 2), kwargs = {})
%_param_constant83 : [#users=1] = get_attr[target=_param_constant83]
%t_26 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant83,), kwargs = {})
%view_74 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_42, [128, 1024]), kwargs = {})
%_param_constant84 : [#users=1] = get_attr[target=_param_constant84]
%addmm_26 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant84, %view_74, %t_26), kwargs = {})
%view_75 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_26, [1, 128, 4096]), kwargs = {})
%mul_57 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_75, 0.5), kwargs = {})
%mul_58 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_75, 0.79788456), kwargs = {})
%mul_59 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_75, 0.044715), kwargs = {})
%mul_60 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_59, %view_75), kwargs = {})
%add_39 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_60, 1), kwargs = {})
%mul_61 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_58, %add_39), kwargs = {})
%tanh_6 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_61,), kwargs = {})
%detach_13 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_6,), kwargs = {})
%add_40 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_6, 1.0), kwargs = {})
%mul_62 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_57, %add_40), kwargs = {})
%_param_constant85 : [#users=1] = get_attr[target=_param_constant85]
%t_27 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant85,), kwargs = {})
%view_76 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_62, [128, 4096]), kwargs = {})
%_param_constant86 : [#users=1] = get_attr[target=_param_constant86]
%addmm_27 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant86, %view_76, %t_27), kwargs = {})
%view_77 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_27, [1, 128, 1024]), kwargs = {})
%add_41 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_38, %view_77), kwargs = {})
%_param_constant87 : [#users=1] = get_attr[target=_param_constant87]
%_param_constant88 : [#users=1] = get_attr[target=_param_constant88]
%native_layer_norm_15 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_41, [1024], %_param_constant87, %_param_constant88, 1e-05), kwargs = {})
%getitem_45 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_15, 0), kwargs = {})
%getitem_46 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_15, 1), kwargs = {})
%getitem_47 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_15, 2), kwargs = {})
%_param_constant89 : [#users=1] = get_attr[target=_param_constant89]
%t_28 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant89,), kwargs = {})
%view_78 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_45, [128, 1024]), kwargs = {})
%_param_constant90 : [#users=1] = get_attr[target=_param_constant90]
%addmm_28 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant90, %view_78, %t_28), kwargs = {})
%view_79 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_28, [1, 128, 3072]), kwargs = {})
%_reshape_alias_28 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_79, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_22 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_28, -1, 0, 64), kwargs = {})
%slice_23 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_28, -1, 64, 128), kwargs = {})
%slice_24 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_28, -1, 128, 192), kwargs = {})
%transpose_14 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_22, 1, 2), kwargs = {})
%_reshape_alias_29 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_14, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_14 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_23, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_30 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_14, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_14 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_29, %_reshape_alias_30), kwargs = {})
%mul_63 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_14, 0.017857142857142856), kwargs = {})
%_tensor_constant47 : [#users=1] = get_attr[target=_tensor_constant47]
%add_42 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_63, %_tensor_constant47), kwargs = {})
%view_80 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_42, [-1, 16, 128, 128]), kwargs = {})
%mul_64 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_80, 7), kwargs = {})
%_tensor_constant26_7 : [#users=1] = get_attr[target=_tensor_constant26]
%add_43 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_64, %_tensor_constant26_7), kwargs = {})
%_tensor_constant48 : [#users=1] = get_attr[target=_tensor_constant48]
%maximum_7 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_43, %_tensor_constant48), kwargs = {})
%_softmax_7 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_7, -1, False), kwargs = {})
%detach_14 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_7,), kwargs = {})
%_tensor_constant49 : [#users=1] = get_attr[target=_tensor_constant49]
%mul_65 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_7, %_tensor_constant49), kwargs = {})
%view_81 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_65, [16, 128, 128]), kwargs = {})
%transpose_15 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_24, 1, 2), kwargs = {})
%_reshape_alias_31 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_15, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_15 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_81, %_reshape_alias_31), kwargs = {})
%view_82 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_15, [1, 16, 128, 64]), kwargs = {})
%permute_15 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_82, [0, 2, 1, 3]), kwargs = {})
%clone_7 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_15,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_7 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_7, [1, 128, 1024]), kwargs = {})
%_param_constant91 : [#users=1] = get_attr[target=_param_constant91]
%t_29 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant91,), kwargs = {})
%view_83 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_7, [128, 1024]), kwargs = {})
%_param_constant92 : [#users=1] = get_attr[target=_param_constant92]
%addmm_29 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant92, %view_83, %t_29), kwargs = {})
%view_84 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_29, [1, 128, 1024]), kwargs = {})
%add_44 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_41, %view_84), kwargs = {})
%_param_constant93 : [#users=1] = get_attr[target=_param_constant93]
%_param_constant94 : [#users=1] = get_attr[target=_param_constant94]
%native_layer_norm_16 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_44, [1024], %_param_constant93, %_param_constant94, 1e-05), kwargs = {})
%getitem_48 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_16, 0), kwargs = {})
%getitem_49 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_16, 1), kwargs = {})
%getitem_50 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_16, 2), kwargs = {})
%_param_constant95 : [#users=1] = get_attr[target=_param_constant95]
%t_30 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant95,), kwargs = {})
%view_85 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_48, [128, 1024]), kwargs = {})
%_param_constant96 : [#users=1] = get_attr[target=_param_constant96]
%addmm_30 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant96, %view_85, %t_30), kwargs = {})
%view_86 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_30, [1, 128, 4096]), kwargs = {})
%mul_66 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_86, 0.5), kwargs = {})
%mul_67 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_86, 0.79788456), kwargs = {})
%mul_68 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_86, 0.044715), kwargs = {})
%mul_69 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_68, %view_86), kwargs = {})
%add_45 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_69, 1), kwargs = {})
%mul_70 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_67, %add_45), kwargs = {})
%tanh_7 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_70,), kwargs = {})
%detach_15 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_7,), kwargs = {})
%add_46 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_7, 1.0), kwargs = {})
%mul_71 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_66, %add_46), kwargs = {})
%_param_constant97 : [#users=1] = get_attr[target=_param_constant97]
%t_31 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant97,), kwargs = {})
%view_87 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_71, [128, 4096]), kwargs = {})
%_param_constant98 : [#users=1] = get_attr[target=_param_constant98]
%addmm_31 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant98, %view_87, %t_31), kwargs = {})
%view_88 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_31, [1, 128, 1024]), kwargs = {})
%add_47 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_44, %view_88), kwargs = {})
%_param_constant99 : [#users=1] = get_attr[target=_param_constant99]
%_param_constant100 : [#users=1] = get_attr[target=_param_constant100]
%native_layer_norm_17 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_47, [1024], %_param_constant99, %_param_constant100, 1e-05), kwargs = {})
%getitem_51 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_17, 0), kwargs = {})
%getitem_52 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_17, 1), kwargs = {})
%getitem_53 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_17, 2), kwargs = {})
%_param_constant101 : [#users=1] = get_attr[target=_param_constant101]
%t_32 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant101,), kwargs = {})
%view_89 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_51, [128, 1024]), kwargs = {})
%_param_constant102 : [#users=1] = get_attr[target=_param_constant102]
%addmm_32 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant102, %view_89, %t_32), kwargs = {})
%view_90 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_32, [1, 128, 3072]), kwargs = {})
%_reshape_alias_32 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_90, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_25 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_32, -1, 0, 64), kwargs = {})
%slice_26 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_32, -1, 64, 128), kwargs = {})
%slice_27 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_32, -1, 128, 192), kwargs = {})
%transpose_16 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_25, 1, 2), kwargs = {})
%_reshape_alias_33 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_16, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_16 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_26, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_34 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_16, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_16 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_33, %_reshape_alias_34), kwargs = {})
%mul_72 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_16, 0.015625), kwargs = {})
%_tensor_constant50 : [#users=1] = get_attr[target=_tensor_constant50]
%add_48 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_72, %_tensor_constant50), kwargs = {})
%view_91 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_48, [-1, 16, 128, 128]), kwargs = {})
%mul_73 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_91, 8), kwargs = {})
%_tensor_constant26_8 : [#users=1] = get_attr[target=_tensor_constant26]
%add_49 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_73, %_tensor_constant26_8), kwargs = {})
%_tensor_constant51 : [#users=1] = get_attr[target=_tensor_constant51]
%maximum_8 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_49, %_tensor_constant51), kwargs = {})
%_softmax_8 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_8, -1, False), kwargs = {})
%detach_16 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_8,), kwargs = {})
%_tensor_constant52 : [#users=1] = get_attr[target=_tensor_constant52]
%mul_74 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_8, %_tensor_constant52), kwargs = {})
%view_92 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_74, [16, 128, 128]), kwargs = {})
%transpose_17 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_27, 1, 2), kwargs = {})
%_reshape_alias_35 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_17, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_17 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_92, %_reshape_alias_35), kwargs = {})
%view_93 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_17, [1, 16, 128, 64]), kwargs = {})
%permute_17 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_93, [0, 2, 1, 3]), kwargs = {})
%clone_8 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_17,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_8 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_8, [1, 128, 1024]), kwargs = {})
%_param_constant103 : [#users=1] = get_attr[target=_param_constant103]
%t_33 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant103,), kwargs = {})
%view_94 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_8, [128, 1024]), kwargs = {})
%_param_constant104 : [#users=1] = get_attr[target=_param_constant104]
%addmm_33 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant104, %view_94, %t_33), kwargs = {})
%view_95 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_33, [1, 128, 1024]), kwargs = {})
%add_50 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_47, %view_95), kwargs = {})
%_param_constant105 : [#users=1] = get_attr[target=_param_constant105]
%_param_constant106 : [#users=1] = get_attr[target=_param_constant106]
%native_layer_norm_18 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_50, [1024], %_param_constant105, %_param_constant106, 1e-05), kwargs = {})
%getitem_54 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_18, 0), kwargs = {})
%getitem_55 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_18, 1), kwargs = {})
%getitem_56 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_18, 2), kwargs = {})
%_param_constant107 : [#users=1] = get_attr[target=_param_constant107]
%t_34 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant107,), kwargs = {})
%view_96 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_54, [128, 1024]), kwargs = {})
%_param_constant108 : [#users=1] = get_attr[target=_param_constant108]
%addmm_34 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant108, %view_96, %t_34), kwargs = {})
%view_97 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_34, [1, 128, 4096]), kwargs = {})
%mul_75 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_97, 0.5), kwargs = {})
%mul_76 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_97, 0.79788456), kwargs = {})
%mul_77 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_97, 0.044715), kwargs = {})
%mul_78 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_77, %view_97), kwargs = {})
%add_51 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_78, 1), kwargs = {})
%mul_79 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_76, %add_51), kwargs = {})
%tanh_8 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_79,), kwargs = {})
%detach_17 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_8,), kwargs = {})
%add_52 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_8, 1.0), kwargs = {})
%mul_80 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_75, %add_52), kwargs = {})
%_param_constant109 : [#users=1] = get_attr[target=_param_constant109]
%t_35 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant109,), kwargs = {})
%view_98 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_80, [128, 4096]), kwargs = {})
%_param_constant110 : [#users=1] = get_attr[target=_param_constant110]
%addmm_35 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant110, %view_98, %t_35), kwargs = {})
%view_99 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_35, [1, 128, 1024]), kwargs = {})
%add_53 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_50, %view_99), kwargs = {})
%_param_constant111 : [#users=1] = get_attr[target=_param_constant111]
%_param_constant112 : [#users=1] = get_attr[target=_param_constant112]
%native_layer_norm_19 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_53, [1024], %_param_constant111, %_param_constant112, 1e-05), kwargs = {})
%getitem_57 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_19, 0), kwargs = {})
%getitem_58 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_19, 1), kwargs = {})
%getitem_59 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_19, 2), kwargs = {})
%_param_constant113 : [#users=1] = get_attr[target=_param_constant113]
%t_36 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant113,), kwargs = {})
%view_100 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_57, [128, 1024]), kwargs = {})
%_param_constant114 : [#users=1] = get_attr[target=_param_constant114]
%addmm_36 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant114, %view_100, %t_36), kwargs = {})
%view_101 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_36, [1, 128, 3072]), kwargs = {})
%_reshape_alias_36 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_101, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_28 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_36, -1, 0, 64), kwargs = {})
%slice_29 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_36, -1, 64, 128), kwargs = {})
%slice_30 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_36, -1, 128, 192), kwargs = {})
%transpose_18 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_28, 1, 2), kwargs = {})
%_reshape_alias_37 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_18, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_18 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_29, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_38 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_18, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_18 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_37, %_reshape_alias_38), kwargs = {})
%mul_81 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_18, 0.013888888888888888), kwargs = {})
%_tensor_constant53 : [#users=1] = get_attr[target=_tensor_constant53]
%add_54 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_81, %_tensor_constant53), kwargs = {})
%view_102 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_54, [-1, 16, 128, 128]), kwargs = {})
%mul_82 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_102, 9), kwargs = {})
%_tensor_constant26_9 : [#users=1] = get_attr[target=_tensor_constant26]
%add_55 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_82, %_tensor_constant26_9), kwargs = {})
%_tensor_constant54 : [#users=1] = get_attr[target=_tensor_constant54]
%maximum_9 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_55, %_tensor_constant54), kwargs = {})
%_softmax_9 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_9, -1, False), kwargs = {})
%detach_18 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_9,), kwargs = {})
%_tensor_constant55 : [#users=1] = get_attr[target=_tensor_constant55]
%mul_83 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_9, %_tensor_constant55), kwargs = {})
%view_103 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_83, [16, 128, 128]), kwargs = {})
%transpose_19 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_30, 1, 2), kwargs = {})
%_reshape_alias_39 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_19, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_19 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_103, %_reshape_alias_39), kwargs = {})
%view_104 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_19, [1, 16, 128, 64]), kwargs = {})
%permute_19 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_104, [0, 2, 1, 3]), kwargs = {})
%clone_9 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_19,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_9 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_9, [1, 128, 1024]), kwargs = {})
%_param_constant115 : [#users=1] = get_attr[target=_param_constant115]
%t_37 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant115,), kwargs = {})
%view_105 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_9, [128, 1024]), kwargs = {})
%_param_constant116 : [#users=1] = get_attr[target=_param_constant116]
%addmm_37 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant116, %view_105, %t_37), kwargs = {})
%view_106 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_37, [1, 128, 1024]), kwargs = {})
%add_56 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_53, %view_106), kwargs = {})
%_param_constant117 : [#users=1] = get_attr[target=_param_constant117]
%_param_constant118 : [#users=1] = get_attr[target=_param_constant118]
%native_layer_norm_20 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_56, [1024], %_param_constant117, %_param_constant118, 1e-05), kwargs = {})
%getitem_60 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_20, 0), kwargs = {})
%getitem_61 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_20, 1), kwargs = {})
%getitem_62 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_20, 2), kwargs = {})
%_param_constant119 : [#users=1] = get_attr[target=_param_constant119]
%t_38 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant119,), kwargs = {})
%view_107 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_60, [128, 1024]), kwargs = {})
%_param_constant120 : [#users=1] = get_attr[target=_param_constant120]
%addmm_38 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant120, %view_107, %t_38), kwargs = {})
%view_108 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_38, [1, 128, 4096]), kwargs = {})
%mul_84 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_108, 0.5), kwargs = {})
%mul_85 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_108, 0.79788456), kwargs = {})
%mul_86 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_108, 0.044715), kwargs = {})
%mul_87 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_86, %view_108), kwargs = {})
%add_57 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_87, 1), kwargs = {})
%mul_88 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_85, %add_57), kwargs = {})
%tanh_9 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_88,), kwargs = {})
%detach_19 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_9,), kwargs = {})
%add_58 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_9, 1.0), kwargs = {})
%mul_89 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_84, %add_58), kwargs = {})
%_param_constant121 : [#users=1] = get_attr[target=_param_constant121]
%t_39 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant121,), kwargs = {})
%view_109 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_89, [128, 4096]), kwargs = {})
%_param_constant122 : [#users=1] = get_attr[target=_param_constant122]
%addmm_39 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant122, %view_109, %t_39), kwargs = {})
%view_110 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_39, [1, 128, 1024]), kwargs = {})
%add_59 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_56, %view_110), kwargs = {})
%_param_constant123 : [#users=1] = get_attr[target=_param_constant123]
%_param_constant124 : [#users=1] = get_attr[target=_param_constant124]
%native_layer_norm_21 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_59, [1024], %_param_constant123, %_param_constant124, 1e-05), kwargs = {})
%getitem_63 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_21, 0), kwargs = {})
%getitem_64 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_21, 1), kwargs = {})
%getitem_65 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_21, 2), kwargs = {})
%_param_constant125 : [#users=1] = get_attr[target=_param_constant125]
%t_40 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant125,), kwargs = {})
%view_111 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_63, [128, 1024]), kwargs = {})
%_param_constant126 : [#users=1] = get_attr[target=_param_constant126]
%addmm_40 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant126, %view_111, %t_40), kwargs = {})
%view_112 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_40, [1, 128, 3072]), kwargs = {})
%_reshape_alias_40 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_112, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_31 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_40, -1, 0, 64), kwargs = {})
%slice_32 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_40, -1, 64, 128), kwargs = {})
%slice_33 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_40, -1, 128, 192), kwargs = {})
%transpose_20 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_31, 1, 2), kwargs = {})
%_reshape_alias_41 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_20, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_20 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_32, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_42 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_20, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_20 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_41, %_reshape_alias_42), kwargs = {})
%mul_90 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_20, 0.0125), kwargs = {})
%_tensor_constant56 : [#users=1] = get_attr[target=_tensor_constant56]
%add_60 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_90, %_tensor_constant56), kwargs = {})
%view_113 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_60, [-1, 16, 128, 128]), kwargs = {})
%mul_91 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_113, 10), kwargs = {})
%_tensor_constant26_10 : [#users=1] = get_attr[target=_tensor_constant26]
%add_61 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_91, %_tensor_constant26_10), kwargs = {})
%_tensor_constant57 : [#users=1] = get_attr[target=_tensor_constant57]
%maximum_10 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_61, %_tensor_constant57), kwargs = {})
%_softmax_10 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_10, -1, False), kwargs = {})
%detach_20 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_10,), kwargs = {})
%_tensor_constant58 : [#users=1] = get_attr[target=_tensor_constant58]
%mul_92 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_10, %_tensor_constant58), kwargs = {})
%view_114 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_92, [16, 128, 128]), kwargs = {})
%transpose_21 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_33, 1, 2), kwargs = {})
%_reshape_alias_43 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_21, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_21 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_114, %_reshape_alias_43), kwargs = {})
%view_115 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_21, [1, 16, 128, 64]), kwargs = {})
%permute_21 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_115, [0, 2, 1, 3]), kwargs = {})
%clone_10 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_21,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_10 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_10, [1, 128, 1024]), kwargs = {})
%_param_constant127 : [#users=1] = get_attr[target=_param_constant127]
%t_41 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant127,), kwargs = {})
%view_116 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_10, [128, 1024]), kwargs = {})
%_param_constant128 : [#users=1] = get_attr[target=_param_constant128]
%addmm_41 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant128, %view_116, %t_41), kwargs = {})
%view_117 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_41, [1, 128, 1024]), kwargs = {})
%add_62 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_59, %view_117), kwargs = {})
%_param_constant129 : [#users=1] = get_attr[target=_param_constant129]
%_param_constant130 : [#users=1] = get_attr[target=_param_constant130]
%native_layer_norm_22 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_62, [1024], %_param_constant129, %_param_constant130, 1e-05), kwargs = {})
%getitem_66 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_22, 0), kwargs = {})
%getitem_67 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_22, 1), kwargs = {})
%getitem_68 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_22, 2), kwargs = {})
%_param_constant131 : [#users=1] = get_attr[target=_param_constant131]
%t_42 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant131,), kwargs = {})
%view_118 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_66, [128, 1024]), kwargs = {})
%_param_constant132 : [#users=1] = get_attr[target=_param_constant132]
%addmm_42 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant132, %view_118, %t_42), kwargs = {})
%view_119 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_42, [1, 128, 4096]), kwargs = {})
%mul_93 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_119, 0.5), kwargs = {})
%mul_94 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_119, 0.79788456), kwargs = {})
%mul_95 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_119, 0.044715), kwargs = {})
%mul_96 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_95, %view_119), kwargs = {})
%add_63 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_96, 1), kwargs = {})
%mul_97 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_94, %add_63), kwargs = {})
%tanh_10 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_97,), kwargs = {})
%detach_21 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_10,), kwargs = {})
%add_64 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_10, 1.0), kwargs = {})
%mul_98 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_93, %add_64), kwargs = {})
%_param_constant133 : [#users=1] = get_attr[target=_param_constant133]
%t_43 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant133,), kwargs = {})
%view_120 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_98, [128, 4096]), kwargs = {})
%_param_constant134 : [#users=1] = get_attr[target=_param_constant134]
%addmm_43 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant134, %view_120, %t_43), kwargs = {})
%view_121 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_43, [1, 128, 1024]), kwargs = {})
%add_65 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_62, %view_121), kwargs = {})
%_param_constant135 : [#users=1] = get_attr[target=_param_constant135]
%_param_constant136 : [#users=1] = get_attr[target=_param_constant136]
%native_layer_norm_23 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_65, [1024], %_param_constant135, %_param_constant136, 1e-05), kwargs = {})
%getitem_69 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_23, 0), kwargs = {})
%getitem_70 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_23, 1), kwargs = {})
%getitem_71 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_23, 2), kwargs = {})
%_param_constant137 : [#users=1] = get_attr[target=_param_constant137]
%t_44 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant137,), kwargs = {})
%view_122 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_69, [128, 1024]), kwargs = {})
%_param_constant138 : [#users=1] = get_attr[target=_param_constant138]
%addmm_44 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant138, %view_122, %t_44), kwargs = {})
%view_123 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_44, [1, 128, 3072]), kwargs = {})
%_reshape_alias_44 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_123, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_34 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_44, -1, 0, 64), kwargs = {})
%slice_35 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_44, -1, 64, 128), kwargs = {})
%slice_36 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_44, -1, 128, 192), kwargs = {})
%transpose_22 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_34, 1, 2), kwargs = {})
%_reshape_alias_45 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_22, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_22 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_35, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_46 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_22, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_22 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_45, %_reshape_alias_46), kwargs = {})
%mul_99 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_22, 0.011363636363636364), kwargs = {})
%_tensor_constant59 : [#users=1] = get_attr[target=_tensor_constant59]
%add_66 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_99, %_tensor_constant59), kwargs = {})
%view_124 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_66, [-1, 16, 128, 128]), kwargs = {})
%mul_100 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_124, 11), kwargs = {})
%_tensor_constant26_11 : [#users=1] = get_attr[target=_tensor_constant26]
%add_67 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_100, %_tensor_constant26_11), kwargs = {})
%_tensor_constant60 : [#users=1] = get_attr[target=_tensor_constant60]
%maximum_11 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_67, %_tensor_constant60), kwargs = {})
%_softmax_11 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_11, -1, False), kwargs = {})
%detach_22 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_11,), kwargs = {})
%_tensor_constant61 : [#users=1] = get_attr[target=_tensor_constant61]
%mul_101 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_11, %_tensor_constant61), kwargs = {})
%view_125 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_101, [16, 128, 128]), kwargs = {})
%transpose_23 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_36, 1, 2), kwargs = {})
%_reshape_alias_47 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_23, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_23 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_125, %_reshape_alias_47), kwargs = {})
%view_126 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_23, [1, 16, 128, 64]), kwargs = {})
%permute_23 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_126, [0, 2, 1, 3]), kwargs = {})
%clone_11 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_23,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_11 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_11, [1, 128, 1024]), kwargs = {})
%_param_constant139 : [#users=1] = get_attr[target=_param_constant139]
%t_45 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant139,), kwargs = {})
%view_127 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_11, [128, 1024]), kwargs = {})
%_param_constant140 : [#users=1] = get_attr[target=_param_constant140]
%addmm_45 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant140, %view_127, %t_45), kwargs = {})
%view_128 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_45, [1, 128, 1024]), kwargs = {})
%add_68 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_65, %view_128), kwargs = {})
%_param_constant141 : [#users=1] = get_attr[target=_param_constant141]
%_param_constant142 : [#users=1] = get_attr[target=_param_constant142]
%native_layer_norm_24 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_68, [1024], %_param_constant141, %_param_constant142, 1e-05), kwargs = {})
%getitem_72 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_24, 0), kwargs = {})
%getitem_73 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_24, 1), kwargs = {})
%getitem_74 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_24, 2), kwargs = {})
%_param_constant143 : [#users=1] = get_attr[target=_param_constant143]
%t_46 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant143,), kwargs = {})
%view_129 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_72, [128, 1024]), kwargs = {})
%_param_constant144 : [#users=1] = get_attr[target=_param_constant144]
%addmm_46 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant144, %view_129, %t_46), kwargs = {})
%view_130 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_46, [1, 128, 4096]), kwargs = {})
%mul_102 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_130, 0.5), kwargs = {})
%mul_103 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_130, 0.79788456), kwargs = {})
%mul_104 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_130, 0.044715), kwargs = {})
%mul_105 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_104, %view_130), kwargs = {})
%add_69 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_105, 1), kwargs = {})
%mul_106 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_103, %add_69), kwargs = {})
%tanh_11 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_106,), kwargs = {})
%detach_23 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_11,), kwargs = {})
%add_70 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_11, 1.0), kwargs = {})
%mul_107 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_102, %add_70), kwargs = {})
%_param_constant145 : [#users=1] = get_attr[target=_param_constant145]
%t_47 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant145,), kwargs = {})
%view_131 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_107, [128, 4096]), kwargs = {})
%_param_constant146 : [#users=1] = get_attr[target=_param_constant146]
%addmm_47 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant146, %view_131, %t_47), kwargs = {})
%view_132 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_47, [1, 128, 1024]), kwargs = {})
%add_71 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_68, %view_132), kwargs = {})
%_param_constant147 : [#users=1] = get_attr[target=_param_constant147]
%_param_constant148 : [#users=1] = get_attr[target=_param_constant148]
%native_layer_norm_25 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_71, [1024], %_param_constant147, %_param_constant148, 1e-05), kwargs = {})
%getitem_75 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_25, 0), kwargs = {})
%getitem_76 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_25, 1), kwargs = {})
%getitem_77 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_25, 2), kwargs = {})
%_param_constant149 : [#users=1] = get_attr[target=_param_constant149]
%t_48 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant149,), kwargs = {})
%view_133 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_75, [128, 1024]), kwargs = {})
%_param_constant150 : [#users=1] = get_attr[target=_param_constant150]
%addmm_48 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant150, %view_133, %t_48), kwargs = {})
%view_134 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_48, [1, 128, 3072]), kwargs = {})
%_reshape_alias_48 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_134, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_37 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_48, -1, 0, 64), kwargs = {})
%slice_38 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_48, -1, 64, 128), kwargs = {})
%slice_39 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_48, -1, 128, 192), kwargs = {})
%transpose_24 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_37, 1, 2), kwargs = {})
%_reshape_alias_49 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_24, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_24 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_38, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_50 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_24, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_24 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_49, %_reshape_alias_50), kwargs = {})
%mul_108 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_24, 0.010416666666666666), kwargs = {})
%_tensor_constant62 : [#users=1] = get_attr[target=_tensor_constant62]
%add_72 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_108, %_tensor_constant62), kwargs = {})
%view_135 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_72, [-1, 16, 128, 128]), kwargs = {})
%mul_109 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_135, 12), kwargs = {})
%_tensor_constant26_12 : [#users=1] = get_attr[target=_tensor_constant26]
%add_73 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_109, %_tensor_constant26_12), kwargs = {})
%_tensor_constant63 : [#users=1] = get_attr[target=_tensor_constant63]
%maximum_12 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_73, %_tensor_constant63), kwargs = {})
%_softmax_12 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_12, -1, False), kwargs = {})
%detach_24 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_12,), kwargs = {})
%_tensor_constant64 : [#users=1] = get_attr[target=_tensor_constant64]
%mul_110 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_12, %_tensor_constant64), kwargs = {})
%view_136 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_110, [16, 128, 128]), kwargs = {})
%transpose_25 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_39, 1, 2), kwargs = {})
%_reshape_alias_51 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_25, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_25 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_136, %_reshape_alias_51), kwargs = {})
%view_137 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_25, [1, 16, 128, 64]), kwargs = {})
%permute_25 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_137, [0, 2, 1, 3]), kwargs = {})
%clone_12 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_25,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_12 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_12, [1, 128, 1024]), kwargs = {})
%_param_constant151 : [#users=1] = get_attr[target=_param_constant151]
%t_49 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant151,), kwargs = {})
%view_138 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_12, [128, 1024]), kwargs = {})
%_param_constant152 : [#users=1] = get_attr[target=_param_constant152]
%addmm_49 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant152, %view_138, %t_49), kwargs = {})
%view_139 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_49, [1, 128, 1024]), kwargs = {})
%add_74 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_71, %view_139), kwargs = {})
%_param_constant153 : [#users=1] = get_attr[target=_param_constant153]
%_param_constant154 : [#users=1] = get_attr[target=_param_constant154]
%native_layer_norm_26 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_74, [1024], %_param_constant153, %_param_constant154, 1e-05), kwargs = {})
%getitem_78 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_26, 0), kwargs = {})
%getitem_79 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_26, 1), kwargs = {})
%getitem_80 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_26, 2), kwargs = {})
%_param_constant155 : [#users=1] = get_attr[target=_param_constant155]
%t_50 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant155,), kwargs = {})
%view_140 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_78, [128, 1024]), kwargs = {})
%_param_constant156 : [#users=1] = get_attr[target=_param_constant156]
%addmm_50 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant156, %view_140, %t_50), kwargs = {})
%view_141 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_50, [1, 128, 4096]), kwargs = {})
%mul_111 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_141, 0.5), kwargs = {})
%mul_112 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_141, 0.79788456), kwargs = {})
%mul_113 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_141, 0.044715), kwargs = {})
%mul_114 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_113, %view_141), kwargs = {})
%add_75 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_114, 1), kwargs = {})
%mul_115 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_112, %add_75), kwargs = {})
%tanh_12 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_115,), kwargs = {})
%detach_25 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_12,), kwargs = {})
%add_76 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_12, 1.0), kwargs = {})
%mul_116 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_111, %add_76), kwargs = {})
%_param_constant157 : [#users=1] = get_attr[target=_param_constant157]
%t_51 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant157,), kwargs = {})
%view_142 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_116, [128, 4096]), kwargs = {})
%_param_constant158 : [#users=1] = get_attr[target=_param_constant158]
%addmm_51 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant158, %view_142, %t_51), kwargs = {})
%view_143 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_51, [1, 128, 1024]), kwargs = {})
%add_77 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_74, %view_143), kwargs = {})
%_param_constant159 : [#users=1] = get_attr[target=_param_constant159]
%_param_constant160 : [#users=1] = get_attr[target=_param_constant160]
%native_layer_norm_27 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_77, [1024], %_param_constant159, %_param_constant160, 1e-05), kwargs = {})
%getitem_81 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_27, 0), kwargs = {})
%getitem_82 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_27, 1), kwargs = {})
%getitem_83 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_27, 2), kwargs = {})
%_param_constant161 : [#users=1] = get_attr[target=_param_constant161]
%t_52 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant161,), kwargs = {})
%view_144 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_81, [128, 1024]), kwargs = {})
%_param_constant162 : [#users=1] = get_attr[target=_param_constant162]
%addmm_52 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant162, %view_144, %t_52), kwargs = {})
%view_145 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_52, [1, 128, 3072]), kwargs = {})
%_reshape_alias_52 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_145, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_40 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_52, -1, 0, 64), kwargs = {})
%slice_41 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_52, -1, 64, 128), kwargs = {})
%slice_42 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_52, -1, 128, 192), kwargs = {})
%transpose_26 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_40, 1, 2), kwargs = {})
%_reshape_alias_53 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_26, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_26 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_41, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_54 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_26, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_26 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_53, %_reshape_alias_54), kwargs = {})
%mul_117 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_26, 0.009615384615384616), kwargs = {})
%_tensor_constant65 : [#users=1] = get_attr[target=_tensor_constant65]
%add_78 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_117, %_tensor_constant65), kwargs = {})
%view_146 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_78, [-1, 16, 128, 128]), kwargs = {})
%mul_118 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_146, 13), kwargs = {})
%_tensor_constant26_13 : [#users=1] = get_attr[target=_tensor_constant26]
%add_79 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_118, %_tensor_constant26_13), kwargs = {})
%_tensor_constant66 : [#users=1] = get_attr[target=_tensor_constant66]
%maximum_13 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_79, %_tensor_constant66), kwargs = {})
%_softmax_13 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_13, -1, False), kwargs = {})
%detach_26 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_13,), kwargs = {})
%_tensor_constant67 : [#users=1] = get_attr[target=_tensor_constant67]
%mul_119 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_13, %_tensor_constant67), kwargs = {})
%view_147 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_119, [16, 128, 128]), kwargs = {})
%transpose_27 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_42, 1, 2), kwargs = {})
%_reshape_alias_55 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_27, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_27 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_147, %_reshape_alias_55), kwargs = {})
%view_148 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_27, [1, 16, 128, 64]), kwargs = {})
%permute_27 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_148, [0, 2, 1, 3]), kwargs = {})
%clone_13 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_27,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_13 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_13, [1, 128, 1024]), kwargs = {})
%_param_constant163 : [#users=1] = get_attr[target=_param_constant163]
%t_53 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant163,), kwargs = {})
%view_149 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_13, [128, 1024]), kwargs = {})
%_param_constant164 : [#users=1] = get_attr[target=_param_constant164]
%addmm_53 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant164, %view_149, %t_53), kwargs = {})
%view_150 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_53, [1, 128, 1024]), kwargs = {})
%add_80 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_77, %view_150), kwargs = {})
%_param_constant165 : [#users=1] = get_attr[target=_param_constant165]
%_param_constant166 : [#users=1] = get_attr[target=_param_constant166]
%native_layer_norm_28 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_80, [1024], %_param_constant165, %_param_constant166, 1e-05), kwargs = {})
%getitem_84 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_28, 0), kwargs = {})
%getitem_85 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_28, 1), kwargs = {})
%getitem_86 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_28, 2), kwargs = {})
%_param_constant167 : [#users=1] = get_attr[target=_param_constant167]
%t_54 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant167,), kwargs = {})
%view_151 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_84, [128, 1024]), kwargs = {})
%_param_constant168 : [#users=1] = get_attr[target=_param_constant168]
%addmm_54 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant168, %view_151, %t_54), kwargs = {})
%view_152 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_54, [1, 128, 4096]), kwargs = {})
%mul_120 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_152, 0.5), kwargs = {})
%mul_121 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_152, 0.79788456), kwargs = {})
%mul_122 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_152, 0.044715), kwargs = {})
%mul_123 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_122, %view_152), kwargs = {})
%add_81 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_123, 1), kwargs = {})
%mul_124 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_121, %add_81), kwargs = {})
%tanh_13 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_124,), kwargs = {})
%detach_27 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_13,), kwargs = {})
%add_82 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_13, 1.0), kwargs = {})
%mul_125 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_120, %add_82), kwargs = {})
%_param_constant169 : [#users=1] = get_attr[target=_param_constant169]
%t_55 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant169,), kwargs = {})
%view_153 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_125, [128, 4096]), kwargs = {})
%_param_constant170 : [#users=1] = get_attr[target=_param_constant170]
%addmm_55 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant170, %view_153, %t_55), kwargs = {})
%view_154 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_55, [1, 128, 1024]), kwargs = {})
%add_83 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_80, %view_154), kwargs = {})
%_param_constant171 : [#users=1] = get_attr[target=_param_constant171]
%_param_constant172 : [#users=1] = get_attr[target=_param_constant172]
%native_layer_norm_29 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_83, [1024], %_param_constant171, %_param_constant172, 1e-05), kwargs = {})
%getitem_87 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_29, 0), kwargs = {})
%getitem_88 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_29, 1), kwargs = {})
%getitem_89 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_29, 2), kwargs = {})
%_param_constant173 : [#users=1] = get_attr[target=_param_constant173]
%t_56 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant173,), kwargs = {})
%view_155 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_87, [128, 1024]), kwargs = {})
%_param_constant174 : [#users=1] = get_attr[target=_param_constant174]
%addmm_56 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant174, %view_155, %t_56), kwargs = {})
%view_156 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_56, [1, 128, 3072]), kwargs = {})
%_reshape_alias_56 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_156, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_43 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_56, -1, 0, 64), kwargs = {})
%slice_44 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_56, -1, 64, 128), kwargs = {})
%slice_45 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_56, -1, 128, 192), kwargs = {})
%transpose_28 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_43, 1, 2), kwargs = {})
%_reshape_alias_57 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_28, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_28 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_44, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_58 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_28, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_28 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_57, %_reshape_alias_58), kwargs = {})
%mul_126 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_28, 0.008928571428571428), kwargs = {})
%_tensor_constant68 : [#users=1] = get_attr[target=_tensor_constant68]
%add_84 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_126, %_tensor_constant68), kwargs = {})
%view_157 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_84, [-1, 16, 128, 128]), kwargs = {})
%mul_127 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_157, 14), kwargs = {})
%_tensor_constant26_14 : [#users=1] = get_attr[target=_tensor_constant26]
%add_85 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_127, %_tensor_constant26_14), kwargs = {})
%_tensor_constant69 : [#users=1] = get_attr[target=_tensor_constant69]
%maximum_14 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_85, %_tensor_constant69), kwargs = {})
%_softmax_14 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_14, -1, False), kwargs = {})
%detach_28 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_14,), kwargs = {})
%_tensor_constant70 : [#users=1] = get_attr[target=_tensor_constant70]
%mul_128 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_14, %_tensor_constant70), kwargs = {})
%view_158 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_128, [16, 128, 128]), kwargs = {})
%transpose_29 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_45, 1, 2), kwargs = {})
%_reshape_alias_59 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_29, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_29 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_158, %_reshape_alias_59), kwargs = {})
%view_159 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_29, [1, 16, 128, 64]), kwargs = {})
%permute_29 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_159, [0, 2, 1, 3]), kwargs = {})
%clone_14 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_29,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_14 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_14, [1, 128, 1024]), kwargs = {})
%_param_constant175 : [#users=1] = get_attr[target=_param_constant175]
%t_57 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant175,), kwargs = {})
%view_160 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_14, [128, 1024]), kwargs = {})
%_param_constant176 : [#users=1] = get_attr[target=_param_constant176]
%addmm_57 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant176, %view_160, %t_57), kwargs = {})
%view_161 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_57, [1, 128, 1024]), kwargs = {})
%add_86 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_83, %view_161), kwargs = {})
%_param_constant177 : [#users=1] = get_attr[target=_param_constant177]
%_param_constant178 : [#users=1] = get_attr[target=_param_constant178]
%native_layer_norm_30 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_86, [1024], %_param_constant177, %_param_constant178, 1e-05), kwargs = {})
%getitem_90 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_30, 0), kwargs = {})
%getitem_91 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_30, 1), kwargs = {})
%getitem_92 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_30, 2), kwargs = {})
%_param_constant179 : [#users=1] = get_attr[target=_param_constant179]
%t_58 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant179,), kwargs = {})
%view_162 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_90, [128, 1024]), kwargs = {})
%_param_constant180 : [#users=1] = get_attr[target=_param_constant180]
%addmm_58 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant180, %view_162, %t_58), kwargs = {})
%view_163 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_58, [1, 128, 4096]), kwargs = {})
%mul_129 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_163, 0.5), kwargs = {})
%mul_130 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_163, 0.79788456), kwargs = {})
%mul_131 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_163, 0.044715), kwargs = {})
%mul_132 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_131, %view_163), kwargs = {})
%add_87 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_132, 1), kwargs = {})
%mul_133 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_130, %add_87), kwargs = {})
%tanh_14 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_133,), kwargs = {})
%detach_29 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_14,), kwargs = {})
%add_88 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_14, 1.0), kwargs = {})
%mul_134 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_129, %add_88), kwargs = {})
%_param_constant181 : [#users=1] = get_attr[target=_param_constant181]
%t_59 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant181,), kwargs = {})
%view_164 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_134, [128, 4096]), kwargs = {})
%_param_constant182 : [#users=1] = get_attr[target=_param_constant182]
%addmm_59 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant182, %view_164, %t_59), kwargs = {})
%view_165 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_59, [1, 128, 1024]), kwargs = {})
%add_89 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_86, %view_165), kwargs = {})
%_param_constant183 : [#users=1] = get_attr[target=_param_constant183]
%_param_constant184 : [#users=1] = get_attr[target=_param_constant184]
%native_layer_norm_31 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_89, [1024], %_param_constant183, %_param_constant184, 1e-05), kwargs = {})
%getitem_93 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_31, 0), kwargs = {})
%getitem_94 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_31, 1), kwargs = {})
%getitem_95 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_31, 2), kwargs = {})
%_param_constant185 : [#users=1] = get_attr[target=_param_constant185]
%t_60 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant185,), kwargs = {})
%view_166 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_93, [128, 1024]), kwargs = {})
%_param_constant186 : [#users=1] = get_attr[target=_param_constant186]
%addmm_60 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant186, %view_166, %t_60), kwargs = {})
%view_167 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_60, [1, 128, 3072]), kwargs = {})
%_reshape_alias_60 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_167, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_46 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_60, -1, 0, 64), kwargs = {})
%slice_47 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_60, -1, 64, 128), kwargs = {})
%slice_48 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_60, -1, 128, 192), kwargs = {})
%transpose_30 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_46, 1, 2), kwargs = {})
%_reshape_alias_61 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_30, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_30 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_47, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_62 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_30, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_30 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_61, %_reshape_alias_62), kwargs = {})
%mul_135 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_30, 0.008333333333333333), kwargs = {})
%_tensor_constant71 : [#users=1] = get_attr[target=_tensor_constant71]
%add_90 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_135, %_tensor_constant71), kwargs = {})
%view_168 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_90, [-1, 16, 128, 128]), kwargs = {})
%mul_136 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_168, 15), kwargs = {})
%_tensor_constant26_15 : [#users=1] = get_attr[target=_tensor_constant26]
%add_91 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_136, %_tensor_constant26_15), kwargs = {})
%_tensor_constant72 : [#users=1] = get_attr[target=_tensor_constant72]
%maximum_15 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_91, %_tensor_constant72), kwargs = {})
%_softmax_15 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_15, -1, False), kwargs = {})
%detach_30 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_15,), kwargs = {})
%_tensor_constant73 : [#users=1] = get_attr[target=_tensor_constant73]
%mul_137 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_15, %_tensor_constant73), kwargs = {})
%view_169 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_137, [16, 128, 128]), kwargs = {})
%transpose_31 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_48, 1, 2), kwargs = {})
%_reshape_alias_63 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_31, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_31 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_169, %_reshape_alias_63), kwargs = {})
%view_170 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_31, [1, 16, 128, 64]), kwargs = {})
%permute_31 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_170, [0, 2, 1, 3]), kwargs = {})
%clone_15 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_31,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_15 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_15, [1, 128, 1024]), kwargs = {})
%_param_constant187 : [#users=1] = get_attr[target=_param_constant187]
%t_61 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant187,), kwargs = {})
%view_171 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_15, [128, 1024]), kwargs = {})
%_param_constant188 : [#users=1] = get_attr[target=_param_constant188]
%addmm_61 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant188, %view_171, %t_61), kwargs = {})
%view_172 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_61, [1, 128, 1024]), kwargs = {})
%add_92 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_89, %view_172), kwargs = {})
%_param_constant189 : [#users=1] = get_attr[target=_param_constant189]
%_param_constant190 : [#users=1] = get_attr[target=_param_constant190]
%native_layer_norm_32 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_92, [1024], %_param_constant189, %_param_constant190, 1e-05), kwargs = {})
%getitem_96 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_32, 0), kwargs = {})
%getitem_97 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_32, 1), kwargs = {})
%getitem_98 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_32, 2), kwargs = {})
%_param_constant191 : [#users=1] = get_attr[target=_param_constant191]
%t_62 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant191,), kwargs = {})
%view_173 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_96, [128, 1024]), kwargs = {})
%_param_constant192 : [#users=1] = get_attr[target=_param_constant192]
%addmm_62 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant192, %view_173, %t_62), kwargs = {})
%view_174 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_62, [1, 128, 4096]), kwargs = {})
%mul_138 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_174, 0.5), kwargs = {})
%mul_139 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_174, 0.79788456), kwargs = {})
%mul_140 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_174, 0.044715), kwargs = {})
%mul_141 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_140, %view_174), kwargs = {})
%add_93 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_141, 1), kwargs = {})
%mul_142 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_139, %add_93), kwargs = {})
%tanh_15 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_142,), kwargs = {})
%detach_31 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_15,), kwargs = {})
%add_94 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_15, 1.0), kwargs = {})
%mul_143 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_138, %add_94), kwargs = {})
%_param_constant193 : [#users=1] = get_attr[target=_param_constant193]
%t_63 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant193,), kwargs = {})
%view_175 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_143, [128, 4096]), kwargs = {})
%_param_constant194 : [#users=1] = get_attr[target=_param_constant194]
%addmm_63 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant194, %view_175, %t_63), kwargs = {})
%view_176 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_63, [1, 128, 1024]), kwargs = {})
%add_95 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_92, %view_176), kwargs = {})
%_param_constant195 : [#users=1] = get_attr[target=_param_constant195]
%_param_constant196 : [#users=1] = get_attr[target=_param_constant196]
%native_layer_norm_33 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_95, [1024], %_param_constant195, %_param_constant196, 1e-05), kwargs = {})
%getitem_99 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_33, 0), kwargs = {})
%getitem_100 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_33, 1), kwargs = {})
%getitem_101 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_33, 2), kwargs = {})
%_param_constant197 : [#users=1] = get_attr[target=_param_constant197]
%t_64 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant197,), kwargs = {})
%view_177 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_99, [128, 1024]), kwargs = {})
%_param_constant198 : [#users=1] = get_attr[target=_param_constant198]
%addmm_64 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant198, %view_177, %t_64), kwargs = {})
%view_178 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_64, [1, 128, 3072]), kwargs = {})
%_reshape_alias_64 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_178, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_49 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_64, -1, 0, 64), kwargs = {})
%slice_50 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_64, -1, 64, 128), kwargs = {})
%slice_51 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_64, -1, 128, 192), kwargs = {})
%transpose_32 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_49, 1, 2), kwargs = {})
%_reshape_alias_65 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_32, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_32 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_50, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_66 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_32, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_32 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_65, %_reshape_alias_66), kwargs = {})
%mul_144 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_32, 0.0078125), kwargs = {})
%_tensor_constant74 : [#users=1] = get_attr[target=_tensor_constant74]
%add_96 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_144, %_tensor_constant74), kwargs = {})
%view_179 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_96, [-1, 16, 128, 128]), kwargs = {})
%mul_145 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_179, 16), kwargs = {})
%_tensor_constant26_16 : [#users=1] = get_attr[target=_tensor_constant26]
%add_97 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_145, %_tensor_constant26_16), kwargs = {})
%_tensor_constant75 : [#users=1] = get_attr[target=_tensor_constant75]
%maximum_16 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_97, %_tensor_constant75), kwargs = {})
%_softmax_16 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_16, -1, False), kwargs = {})
%detach_32 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_16,), kwargs = {})
%_tensor_constant76 : [#users=1] = get_attr[target=_tensor_constant76]
%mul_146 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_16, %_tensor_constant76), kwargs = {})
%view_180 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_146, [16, 128, 128]), kwargs = {})
%transpose_33 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_51, 1, 2), kwargs = {})
%_reshape_alias_67 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_33, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_33 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_180, %_reshape_alias_67), kwargs = {})
%view_181 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_33, [1, 16, 128, 64]), kwargs = {})
%permute_33 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_181, [0, 2, 1, 3]), kwargs = {})
%clone_16 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_33,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_16 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_16, [1, 128, 1024]), kwargs = {})
%_param_constant199 : [#users=1] = get_attr[target=_param_constant199]
%t_65 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant199,), kwargs = {})
%view_182 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_16, [128, 1024]), kwargs = {})
%_param_constant200 : [#users=1] = get_attr[target=_param_constant200]
%addmm_65 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant200, %view_182, %t_65), kwargs = {})
%view_183 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_65, [1, 128, 1024]), kwargs = {})
%add_98 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_95, %view_183), kwargs = {})
%_param_constant201 : [#users=1] = get_attr[target=_param_constant201]
%_param_constant202 : [#users=1] = get_attr[target=_param_constant202]
%native_layer_norm_34 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_98, [1024], %_param_constant201, %_param_constant202, 1e-05), kwargs = {})
%getitem_102 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_34, 0), kwargs = {})
%getitem_103 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_34, 1), kwargs = {})
%getitem_104 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_34, 2), kwargs = {})
%_param_constant203 : [#users=1] = get_attr[target=_param_constant203]
%t_66 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant203,), kwargs = {})
%view_184 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_102, [128, 1024]), kwargs = {})
%_param_constant204 : [#users=1] = get_attr[target=_param_constant204]
%addmm_66 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant204, %view_184, %t_66), kwargs = {})
%view_185 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_66, [1, 128, 4096]), kwargs = {})
%mul_147 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_185, 0.5), kwargs = {})
%mul_148 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_185, 0.79788456), kwargs = {})
%mul_149 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_185, 0.044715), kwargs = {})
%mul_150 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_149, %view_185), kwargs = {})
%add_99 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_150, 1), kwargs = {})
%mul_151 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_148, %add_99), kwargs = {})
%tanh_16 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_151,), kwargs = {})
%detach_33 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_16,), kwargs = {})
%add_100 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_16, 1.0), kwargs = {})
%mul_152 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_147, %add_100), kwargs = {})
%_param_constant205 : [#users=1] = get_attr[target=_param_constant205]
%t_67 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant205,), kwargs = {})
%view_186 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_152, [128, 4096]), kwargs = {})
%_param_constant206 : [#users=1] = get_attr[target=_param_constant206]
%addmm_67 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant206, %view_186, %t_67), kwargs = {})
%view_187 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_67, [1, 128, 1024]), kwargs = {})
%add_101 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_98, %view_187), kwargs = {})
%_param_constant207 : [#users=1] = get_attr[target=_param_constant207]
%_param_constant208 : [#users=1] = get_attr[target=_param_constant208]
%native_layer_norm_35 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_101, [1024], %_param_constant207, %_param_constant208, 1e-05), kwargs = {})
%getitem_105 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_35, 0), kwargs = {})
%getitem_106 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_35, 1), kwargs = {})
%getitem_107 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_35, 2), kwargs = {})
%_param_constant209 : [#users=1] = get_attr[target=_param_constant209]
%t_68 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant209,), kwargs = {})
%view_188 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_105, [128, 1024]), kwargs = {})
%_param_constant210 : [#users=1] = get_attr[target=_param_constant210]
%addmm_68 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant210, %view_188, %t_68), kwargs = {})
%view_189 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_68, [1, 128, 3072]), kwargs = {})
%_reshape_alias_68 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_189, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_52 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_68, -1, 0, 64), kwargs = {})
%slice_53 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_68, -1, 64, 128), kwargs = {})
%slice_54 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_68, -1, 128, 192), kwargs = {})
%transpose_34 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_52, 1, 2), kwargs = {})
%_reshape_alias_69 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_34, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_34 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_53, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_70 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_34, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_34 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_69, %_reshape_alias_70), kwargs = {})
%mul_153 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_34, 0.007352941176470588), kwargs = {})
%_tensor_constant77 : [#users=1] = get_attr[target=_tensor_constant77]
%add_102 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_153, %_tensor_constant77), kwargs = {})
%view_190 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_102, [-1, 16, 128, 128]), kwargs = {})
%mul_154 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_190, 17), kwargs = {})
%_tensor_constant26_17 : [#users=1] = get_attr[target=_tensor_constant26]
%add_103 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_154, %_tensor_constant26_17), kwargs = {})
%_tensor_constant78 : [#users=1] = get_attr[target=_tensor_constant78]
%maximum_17 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_103, %_tensor_constant78), kwargs = {})
%_softmax_17 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_17, -1, False), kwargs = {})
%detach_34 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_17,), kwargs = {})
%_tensor_constant79 : [#users=1] = get_attr[target=_tensor_constant79]
%mul_155 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_17, %_tensor_constant79), kwargs = {})
%view_191 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_155, [16, 128, 128]), kwargs = {})
%transpose_35 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_54, 1, 2), kwargs = {})
%_reshape_alias_71 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_35, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_35 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_191, %_reshape_alias_71), kwargs = {})
%view_192 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_35, [1, 16, 128, 64]), kwargs = {})
%permute_35 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_192, [0, 2, 1, 3]), kwargs = {})
%clone_17 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_35,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_17 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_17, [1, 128, 1024]), kwargs = {})
%_param_constant211 : [#users=1] = get_attr[target=_param_constant211]
%t_69 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant211,), kwargs = {})
%view_193 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_17, [128, 1024]), kwargs = {})
%_param_constant212 : [#users=1] = get_attr[target=_param_constant212]
%addmm_69 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant212, %view_193, %t_69), kwargs = {})
%view_194 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_69, [1, 128, 1024]), kwargs = {})
%add_104 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_101, %view_194), kwargs = {})
%_param_constant213 : [#users=1] = get_attr[target=_param_constant213]
%_param_constant214 : [#users=1] = get_attr[target=_param_constant214]
%native_layer_norm_36 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_104, [1024], %_param_constant213, %_param_constant214, 1e-05), kwargs = {})
%getitem_108 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_36, 0), kwargs = {})
%getitem_109 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_36, 1), kwargs = {})
%getitem_110 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_36, 2), kwargs = {})
%_param_constant215 : [#users=1] = get_attr[target=_param_constant215]
%t_70 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant215,), kwargs = {})
%view_195 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_108, [128, 1024]), kwargs = {})
%_param_constant216 : [#users=1] = get_attr[target=_param_constant216]
%addmm_70 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant216, %view_195, %t_70), kwargs = {})
%view_196 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_70, [1, 128, 4096]), kwargs = {})
%mul_156 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_196, 0.5), kwargs = {})
%mul_157 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_196, 0.79788456), kwargs = {})
%mul_158 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_196, 0.044715), kwargs = {})
%mul_159 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_158, %view_196), kwargs = {})
%add_105 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_159, 1), kwargs = {})
%mul_160 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_157, %add_105), kwargs = {})
%tanh_17 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_160,), kwargs = {})
%detach_35 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_17,), kwargs = {})
%add_106 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_17, 1.0), kwargs = {})
%mul_161 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_156, %add_106), kwargs = {})
%_param_constant217 : [#users=1] = get_attr[target=_param_constant217]
%t_71 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant217,), kwargs = {})
%view_197 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_161, [128, 4096]), kwargs = {})
%_param_constant218 : [#users=1] = get_attr[target=_param_constant218]
%addmm_71 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant218, %view_197, %t_71), kwargs = {})
%view_198 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_71, [1, 128, 1024]), kwargs = {})
%add_107 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_104, %view_198), kwargs = {})
%_param_constant219 : [#users=1] = get_attr[target=_param_constant219]
%_param_constant220 : [#users=1] = get_attr[target=_param_constant220]
%native_layer_norm_37 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_107, [1024], %_param_constant219, %_param_constant220, 1e-05), kwargs = {})
%getitem_111 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_37, 0), kwargs = {})
%getitem_112 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_37, 1), kwargs = {})
%getitem_113 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_37, 2), kwargs = {})
%_param_constant221 : [#users=1] = get_attr[target=_param_constant221]
%t_72 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant221,), kwargs = {})
%view_199 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_111, [128, 1024]), kwargs = {})
%_param_constant222 : [#users=1] = get_attr[target=_param_constant222]
%addmm_72 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant222, %view_199, %t_72), kwargs = {})
%view_200 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_72, [1, 128, 3072]), kwargs = {})
%_reshape_alias_72 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_200, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_55 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_72, -1, 0, 64), kwargs = {})
%slice_56 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_72, -1, 64, 128), kwargs = {})
%slice_57 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_72, -1, 128, 192), kwargs = {})
%transpose_36 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_55, 1, 2), kwargs = {})
%_reshape_alias_73 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_36, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_36 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_56, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_74 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_36, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_36 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_73, %_reshape_alias_74), kwargs = {})
%mul_162 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_36, 0.006944444444444444), kwargs = {})
%_tensor_constant80 : [#users=1] = get_attr[target=_tensor_constant80]
%add_108 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_162, %_tensor_constant80), kwargs = {})
%view_201 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_108, [-1, 16, 128, 128]), kwargs = {})
%mul_163 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_201, 18), kwargs = {})
%_tensor_constant26_18 : [#users=1] = get_attr[target=_tensor_constant26]
%add_109 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_163, %_tensor_constant26_18), kwargs = {})
%_tensor_constant81 : [#users=1] = get_attr[target=_tensor_constant81]
%maximum_18 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_109, %_tensor_constant81), kwargs = {})
%_softmax_18 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_18, -1, False), kwargs = {})
%detach_36 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_18,), kwargs = {})
%_tensor_constant82 : [#users=1] = get_attr[target=_tensor_constant82]
%mul_164 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_18, %_tensor_constant82), kwargs = {})
%view_202 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_164, [16, 128, 128]), kwargs = {})
%transpose_37 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_57, 1, 2), kwargs = {})
%_reshape_alias_75 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_37, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_37 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_202, %_reshape_alias_75), kwargs = {})
%view_203 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_37, [1, 16, 128, 64]), kwargs = {})
%permute_37 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_203, [0, 2, 1, 3]), kwargs = {})
%clone_18 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_37,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_18 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_18, [1, 128, 1024]), kwargs = {})
%_param_constant223 : [#users=1] = get_attr[target=_param_constant223]
%t_73 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant223,), kwargs = {})
%view_204 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_18, [128, 1024]), kwargs = {})
%_param_constant224 : [#users=1] = get_attr[target=_param_constant224]
%addmm_73 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant224, %view_204, %t_73), kwargs = {})
%view_205 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_73, [1, 128, 1024]), kwargs = {})
%add_110 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_107, %view_205), kwargs = {})
%_param_constant225 : [#users=1] = get_attr[target=_param_constant225]
%_param_constant226 : [#users=1] = get_attr[target=_param_constant226]
%native_layer_norm_38 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_110, [1024], %_param_constant225, %_param_constant226, 1e-05), kwargs = {})
%getitem_114 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_38, 0), kwargs = {})
%getitem_115 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_38, 1), kwargs = {})
%getitem_116 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_38, 2), kwargs = {})
%_param_constant227 : [#users=1] = get_attr[target=_param_constant227]
%t_74 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant227,), kwargs = {})
%view_206 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_114, [128, 1024]), kwargs = {})
%_param_constant228 : [#users=1] = get_attr[target=_param_constant228]
%addmm_74 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant228, %view_206, %t_74), kwargs = {})
%view_207 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_74, [1, 128, 4096]), kwargs = {})
%mul_165 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_207, 0.5), kwargs = {})
%mul_166 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_207, 0.79788456), kwargs = {})
%mul_167 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_207, 0.044715), kwargs = {})
%mul_168 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_167, %view_207), kwargs = {})
%add_111 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_168, 1), kwargs = {})
%mul_169 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_166, %add_111), kwargs = {})
%tanh_18 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_169,), kwargs = {})
%detach_37 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_18,), kwargs = {})
%add_112 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_18, 1.0), kwargs = {})
%mul_170 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_165, %add_112), kwargs = {})
%_param_constant229 : [#users=1] = get_attr[target=_param_constant229]
%t_75 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant229,), kwargs = {})
%view_208 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_170, [128, 4096]), kwargs = {})
%_param_constant230 : [#users=1] = get_attr[target=_param_constant230]
%addmm_75 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant230, %view_208, %t_75), kwargs = {})
%view_209 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_75, [1, 128, 1024]), kwargs = {})
%add_113 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_110, %view_209), kwargs = {})
%_param_constant231 : [#users=1] = get_attr[target=_param_constant231]
%_param_constant232 : [#users=1] = get_attr[target=_param_constant232]
%native_layer_norm_39 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_113, [1024], %_param_constant231, %_param_constant232, 1e-05), kwargs = {})
%getitem_117 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_39, 0), kwargs = {})
%getitem_118 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_39, 1), kwargs = {})
%getitem_119 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_39, 2), kwargs = {})
%_param_constant233 : [#users=1] = get_attr[target=_param_constant233]
%t_76 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant233,), kwargs = {})
%view_210 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_117, [128, 1024]), kwargs = {})
%_param_constant234 : [#users=1] = get_attr[target=_param_constant234]
%addmm_76 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant234, %view_210, %t_76), kwargs = {})
%view_211 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_76, [1, 128, 3072]), kwargs = {})
%_reshape_alias_76 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_211, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_58 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_76, -1, 0, 64), kwargs = {})
%slice_59 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_76, -1, 64, 128), kwargs = {})
%slice_60 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_76, -1, 128, 192), kwargs = {})
%transpose_38 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_58, 1, 2), kwargs = {})
%_reshape_alias_77 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_38, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_38 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_59, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_78 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_38, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_38 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_77, %_reshape_alias_78), kwargs = {})
%mul_171 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_38, 0.006578947368421052), kwargs = {})
%_tensor_constant83 : [#users=1] = get_attr[target=_tensor_constant83]
%add_114 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_171, %_tensor_constant83), kwargs = {})
%view_212 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_114, [-1, 16, 128, 128]), kwargs = {})
%mul_172 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_212, 19), kwargs = {})
%_tensor_constant26_19 : [#users=1] = get_attr[target=_tensor_constant26]
%add_115 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_172, %_tensor_constant26_19), kwargs = {})
%_tensor_constant84 : [#users=1] = get_attr[target=_tensor_constant84]
%maximum_19 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_115, %_tensor_constant84), kwargs = {})
%_softmax_19 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_19, -1, False), kwargs = {})
%detach_38 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_19,), kwargs = {})
%_tensor_constant85 : [#users=1] = get_attr[target=_tensor_constant85]
%mul_173 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_19, %_tensor_constant85), kwargs = {})
%view_213 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_173, [16, 128, 128]), kwargs = {})
%transpose_39 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_60, 1, 2), kwargs = {})
%_reshape_alias_79 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_39, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_39 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_213, %_reshape_alias_79), kwargs = {})
%view_214 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_39, [1, 16, 128, 64]), kwargs = {})
%permute_39 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_214, [0, 2, 1, 3]), kwargs = {})
%clone_19 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_39,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_19 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_19, [1, 128, 1024]), kwargs = {})
%_param_constant235 : [#users=1] = get_attr[target=_param_constant235]
%t_77 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant235,), kwargs = {})
%view_215 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_19, [128, 1024]), kwargs = {})
%_param_constant236 : [#users=1] = get_attr[target=_param_constant236]
%addmm_77 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant236, %view_215, %t_77), kwargs = {})
%view_216 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_77, [1, 128, 1024]), kwargs = {})
%add_116 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_113, %view_216), kwargs = {})
%_param_constant237 : [#users=1] = get_attr[target=_param_constant237]
%_param_constant238 : [#users=1] = get_attr[target=_param_constant238]
%native_layer_norm_40 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_116, [1024], %_param_constant237, %_param_constant238, 1e-05), kwargs = {})
%getitem_120 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_40, 0), kwargs = {})
%getitem_121 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_40, 1), kwargs = {})
%getitem_122 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_40, 2), kwargs = {})
%_param_constant239 : [#users=1] = get_attr[target=_param_constant239]
%t_78 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant239,), kwargs = {})
%view_217 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_120, [128, 1024]), kwargs = {})
%_param_constant240 : [#users=1] = get_attr[target=_param_constant240]
%addmm_78 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant240, %view_217, %t_78), kwargs = {})
%view_218 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_78, [1, 128, 4096]), kwargs = {})
%mul_174 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_218, 0.5), kwargs = {})
%mul_175 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_218, 0.79788456), kwargs = {})
%mul_176 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_218, 0.044715), kwargs = {})
%mul_177 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_176, %view_218), kwargs = {})
%add_117 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_177, 1), kwargs = {})
%mul_178 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_175, %add_117), kwargs = {})
%tanh_19 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_178,), kwargs = {})
%detach_39 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_19,), kwargs = {})
%add_118 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_19, 1.0), kwargs = {})
%mul_179 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_174, %add_118), kwargs = {})
%_param_constant241 : [#users=1] = get_attr[target=_param_constant241]
%t_79 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant241,), kwargs = {})
%view_219 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_179, [128, 4096]), kwargs = {})
%_param_constant242 : [#users=1] = get_attr[target=_param_constant242]
%addmm_79 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant242, %view_219, %t_79), kwargs = {})
%view_220 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_79, [1, 128, 1024]), kwargs = {})
%add_119 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_116, %view_220), kwargs = {})
%_param_constant243 : [#users=1] = get_attr[target=_param_constant243]
%_param_constant244 : [#users=1] = get_attr[target=_param_constant244]
%native_layer_norm_41 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_119, [1024], %_param_constant243, %_param_constant244, 1e-05), kwargs = {})
%getitem_123 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_41, 0), kwargs = {})
%getitem_124 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_41, 1), kwargs = {})
%getitem_125 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_41, 2), kwargs = {})
%_param_constant245 : [#users=1] = get_attr[target=_param_constant245]
%t_80 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant245,), kwargs = {})
%view_221 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_123, [128, 1024]), kwargs = {})
%_param_constant246 : [#users=1] = get_attr[target=_param_constant246]
%addmm_80 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant246, %view_221, %t_80), kwargs = {})
%view_222 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_80, [1, 128, 3072]), kwargs = {})
%_reshape_alias_80 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_222, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_61 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_80, -1, 0, 64), kwargs = {})
%slice_62 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_80, -1, 64, 128), kwargs = {})
%slice_63 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_80, -1, 128, 192), kwargs = {})
%transpose_40 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_61, 1, 2), kwargs = {})
%_reshape_alias_81 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_40, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_40 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_62, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_82 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_40, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_40 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_81, %_reshape_alias_82), kwargs = {})
%mul_180 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_40, 0.00625), kwargs = {})
%_tensor_constant86 : [#users=1] = get_attr[target=_tensor_constant86]
%add_120 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_180, %_tensor_constant86), kwargs = {})
%view_223 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_120, [-1, 16, 128, 128]), kwargs = {})
%mul_181 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_223, 20), kwargs = {})
%_tensor_constant26_20 : [#users=1] = get_attr[target=_tensor_constant26]
%add_121 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_181, %_tensor_constant26_20), kwargs = {})
%_tensor_constant87 : [#users=1] = get_attr[target=_tensor_constant87]
%maximum_20 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_121, %_tensor_constant87), kwargs = {})
%_softmax_20 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_20, -1, False), kwargs = {})
%detach_40 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_20,), kwargs = {})
%_tensor_constant88 : [#users=1] = get_attr[target=_tensor_constant88]
%mul_182 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_20, %_tensor_constant88), kwargs = {})
%view_224 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_182, [16, 128, 128]), kwargs = {})
%transpose_41 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_63, 1, 2), kwargs = {})
%_reshape_alias_83 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_41, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_41 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_224, %_reshape_alias_83), kwargs = {})
%view_225 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_41, [1, 16, 128, 64]), kwargs = {})
%permute_41 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_225, [0, 2, 1, 3]), kwargs = {})
%clone_20 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_41,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_20 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_20, [1, 128, 1024]), kwargs = {})
%_param_constant247 : [#users=1] = get_attr[target=_param_constant247]
%t_81 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant247,), kwargs = {})
%view_226 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_20, [128, 1024]), kwargs = {})
%_param_constant248 : [#users=1] = get_attr[target=_param_constant248]
%addmm_81 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant248, %view_226, %t_81), kwargs = {})
%view_227 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_81, [1, 128, 1024]), kwargs = {})
%add_122 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_119, %view_227), kwargs = {})
%_param_constant249 : [#users=1] = get_attr[target=_param_constant249]
%_param_constant250 : [#users=1] = get_attr[target=_param_constant250]
%native_layer_norm_42 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_122, [1024], %_param_constant249, %_param_constant250, 1e-05), kwargs = {})
%getitem_126 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_42, 0), kwargs = {})
%getitem_127 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_42, 1), kwargs = {})
%getitem_128 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_42, 2), kwargs = {})
%_param_constant251 : [#users=1] = get_attr[target=_param_constant251]
%t_82 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant251,), kwargs = {})
%view_228 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_126, [128, 1024]), kwargs = {})
%_param_constant252 : [#users=1] = get_attr[target=_param_constant252]
%addmm_82 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant252, %view_228, %t_82), kwargs = {})
%view_229 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_82, [1, 128, 4096]), kwargs = {})
%mul_183 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_229, 0.5), kwargs = {})
%mul_184 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_229, 0.79788456), kwargs = {})
%mul_185 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_229, 0.044715), kwargs = {})
%mul_186 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_185, %view_229), kwargs = {})
%add_123 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_186, 1), kwargs = {})
%mul_187 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_184, %add_123), kwargs = {})
%tanh_20 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_187,), kwargs = {})
%detach_41 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_20,), kwargs = {})
%add_124 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_20, 1.0), kwargs = {})
%mul_188 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_183, %add_124), kwargs = {})
%_param_constant253 : [#users=1] = get_attr[target=_param_constant253]
%t_83 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant253,), kwargs = {})
%view_230 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_188, [128, 4096]), kwargs = {})
%_param_constant254 : [#users=1] = get_attr[target=_param_constant254]
%addmm_83 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant254, %view_230, %t_83), kwargs = {})
%view_231 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_83, [1, 128, 1024]), kwargs = {})
%add_125 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_122, %view_231), kwargs = {})
%_param_constant255 : [#users=1] = get_attr[target=_param_constant255]
%_param_constant256 : [#users=1] = get_attr[target=_param_constant256]
%native_layer_norm_43 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_125, [1024], %_param_constant255, %_param_constant256, 1e-05), kwargs = {})
%getitem_129 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_43, 0), kwargs = {})
%getitem_130 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_43, 1), kwargs = {})
%getitem_131 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_43, 2), kwargs = {})
%_param_constant257 : [#users=1] = get_attr[target=_param_constant257]
%t_84 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant257,), kwargs = {})
%view_232 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_129, [128, 1024]), kwargs = {})
%_param_constant258 : [#users=1] = get_attr[target=_param_constant258]
%addmm_84 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant258, %view_232, %t_84), kwargs = {})
%view_233 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_84, [1, 128, 3072]), kwargs = {})
%_reshape_alias_84 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_233, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_64 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_84, -1, 0, 64), kwargs = {})
%slice_65 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_84, -1, 64, 128), kwargs = {})
%slice_66 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_84, -1, 128, 192), kwargs = {})
%transpose_42 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_64, 1, 2), kwargs = {})
%_reshape_alias_85 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_42, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_42 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_65, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_86 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_42, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_42 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_85, %_reshape_alias_86), kwargs = {})
%mul_189 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_42, 0.005952380952380952), kwargs = {})
%_tensor_constant89 : [#users=1] = get_attr[target=_tensor_constant89]
%add_126 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_189, %_tensor_constant89), kwargs = {})
%view_234 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_126, [-1, 16, 128, 128]), kwargs = {})
%mul_190 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_234, 21), kwargs = {})
%_tensor_constant26_21 : [#users=1] = get_attr[target=_tensor_constant26]
%add_127 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_190, %_tensor_constant26_21), kwargs = {})
%_tensor_constant90 : [#users=1] = get_attr[target=_tensor_constant90]
%maximum_21 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_127, %_tensor_constant90), kwargs = {})
%_softmax_21 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_21, -1, False), kwargs = {})
%detach_42 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_21,), kwargs = {})
%_tensor_constant91 : [#users=1] = get_attr[target=_tensor_constant91]
%mul_191 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_21, %_tensor_constant91), kwargs = {})
%view_235 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_191, [16, 128, 128]), kwargs = {})
%transpose_43 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_66, 1, 2), kwargs = {})
%_reshape_alias_87 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_43, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_43 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_235, %_reshape_alias_87), kwargs = {})
%view_236 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_43, [1, 16, 128, 64]), kwargs = {})
%permute_43 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_236, [0, 2, 1, 3]), kwargs = {})
%clone_21 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_43,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_21 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_21, [1, 128, 1024]), kwargs = {})
%_param_constant259 : [#users=1] = get_attr[target=_param_constant259]
%t_85 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant259,), kwargs = {})
%view_237 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_21, [128, 1024]), kwargs = {})
%_param_constant260 : [#users=1] = get_attr[target=_param_constant260]
%addmm_85 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant260, %view_237, %t_85), kwargs = {})
%view_238 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_85, [1, 128, 1024]), kwargs = {})
%add_128 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_125, %view_238), kwargs = {})
%_param_constant261 : [#users=1] = get_attr[target=_param_constant261]
%_param_constant262 : [#users=1] = get_attr[target=_param_constant262]
%native_layer_norm_44 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_128, [1024], %_param_constant261, %_param_constant262, 1e-05), kwargs = {})
%getitem_132 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_44, 0), kwargs = {})
%getitem_133 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_44, 1), kwargs = {})
%getitem_134 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_44, 2), kwargs = {})
%_param_constant263 : [#users=1] = get_attr[target=_param_constant263]
%t_86 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant263,), kwargs = {})
%view_239 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_132, [128, 1024]), kwargs = {})
%_param_constant264 : [#users=1] = get_attr[target=_param_constant264]
%addmm_86 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant264, %view_239, %t_86), kwargs = {})
%view_240 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_86, [1, 128, 4096]), kwargs = {})
%mul_192 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_240, 0.5), kwargs = {})
%mul_193 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_240, 0.79788456), kwargs = {})
%mul_194 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_240, 0.044715), kwargs = {})
%mul_195 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_194, %view_240), kwargs = {})
%add_129 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_195, 1), kwargs = {})
%mul_196 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_193, %add_129), kwargs = {})
%tanh_21 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_196,), kwargs = {})
%detach_43 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_21,), kwargs = {})
%add_130 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_21, 1.0), kwargs = {})
%mul_197 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_192, %add_130), kwargs = {})
%_param_constant265 : [#users=1] = get_attr[target=_param_constant265]
%t_87 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant265,), kwargs = {})
%view_241 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_197, [128, 4096]), kwargs = {})
%_param_constant266 : [#users=1] = get_attr[target=_param_constant266]
%addmm_87 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant266, %view_241, %t_87), kwargs = {})
%view_242 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_87, [1, 128, 1024]), kwargs = {})
%add_131 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_128, %view_242), kwargs = {})
%_param_constant267 : [#users=1] = get_attr[target=_param_constant267]
%_param_constant268 : [#users=1] = get_attr[target=_param_constant268]
%native_layer_norm_45 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_131, [1024], %_param_constant267, %_param_constant268, 1e-05), kwargs = {})
%getitem_135 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_45, 0), kwargs = {})
%getitem_136 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_45, 1), kwargs = {})
%getitem_137 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_45, 2), kwargs = {})
%_param_constant269 : [#users=1] = get_attr[target=_param_constant269]
%t_88 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant269,), kwargs = {})
%view_243 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_135, [128, 1024]), kwargs = {})
%_param_constant270 : [#users=1] = get_attr[target=_param_constant270]
%addmm_88 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant270, %view_243, %t_88), kwargs = {})
%view_244 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_88, [1, 128, 3072]), kwargs = {})
%_reshape_alias_88 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_244, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_67 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_88, -1, 0, 64), kwargs = {})
%slice_68 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_88, -1, 64, 128), kwargs = {})
%slice_69 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_88, -1, 128, 192), kwargs = {})
%transpose_44 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_67, 1, 2), kwargs = {})
%_reshape_alias_89 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_44, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_44 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_68, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_90 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_44, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_44 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_89, %_reshape_alias_90), kwargs = {})
%mul_198 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_44, 0.005681818181818182), kwargs = {})
%_tensor_constant92 : [#users=1] = get_attr[target=_tensor_constant92]
%add_132 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_198, %_tensor_constant92), kwargs = {})
%view_245 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_132, [-1, 16, 128, 128]), kwargs = {})
%mul_199 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_245, 22), kwargs = {})
%_tensor_constant26_22 : [#users=1] = get_attr[target=_tensor_constant26]
%add_133 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_199, %_tensor_constant26_22), kwargs = {})
%_tensor_constant93 : [#users=1] = get_attr[target=_tensor_constant93]
%maximum_22 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_133, %_tensor_constant93), kwargs = {})
%_softmax_22 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_22, -1, False), kwargs = {})
%detach_44 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_22,), kwargs = {})
%_tensor_constant94 : [#users=1] = get_attr[target=_tensor_constant94]
%mul_200 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_22, %_tensor_constant94), kwargs = {})
%view_246 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_200, [16, 128, 128]), kwargs = {})
%transpose_45 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_69, 1, 2), kwargs = {})
%_reshape_alias_91 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_45, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_45 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_246, %_reshape_alias_91), kwargs = {})
%view_247 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_45, [1, 16, 128, 64]), kwargs = {})
%permute_45 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_247, [0, 2, 1, 3]), kwargs = {})
%clone_22 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_45,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_22 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_22, [1, 128, 1024]), kwargs = {})
%_param_constant271 : [#users=1] = get_attr[target=_param_constant271]
%t_89 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant271,), kwargs = {})
%view_248 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_22, [128, 1024]), kwargs = {})
%_param_constant272 : [#users=1] = get_attr[target=_param_constant272]
%addmm_89 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant272, %view_248, %t_89), kwargs = {})
%view_249 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_89, [1, 128, 1024]), kwargs = {})
%add_134 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_131, %view_249), kwargs = {})
%_param_constant273 : [#users=1] = get_attr[target=_param_constant273]
%_param_constant274 : [#users=1] = get_attr[target=_param_constant274]
%native_layer_norm_46 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_134, [1024], %_param_constant273, %_param_constant274, 1e-05), kwargs = {})
%getitem_138 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_46, 0), kwargs = {})
%getitem_139 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_46, 1), kwargs = {})
%getitem_140 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_46, 2), kwargs = {})
%_param_constant275 : [#users=1] = get_attr[target=_param_constant275]
%t_90 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant275,), kwargs = {})
%view_250 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_138, [128, 1024]), kwargs = {})
%_param_constant276 : [#users=1] = get_attr[target=_param_constant276]
%addmm_90 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant276, %view_250, %t_90), kwargs = {})
%view_251 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_90, [1, 128, 4096]), kwargs = {})
%mul_201 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_251, 0.5), kwargs = {})
%mul_202 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_251, 0.79788456), kwargs = {})
%mul_203 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_251, 0.044715), kwargs = {})
%mul_204 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_203, %view_251), kwargs = {})
%add_135 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_204, 1), kwargs = {})
%mul_205 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_202, %add_135), kwargs = {})
%tanh_22 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_205,), kwargs = {})
%detach_45 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_22,), kwargs = {})
%add_136 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_22, 1.0), kwargs = {})
%mul_206 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_201, %add_136), kwargs = {})
%_param_constant277 : [#users=1] = get_attr[target=_param_constant277]
%t_91 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant277,), kwargs = {})
%view_252 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_206, [128, 4096]), kwargs = {})
%_param_constant278 : [#users=1] = get_attr[target=_param_constant278]
%addmm_91 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant278, %view_252, %t_91), kwargs = {})
%view_253 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_91, [1, 128, 1024]), kwargs = {})
%add_137 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_134, %view_253), kwargs = {})
%_param_constant279 : [#users=1] = get_attr[target=_param_constant279]
%_param_constant280 : [#users=1] = get_attr[target=_param_constant280]
%native_layer_norm_47 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_137, [1024], %_param_constant279, %_param_constant280, 1e-05), kwargs = {})
%getitem_141 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_47, 0), kwargs = {})
%getitem_142 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_47, 1), kwargs = {})
%getitem_143 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_47, 2), kwargs = {})
%_param_constant281 : [#users=1] = get_attr[target=_param_constant281]
%t_92 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant281,), kwargs = {})
%view_254 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_141, [128, 1024]), kwargs = {})
%_param_constant282 : [#users=1] = get_attr[target=_param_constant282]
%addmm_92 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant282, %view_254, %t_92), kwargs = {})
%view_255 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_92, [1, 128, 3072]), kwargs = {})
%_reshape_alias_92 : [#users=3] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_255, [1, 128, 16, 192], [393216, 3072, 192, 1]), kwargs = {})
%slice_70 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_92, -1, 0, 64), kwargs = {})
%slice_71 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_92, -1, 64, 128), kwargs = {})
%slice_72 : [#users=1] = call_function[target=torch.ops.aten.slice](args = (%_reshape_alias_92, -1, 128, 192), kwargs = {})
%transpose_46 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_70, 1, 2), kwargs = {})
%_reshape_alias_93 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_46, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%permute_46 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%slice_71, [0, 2, 3, 1]), kwargs = {})
%_reshape_alias_94 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%permute_46, [16, 64, 128], [192, 1, 3072]), kwargs = {})
%bmm_46 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%_reshape_alias_93, %_reshape_alias_94), kwargs = {})
%mul_207 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%bmm_46, 0.005434782608695652), kwargs = {})
%_tensor_constant95 : [#users=1] = get_attr[target=_tensor_constant95]
%add_138 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_207, %_tensor_constant95), kwargs = {})
%view_256 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%add_138, [-1, 16, 128, 128]), kwargs = {})
%mul_208 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_256, 23), kwargs = {})
%_tensor_constant26_23 : [#users=1] = get_attr[target=_tensor_constant26]
%add_139 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_208, %_tensor_constant26_23), kwargs = {})
%_tensor_constant96 : [#users=1] = get_attr[target=_tensor_constant96]
%maximum_23 : [#users=1] = call_function[target=torch.ops.aten.maximum](args = (%add_139, %_tensor_constant96), kwargs = {})
%_softmax_23 : [#users=2] = call_function[target=torch.ops.aten._softmax](args = (%maximum_23, -1, False), kwargs = {})
%detach_46 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%_softmax_23,), kwargs = {})
%_tensor_constant97 : [#users=1] = get_attr[target=_tensor_constant97]
%mul_209 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_softmax_23, %_tensor_constant97), kwargs = {})
%view_257 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_209, [16, 128, 128]), kwargs = {})
%transpose_47 : [#users=1] = call_function[target=torch.ops.aten.transpose](args = (%slice_72, 1, 2), kwargs = {})
%_reshape_alias_95 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%transpose_47, [16, 128, 64], [192, 3072, 1]), kwargs = {})
%bmm_47 : [#users=1] = call_function[target=torch.ops.aten.bmm](args = (%view_257, %_reshape_alias_95), kwargs = {})
%view_258 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%bmm_47, [1, 16, 128, 64]), kwargs = {})
%permute_47 : [#users=1] = call_function[target=torch.ops.aten.permute](args = (%view_258, [0, 2, 1, 3]), kwargs = {})
%clone_23 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%permute_47,), kwargs = {memory_format: torch.contiguous_format})
%_unsafe_view_23 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%clone_23, [1, 128, 1024]), kwargs = {})
%_param_constant283 : [#users=1] = get_attr[target=_param_constant283]
%t_93 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant283,), kwargs = {})
%view_259 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%_unsafe_view_23, [128, 1024]), kwargs = {})
%_param_constant284 : [#users=1] = get_attr[target=_param_constant284]
%addmm_93 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant284, %view_259, %t_93), kwargs = {})
%view_260 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_93, [1, 128, 1024]), kwargs = {})
%add_140 : [#users=2] = call_function[target=torch.ops.aten.add](args = (%add_137, %view_260), kwargs = {})
%_param_constant285 : [#users=1] = get_attr[target=_param_constant285]
%_param_constant286 : [#users=1] = get_attr[target=_param_constant286]
%native_layer_norm_48 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_140, [1024], %_param_constant285, %_param_constant286, 1e-05), kwargs = {})
%getitem_144 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_48, 0), kwargs = {})
%getitem_145 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_48, 1), kwargs = {})
%getitem_146 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_48, 2), kwargs = {})
%_param_constant287 : [#users=1] = get_attr[target=_param_constant287]
%t_94 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant287,), kwargs = {})
%view_261 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_144, [128, 1024]), kwargs = {})
%_param_constant288 : [#users=1] = get_attr[target=_param_constant288]
%addmm_94 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant288, %view_261, %t_94), kwargs = {})
%view_262 : [#users=4] = call_function[target=torch.ops.aten.view](args = (%addmm_94, [1, 128, 4096]), kwargs = {})
%mul_210 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_262, 0.5), kwargs = {})
%mul_211 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_262, 0.79788456), kwargs = {})
%mul_212 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%view_262, 0.044715), kwargs = {})
%mul_213 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_212, %view_262), kwargs = {})
%add_141 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%mul_213, 1), kwargs = {})
%mul_214 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_211, %add_141), kwargs = {})
%tanh_23 : [#users=2] = call_function[target=torch.ops.aten.tanh](args = (%mul_214,), kwargs = {})
%detach_47 : [#users=0] = call_function[target=torch.ops.aten.detach](args = (%tanh_23,), kwargs = {})
%add_142 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%tanh_23, 1.0), kwargs = {})
%mul_215 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%mul_210, %add_142), kwargs = {})
%_param_constant289 : [#users=1] = get_attr[target=_param_constant289]
%t_95 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant289,), kwargs = {})
%view_263 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%mul_215, [128, 4096]), kwargs = {})
%_param_constant290 : [#users=1] = get_attr[target=_param_constant290]
%addmm_95 : [#users=1] = call_function[target=torch.ops.aten.addmm](args = (%_param_constant290, %view_263, %t_95), kwargs = {})
%view_264 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%addmm_95, [1, 128, 1024]), kwargs = {})
%add_143 : [#users=1] = call_function[target=torch.ops.aten.add](args = (%add_140, %view_264), kwargs = {})
%_param_constant291 : [#users=1] = get_attr[target=_param_constant291]
%_param_constant292 : [#users=1] = get_attr[target=_param_constant292]
%native_layer_norm_49 : [#users=3] = call_function[target=torch.ops.aten.native_layer_norm](args = (%add_143, [1024], %_param_constant291, %_param_constant292, 1e-05), kwargs = {})
%getitem_147 : [#users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_49, 0), kwargs = {})
%getitem_148 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_49, 1), kwargs = {})
%getitem_149 : [#users=0] = call_function[target=operator.getitem](args = (%native_layer_norm_49, 2), kwargs = {})
%view_265 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem_147, [1, 128, 1024]), kwargs = {})
%_param_constant293 : [#users=1] = get_attr[target=_param_constant293]
%t_96 : [#users=1] = call_function[target=torch.ops.aten.t](args = (%_param_constant293,), kwargs = {})
%_reshape_alias_96 : [#users=1] = call_function[target=torch.ops.aten._reshape_alias](args = (%view_265, [128, 1024], [1024, 1]), kwargs = {})
%mm : [#users=1] = call_function[target=torch.ops.aten.mm](args = (%_reshape_alias_96, %t_96), kwargs = {})
%_unsafe_view_24 : [#users=1] = call_function[target=torch.ops.aten._unsafe_view](args = (%mm, [1, 128, 2]), kwargs = {})
%ne : [#users=1] = call_function[target=torch.ops.aten.ne](args = (%arg0_1, 3), kwargs = {})
%sum_1 : [#users=1] = call_function[target=torch.ops.aten.sum](args = (%ne, [-1]), kwargs = {})
%sub : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%sum_1, 1), kwargs = {})
%_tensor_constant98 : [#users=1] = get_attr[target=_tensor_constant98]
%index : [#users=1] = call_function[target=torch.ops.aten.index](args = (%_unsafe_view_24, [%_tensor_constant98, %sub]), kwargs = {})
return index
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment