Skip to content

Instantly share code, notes, and snippets.

@ysdede
Last active March 5, 2025 20:39
Show Gist options
  • Save ysdede/aedf98e5d41ab68b63e8f353af7af053 to your computer and use it in GitHub Desktop.
Training on 1 GPUs
Loading checkpoint shards: 100%
 3/3 [00:02<00:00,  1.50it/s]
Audio embed structure: Phi4MMAudioEmbedding(
(drop): Dropout(p=0.0, inplace=False)
(encoder): ConformerEncoder(
(embed): NemoConvSubsampling(
(out): Linear(in_features=10240, out_features=1024, bias=True)
(conv): Sequential(
(0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(1): ReLU()
(2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024)
(3): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
(4): ReLU()
(5): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024)
(6): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
(7): ReLU()
)
)
(pos_emb): AbsolutePositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
(relative_attention_bias_layer): T5RelativeAttentionLogitBias(
(bias_values): Embedding(1000, 16)
)
(encoders): MultiSequential(
(0): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(1): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(2): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(3): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(4): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(5): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(6): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(7): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(8): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(9): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(10): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(11): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(12): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(13): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(14): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(15): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(16): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(17): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(18): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(19): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(20): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(21): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(22): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(23): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(encoder_embedding): MeanVarianceNormLayer()
)
(audio_projection): ModuleDict(
(speech): Sequential(
(0): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=3072, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=32, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=32, out_features=3072, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(1): GELU(approximate='none')
(2): lora.Linear(
(base_layer): Linear(in_features=3072, out_features=3072, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3072, out_features=32, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=32, out_features=3072, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
(vision): Sequential(
(0): Linear(in_features=1024, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=3072, bias=True)
)
)
)
Conv layers: ['encoder.embed.out.weight', 'encoder.embed.out.bias', 'encoder.embed.conv.0.weight', 'encoder.embed.conv.0.bias', 'encoder.embed.conv.2.weight', 'encoder.embed.conv.2.bias', 'encoder.embed.conv.3.weight', 'encoder.embed.conv.3.bias', 'encoder.embed.conv.5.weight', 'encoder.embed.conv.5.bias', 'encoder.embed.conv.6.weight', 'encoder.embed.conv.6.bias', 'encoder.relative_attention_bias_layer.bias_values.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.bias', 
'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 
'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.weight', 
'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b1', 
'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 
'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 
'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 
'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.bias', 
'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.bias', 
'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.bias', 
'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b2', 
'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.weight', 
'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 
'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.bias', 'audio_projection.speech.0.base_layer.weight', 'audio_projection.speech.0.base_layer.bias', 'audio_projection.speech.0.lora_A.default.weight', 'audio_projection.speech.0.lora_B.default.weight', 'audio_projection.speech.2.base_layer.weight', 'audio_projection.speech.2.base_layer.bias', 'audio_projection.speech.2.lora_A.default.weight', 'audio_projection.speech.2.lora_B.default.weight', 'audio_projection.vision.0.weight', 'audio_projection.vision.0.bias', 'audio_projection.vision.2.weight', 'audio_projection.vision.2.bias']
Gradient accumulation steps: 1
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-35-f6dc1a090c48> in <cell line: 0>()
91 param_groups = [
92 # Audio encoder params from debug path
---> 93 {'params': model.base_model.model.model.embed_tokens_extend.audio_embed.audio_encoder.parameters(), 'lr': LEARNING_RATE/3},
94
95 # LoRA params from verified target modules
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
1929 if name in modules:
1930 return modules[name]
-> 1931 raise AttributeError(
1932 f"'{type(self).__name__}' object has no attribute '{name}'"
1933 )
AttributeError: 'Phi4MMAudioEmbedding' object has no attribute 'audio_encoder'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment