Skip to content

Instantly share code, notes, and snippets.

@ysdede
Last active March 5, 2025 20:39
Show Gist options
  • Save ysdede/aedf98e5d41ab68b63e8f353af7af053 to your computer and use it in GitHub Desktop.
Training on 1 GPUs
Loading checkpoint shards: 100%
 3/3 [00:02<00:00,  1.50it/s]
Audio embed structure: Phi4MMAudioEmbedding(
(drop): Dropout(p=0.0, inplace=False)
(encoder): ConformerEncoder(
(embed): NemoConvSubsampling(
(out): Linear(in_features=10240, out_features=1024, bias=True)
(conv): Sequential(
(0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(1): ReLU()
(2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024)
(3): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
(4): ReLU()
(5): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024)
(6): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
(7): ReLU()
)
)
(pos_emb): AbsolutePositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
(relative_attention_bias_layer): T5RelativeAttentionLogitBias(
(bias_values): Embedding(1000, 16)
)
(encoders): MultiSequential(
(0): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(1): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(2): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(3): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(4): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(5): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(6): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(7): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(8): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(9): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(10): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(11): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(12): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(13): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(14): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(15): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(16): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(17): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(18): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(19): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(20): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(21): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(22): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(23): CheckpointWrapper(
(_checkpoint_wrapped_module): ConformerEncoderLayer(
(feed_forward_in): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(quant_q): QuantStub()
(quant_x): QuantStub()
(dequant): DeQuantStub()
(ffunc): FloatFunctional(
(activation_post_process): Identity()
)
)
(conv): ConvModule(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln1): Identity()
(glu): GLUPointWiseConv(
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,))
(glu_act): Swish(
(act_fn): Sigmoid()
)
)
(bn_layer): Identity()
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(squeeze_excitation): Identity()
(act): Swish(
(act_fn): Sigmoid()
)
(dropout): Dropout(p=0.0, inplace=False)
(dw_sep_conv_1d): DepthWiseSeperableConv1d(
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024)
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
)
(feed_forward_out): FeedForward(
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(net): Sequential(
(0): GLULinear(
(linear): Linear(in_features=1024, out_features=3072, bias=True)
(glu_act): GLU(
(act_fn): Swish(
(act_fn): Sigmoid()
)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1536, out_features=1024, bias=True)
(3): Dropout(p=0.0, inplace=False)
)
)
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(encoder_embedding): MeanVarianceNormLayer()
)
(audio_projection): ModuleDict(
(speech): Sequential(
(0): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=3072, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=32, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=32, out_features=3072, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(1): GELU(approximate='none')
(2): lora.Linear(
(base_layer): Linear(in_features=3072, out_features=3072, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3072, out_features=32, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=32, out_features=3072, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
(vision): Sequential(
(0): Linear(in_features=1024, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=3072, bias=True)
)
)
)
Conv layers: ['encoder.embed.out.weight', 'encoder.embed.out.bias', 'encoder.embed.conv.0.weight', 'encoder.embed.conv.0.bias', 'encoder.embed.conv.2.weight', 'encoder.embed.conv.2.bias', 'encoder.embed.conv.3.weight', 'encoder.embed.conv.3.bias', 'encoder.embed.conv.5.weight', 'encoder.embed.conv.5.bias', 'encoder.embed.conv.6.weight', 'encoder.embed.conv.6.bias', 'encoder.relative_attention_bias_layer.bias_values.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.bias', 
'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 
'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.weight', 
'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b1', 
'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 
'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 
'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 
'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.bias', 
'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.bias', 
'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.bias', 
'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b2', 
'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.weight', 
'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 
'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.bias', 'audio_projection.speech.0.base_layer.weight', 'audio_projection.speech.0.base_layer.bias', 'audio_projection.speech.0.lora_A.default.weight', 'audio_projection.speech.0.lora_B.default.weight', 'audio_projection.speech.2.base_layer.weight', 'audio_projection.speech.2.base_layer.bias', 'audio_projection.speech.2.lora_A.default.weight', 'audio_projection.speech.2.lora_B.default.weight', 'audio_projection.vision.0.weight', 'audio_projection.vision.0.bias', 'audio_projection.vision.2.weight', 'audio_projection.vision.2.bias']
Gradient accumulation steps: 1
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-35-f6dc1a090c48> in <cell line: 0>()
91 param_groups = [
92 # Audio encoder params from debug path
---> 93 {'params': model.base_model.model.model.embed_tokens_extend.audio_embed.audio_encoder.parameters(), 'lr': LEARNING_RATE/3},
94
95 # LoRA params from verified target modules
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
1929 if name in modules:
1930 return modules[name]
-> 1931 raise AttributeError(
1932 f"'{type(self).__name__}' object has no attribute '{name}'"
1933 )
AttributeError: 'Phi4MMAudioEmbedding' object has no attribute 'audio_encoder'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment