Last active
March 5, 2025 20:39
-
-
Save ysdede/aedf98e5d41ab68b63e8f353af7af053 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Training on 1 GPUs | |
Loading checkpoint shards: 100% | |
3/3 [00:02<00:00, 1.50it/s] | |
Audio embed structure: Phi4MMAudioEmbedding( | |
(drop): Dropout(p=0.0, inplace=False) | |
(encoder): ConformerEncoder( | |
(embed): NemoConvSubsampling( | |
(out): Linear(in_features=10240, out_features=1024, bias=True) | |
(conv): Sequential( | |
(0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
(1): ReLU() | |
(2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024) | |
(3): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1)) | |
(4): ReLU() | |
(5): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1024) | |
(6): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1)) | |
(7): ReLU() | |
) | |
) | |
(pos_emb): AbsolutePositionalEncoding( | |
(dropout): Dropout(p=0.0, inplace=False) | |
) | |
(relative_attention_bias_layer): T5RelativeAttentionLogitBias( | |
(bias_values): Embedding(1000, 16) | |
) | |
(encoders): MultiSequential( | |
(0): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(1): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(2): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(3): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(4): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(5): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(6): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(7): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(8): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(9): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(10): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(11): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(12): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(13): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(14): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(15): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(16): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(17): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(18): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(19): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(20): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(21): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(22): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
(23): CheckpointWrapper( | |
(_checkpoint_wrapped_module): ConformerEncoderLayer( | |
(feed_forward_in): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(self_attn): MultiHeadedAttention( | |
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_k): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_v): Linear(in_features=1024, out_features=1024, bias=True) | |
(linear_out): Linear(in_features=1024, out_features=1024, bias=True) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(quant_q): QuantStub() | |
(quant_x): QuantStub() | |
(dequant): DeQuantStub() | |
(ffunc): FloatFunctional( | |
(activation_post_process): Identity() | |
) | |
) | |
(conv): ConvModule( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(ln1): Identity() | |
(glu): GLUPointWiseConv( | |
(ext_pw_conv_1d): Conv1d(1024, 2048, kernel_size=(1,), stride=(1,)) | |
(glu_act): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
(bn_layer): Identity() | |
(ext_pw_conv_1d): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
(squeeze_excitation): Identity() | |
(act): Swish( | |
(act_fn): Sigmoid() | |
) | |
(dropout): Dropout(p=0.0, inplace=False) | |
(dw_sep_conv_1d): DepthWiseSeperableConv1d( | |
(dw_conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(2,), groups=1024) | |
(pw_conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,)) | |
) | |
) | |
(feed_forward_out): FeedForward( | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(net): Sequential( | |
(0): GLULinear( | |
(linear): Linear(in_features=1024, out_features=3072, bias=True) | |
(glu_act): GLU( | |
(act_fn): Swish( | |
(act_fn): Sigmoid() | |
) | |
) | |
) | |
(1): Dropout(p=0.0, inplace=False) | |
(2): Linear(in_features=1536, out_features=1024, bias=True) | |
(3): Dropout(p=0.0, inplace=False) | |
) | |
) | |
(layer_norm_att): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
) | |
(encoder_embedding): MeanVarianceNormLayer() | |
) | |
(audio_projection): ModuleDict( | |
(speech): Sequential( | |
(0): lora.Linear( | |
(base_layer): Linear(in_features=1024, out_features=3072, bias=True) | |
(lora_dropout): ModuleDict( | |
(default): Dropout(p=0.05, inplace=False) | |
) | |
(lora_A): ModuleDict( | |
(default): Linear(in_features=1024, out_features=32, bias=False) | |
) | |
(lora_B): ModuleDict( | |
(default): Linear(in_features=32, out_features=3072, bias=False) | |
) | |
(lora_embedding_A): ParameterDict() | |
(lora_embedding_B): ParameterDict() | |
(lora_magnitude_vector): ModuleDict() | |
) | |
(1): GELU(approximate='none') | |
(2): lora.Linear( | |
(base_layer): Linear(in_features=3072, out_features=3072, bias=True) | |
(lora_dropout): ModuleDict( | |
(default): Dropout(p=0.05, inplace=False) | |
) | |
(lora_A): ModuleDict( | |
(default): Linear(in_features=3072, out_features=32, bias=False) | |
) | |
(lora_B): ModuleDict( | |
(default): Linear(in_features=32, out_features=3072, bias=False) | |
) | |
(lora_embedding_A): ParameterDict() | |
(lora_embedding_B): ParameterDict() | |
(lora_magnitude_vector): ModuleDict() | |
) | |
) | |
(vision): Sequential( | |
(0): Linear(in_features=1024, out_features=3072, bias=True) | |
(1): GELU(approximate='none') | |
(2): Linear(in_features=3072, out_features=3072, bias=True) | |
) | |
) | |
) | |
Conv layers: ['encoder.embed.out.weight', 'encoder.embed.out.bias', 'encoder.embed.conv.0.weight', 'encoder.embed.conv.0.bias', 'encoder.embed.conv.2.weight', 'encoder.embed.conv.2.bias', 'encoder.embed.conv.3.weight', 'encoder.embed.conv.3.bias', 'encoder.embed.conv.5.weight', 'encoder.embed.conv.5.bias', 'encoder.embed.conv.6.weight', 'encoder.embed.conv.6.bias', 'encoder.relative_attention_bias_layer.bias_values.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.0._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.0._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.0._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.0._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.1._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.1._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.1._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.1._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.2._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.2._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.2._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.2._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.3._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.3._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.3._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.3._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.3._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.4._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.4._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.4._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.4._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.5._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.5._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.5._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.5._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.6._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.6._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.6._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.weight', 
'encoder.encoders.6._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.7._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.7._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.7._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.7._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.8._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.weight', 
'encoder.encoders.8._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.8._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.8._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.8._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.9._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.9._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 
'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.9._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.9._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.10._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 
'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.10._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.10._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.10._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.weight', 
'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.11._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.11._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.11._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm_att.bias', 
'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.11._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.12._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 
'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.12._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.12._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.12._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.weight', 
'encoder.encoders.13._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.13._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.13._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.13._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.14._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.14._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 
'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.14._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.14._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.15._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b1', 
'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.15._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.15._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.15._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 
'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.16._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.16._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 
'encoder.encoders.16._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.16._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.17._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 
'encoder.encoders.17._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.17._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.17._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.17._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_k.bias', 
'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.18._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.18._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.18._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.18._checkpoint_wrapped_module.layer_norm.bias', 
'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.19._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 
'encoder.encoders.19._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.19._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.19._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.20._checkpoint_wrapped_module.self_attn.linear_out.bias', 
'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.20._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.20._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.20._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.21._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.21._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 
'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.21._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.21._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.22._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.b2', 
'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.22._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.22._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.22._checkpoint_wrapped_module.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_in.net.2.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.weight', 
'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_q.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_k.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_v.bias', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.weight', 'encoder.encoders.23._checkpoint_wrapped_module.self_attn.linear_out.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b1', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.b2', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.glu.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.ext_pw_conv_1d.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.dw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.weight', 'encoder.encoders.23._checkpoint_wrapped_module.conv.dw_sep_conv_1d.pw_conv.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.layer_norm.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.0.linear.bias', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.weight', 'encoder.encoders.23._checkpoint_wrapped_module.feed_forward_out.net.2.bias', 
'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm_att.bias', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.weight', 'encoder.encoders.23._checkpoint_wrapped_module.layer_norm.bias', 'audio_projection.speech.0.base_layer.weight', 'audio_projection.speech.0.base_layer.bias', 'audio_projection.speech.0.lora_A.default.weight', 'audio_projection.speech.0.lora_B.default.weight', 'audio_projection.speech.2.base_layer.weight', 'audio_projection.speech.2.base_layer.bias', 'audio_projection.speech.2.lora_A.default.weight', 'audio_projection.speech.2.lora_B.default.weight', 'audio_projection.vision.0.weight', 'audio_projection.vision.0.bias', 'audio_projection.vision.2.weight', 'audio_projection.vision.2.bias'] | |
Gradient accumulation steps: 1 | |
--------------------------------------------------------------------------- | |
AttributeError Traceback (most recent call last) | |
<ipython-input-35-f6dc1a090c48> in <cell line: 0>() | |
91 param_groups = [ | |
92 # Audio encoder params from debug path | |
---> 93 {'params': model.base_model.model.model.embed_tokens_extend.audio_embed.audio_encoder.parameters(), 'lr': LEARNING_RATE/3}, | |
94 | |
95 # LoRA params from verified target modules | |
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name) | |
1929 if name in modules: | |
1930 return modules[name] | |
-> 1931 raise AttributeError( | |
1932 f"'{type(self).__name__}' object has no attribute '{name}'" | |
1933 ) | |
AttributeError: 'Phi4MMAudioEmbedding' object has no attribute 'audio_encoder' |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.