FullyShardedDataParallel(
  (_fsdp_wrapped_module): T5ForConditionalGeneration(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): FullyShardedDataParallel(
          (_fsdp_wrapped_module): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                  (relative_attention_bias): Embedding(32, 16)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): T5LayerFF(
                (DenseReluDense): T5DenseActDense(
                  (wi): Linear(in_features=1024, out_features=4096, bias=False)
                  (wo): Linear(in_features=4096, out_features=1024, bias=False)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (act): ReLU()
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
        (1-23): 23 x FullyShardedDataParallel(
          (_fsdp_wrapped_module): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): T5LayerFF(
                (DenseReluDense): T5DenseActDense(
                  (wi): Linear(in_features=1024, out_features=4096, bias=False)
                  (wo): Linear(in_features=4096, out_features=1024, bias=False)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (act): ReLU()
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
      )
      (final_layer_norm): T5LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (decoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): FullyShardedDataParallel(
          (_fsdp_wrapped_module): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                  (relative_attention_bias): Embedding(32, 16)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): T5LayerCrossAttention(
                (EncDecAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): T5LayerFF(
                (DenseReluDense): T5DenseActDense(
                  (wi): Linear(in_features=1024, out_features=4096, bias=False)
                  (wo): Linear(in_features=4096, out_features=1024, bias=False)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (act): ReLU()
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
        (1-23): 23 x FullyShardedDataParallel(
          (_fsdp_wrapped_module): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): T5LayerCrossAttention(
                (EncDecAttention): T5Attention(
                  (q): Linear(in_features=1024, out_features=1024, bias=False)
                  (k): Linear(in_features=1024, out_features=1024, bias=False)
                  (v): Linear(in_features=1024, out_features=1024, bias=False)
                  (o): Linear(in_features=1024, out_features=1024, bias=False)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): T5LayerFF(
                (DenseReluDense): T5DenseActDense(
                  (wi): Linear(in_features=1024, out_features=4096, bias=False)
                  (wo): Linear(in_features=4096, out_features=1024, bias=False)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (act): ReLU()
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
      )
      (final_layer_norm): T5LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (lm_head): Linear(in_features=1024, out_features=32128, bias=False)
  )
)
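
For context: this module tree is consistent with a t5-large checkpoint (d_model=1024, d_ff=4096, 16 heads, 24 encoder and 24 decoder blocks) wrapped in PyTorch FSDP with an auto-wrap policy that puts each T5Block in its own FSDP unit. The following is a minimal sketch of how such a printout might be produced; the checkpoint name and launch setup are assumptions, not taken from the gist itself.

# Hypothetical reconstruction: wrap each T5Block in its own FSDP unit,
# matching the per-block FullyShardedDataParallel nesting shown above.
import functools

import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers import T5ForConditionalGeneration
from transformers.models.t5.modeling_t5 import T5Block

dist.init_process_group(backend="nccl")  # assumes launch via torchrun

model = T5ForConditionalGeneration.from_pretrained("t5-large")  # assumed checkpoint

# Shard each transformer block separately; remaining modules (embeddings,
# lm_head, layer norms) fall into the root FSDP unit.
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={T5Block},
)
model = FSDP(model, auto_wrap_policy=auto_wrap_policy)

print(model)  # prints a module tree like the one above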