# -*- coding: utf-8 -*-
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis, flop_count_table
from pypapi import events, papi_high as high


def main():
    model = nn.Conv2d(256, 128, 3, padding=1)
    model.cpu()
    model.double()  # use float64 so PAPI_DP_OPS (double-precision ops) covers the whole forward pass
    model.eval()

    print("PAPI, theoretical")
    for bs in range(1, 13):
        input = torch.randn((bs, 256, 28, 28)).double()
        with torch.no_grad():
            # Measured count from PAPI hardware counters around one forward pass.
            for evt in ['PAPI_DP_OPS']:
                high.start_counters([getattr(events, evt)])
                _ = model(input)
                papi_flop = high.stop_counters()[0] / 1e9

            # Theoretical count from fvcore, for comparison.
            flop = FlopCountAnalysis(model, input).total() / 1e9
            flop *= 2  # different convention: fvcore counts one flop per multiply-add
            print(papi_flop, flop)


if __name__ == '__main__':
    main()
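For reference, a back-of-the-envelope estimate of what the script above should report (a sketch of my own, not part of the gist; the helper name and the numbers in the comments are mine): fvcore counts one "flop" per multiply-accumulate, while PAPI_DP_OPS counts individual double-precision operations, which is why the script doubles the fvcore number before comparing.

def conv2d_macs(batch, c_in, c_out, k, h_out, w_out):
    # one multiply-accumulate per (output element, input channel, kernel tap)
    return batch * c_out * h_out * w_out * c_in * k * k

# Conv2d(256, 128, 3, padding=1) on a (bs, 256, 28, 28) input keeps the 28x28 spatial size:
macs = conv2d_macs(1, 256, 128, 3, 28, 28)   # ~0.231e9 MACs per sample
print(macs / 1e9, 2 * macs / 1e9)            # fvcore-style count vs. the doubled (mul + add) count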
# -*- coding: utf-8 -*-
#File:
import argparse
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis, flop_count_table, flop_count_str
from pypapi import events, papi_high as high


def main():
    roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
    roberta.cpu()
    roberta.double()  # float64 so that PAPI_DP_OPS captures the whole forward pass
    roberta.eval()

    tgt_len = 50
    tokens = roberta.encode('hi' * (tgt_len - 3)).unsqueeze(0).to(device=roberta.device)
    assert tokens.numel() == tgt_len

    # FlopCountAnalysis calls the module it is given, so wrap extract_features() as a forward().
    class A(nn.Module):
        def forward(self, tokens):
            return self.m.extract_features(tokens)

    with torch.no_grad():
        # Measured count from PAPI hardware counters.
        for evt in ['PAPI_DP_OPS']:
            high.start_counters([getattr(events, evt)])
            features, _ = roberta.model.extract_features(tokens)
            papi_flops = high.stop_counters()
            print('total flops (papi, {})'.format(evt), papi_flops[0] / 1e9)

        # Theoretical count from fvcore.
        model = A()
        model.m = roberta.model
        flop = FlopCountAnalysis(model, tokens)
        print(flop_count_table(flop, max_depth=5, show_param_shapes=False))
        print(flop_count_str(flop))
        print("Total", flop.total() / 1e9)


if __name__ == '__main__':
    main()
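The throwaway class A above exists because FlopCountAnalysis traces the module's forward(), while the quantity of interest here is produced by extract_features(). A more general version of the same trick, as a sketch (MethodWrapper and method_name are illustrative names of mine, not part of the gist or of fvcore):

class MethodWrapper(nn.Module):
    """Expose an arbitrary method of `module` as forward() so fvcore can trace it."""
    def __init__(self, module, method_name):
        super().__init__()
        self.m = module                  # registered as a submodule, so per-module stats still work
        self.method_name = method_name

    def forward(self, *args, **kwargs):
        return getattr(self.m, self.method_name)(*args, **kwargs)

# usage (hypothetical): FlopCountAnalysis(MethodWrapper(roberta.model, 'extract_features'), tokens).total()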
total flops (papi, PAPI_DP_OPS) 9.003876419
| module | #parameters | #flops | | |
|:---------------------------------------------------------------|:--------------|:------------| | |
| model.m | 0.356G | 17.9G | | |
| m.encoder | 0.355G | 17.9G | | |
| m.encoder.sentence_encoder | 0.354G | 15.2G | | |
| m.encoder.sentence_encoder.embed_tokens | 51.5M | 0 | | |
| m.encoder.sentence_encoder.embed_positions | 0.526M | 0 | | |
| m.encoder.sentence_encoder.layernorm_embedding | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers | 0.302G | | | |
| m.encoder.sentence_encoder.layers.0 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.0.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.0.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.0.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.0.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.0.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.1 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.1.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.1.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.1.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.1.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.1.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.2 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.2.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.2.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.2.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.2.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.2.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.3 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.3.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.3.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.3.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.3.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.3.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.4 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.4.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.4.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.4.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.4.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.4.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.5 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.5.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.5.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.5.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.5.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.5.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.6 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.6.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.6.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.6.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.6.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.6.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.7 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.7.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.7.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.7.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.7.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.7.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.8 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.8.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.8.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.8.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.8.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.8.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.9 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.9.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.9.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.9.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.9.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.9.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.10 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.10.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.10.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.10.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.10.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.10.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.11 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.11.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.11.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.11.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.11.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.11.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.12 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.12.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.12.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.12.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.12.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.12.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.13 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.13.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.13.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.13.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.13.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.13.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.14 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.14.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.14.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.14.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.14.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.14.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.15 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.15.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.15.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.15.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.15.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.15.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.16 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.16.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.16.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.16.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.16.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.16.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.17 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.17.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.17.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.17.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.17.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.17.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.18 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.18.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.18.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.18.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.18.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.18.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.19 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.19.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.19.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.19.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.19.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.19.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.20 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.20.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.20.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.20.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.20.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.20.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.21 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.21.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.21.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.21.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.21.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.21.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.22 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.22.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.22.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.22.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.22.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.22.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.23 | 12.6M | 0.635G | | |
| m.encoder.sentence_encoder.layers.23.self_attn | 4.2M | 0.215G | | |
| m.encoder.sentence_encoder.layers.23.self_attn_layer_norm | 2.05K | 0.256M | | |
| m.encoder.sentence_encoder.layers.23.fc1 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.23.fc2 | 4.2M | 0.21G | | |
| m.encoder.sentence_encoder.layers.23.final_layer_norm | 2.05K | 0.256M | | |
| m.encoder.lm_head | 1.1M | 2.63G | | |
| m.encoder.lm_head.dense | 1.05M | 52.4M | | |
| m.encoder.lm_head.layer_norm | 2.05K | 0.256M | | |
| m.classification_heads.mnli | 1.05M | | | |
| m.classification_heads.mnli.dense | 1.05M | | | |
| m.classification_heads.mnli.out_proj | 3.08K | | | |
Input sizes (torch.Tensor only): [[50]]
N/A indicates a possibly missing statistic due to how the module was called. Missing values are still included in the parent's total.
A( | |
n_params: 0.356G, n_flops: 17.9G | |
(m): RobertaModel( | |
n_params: 0.356G, n_flops: 17.9G | |
(encoder): RobertaEncoder( | |
n_params: 0.355G, n_flops: 17.9G | |
(sentence_encoder): TransformerEncoder( | |
n_params: 0.354G, n_flops: 15.2G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(embed_tokens): Embedding( | |
50265, 1024, padding_idx=1 | |
n_params: 51.5M, n_flops: 0 | |
) | |
(embed_positions): LearnedPositionalEmbedding( | |
514, 1024, padding_idx=1 | |
n_params: 0.526M, n_flops: 0 | |
) | |
(layernorm_embedding): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(layers): ModuleList( | |
n_params: 0.302G, n_flops: N/A | |
(0): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(1): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(2): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(3): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(4): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(5): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(6): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(7): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(8): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(9): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(10): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(11): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(12): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(13): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(14): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(15): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(16): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(17): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(18): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(19): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(20): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(21): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(22): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
(23): TransformerEncoderLayer( | |
n_params: 12.6M, n_flops: 0.635G | |
(self_attn): MultiheadAttention( | |
n_params: 4.2M, n_flops: 0.215G | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(k_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(v_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(q_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
) | |
(self_attn_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
(fc1): Linear( | |
in_features=1024, out_features=4096, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(fc2): Linear( | |
in_features=4096, out_features=1024, bias=True | |
n_params: 4.2M, n_flops: 0.21G | |
) | |
(final_layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
) | |
) | |
(lm_head): RobertaLMHead( | |
n_params: 1.1M, n_flops: 2.63G | |
(dense): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: 52.4M | |
) | |
(layer_norm): LayerNorm( | |
(1024,), eps=1e-05, elementwise_affine=True | |
n_params: 2.05K, n_flops: 0.256M | |
) | |
) | |
) | |
(classification_heads): ModuleDict( | |
n_params: 1.05M, n_flops: N/A | |
(mnli): RobertaClassificationHead( | |
n_params: 1.05M, n_flops: N/A | |
(dense): Linear( | |
in_features=1024, out_features=1024, bias=True | |
n_params: 1.05M, n_flops: N/A | |
) | |
(dropout): Dropout( | |
p=0.3, inplace=False | |
n_params: 0, n_flops: N/A | |
) | |
(out_proj): Linear( | |
in_features=1024, out_features=3, bias=True | |
n_params: 3.08K, n_flops: N/A | |
) | |
) | |
) | |
) | |
) | |
Total 17.8611712
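As a sanity check on the table above (my own arithmetic, assuming the large matrix multiplications dominate and using fvcore's one-flop-per-multiply-add convention): with d = 1024, FFN width 4096, sequence length 50 and 24 layers, the per-layer and total counts come out very close to what fvcore reports.

d, ffn, T, layers = 1024, 4096, 50, 24
attn_proj = 4 * d * d * T       # q/k/v/out projections:   ~0.210G
attn_mm   = 2 * T * T * d       # QK^T and attn*V matmuls: ~0.005G
mlp       = 2 * d * ffn * T     # fc1 + fc2:               ~0.419G
per_layer = attn_proj + attn_mm + mlp
print(per_layer / 1e9)              # ~0.634G, matching each TransformerEncoderLayer row
print(layers * per_layer / 1e9)     # ~15.2G, matching m.encoder.sentence_encoder
print((layers * per_layer + d * d * T + d * 50265 * T) / 1e9)  # plus lm_head dense and vocab projection: ~17.8G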
test-conv-flop.py prints: