Created
February 18, 2022 14:01
-
-
Save albertz/39813d93f2690b4c6d7347864a2b4b04 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dimension-tag helpers used to declare the axes referenced throughout this config.
from returnn.tf.util.data import (
    Dim,
    batch_dim,
    single_step_dim,
    SpatialDim,
    FeatureDim,
)

# Global config flags.
# NOTE(review): presumably select the TensorFlow backend and pin the RETURNN
# behavior version -- confirm against the RETURNN documentation.
use_tensorflow = True
behavior_version = 12

# Axes of the 'data' entry in extern_data (besides batch_dim).
time_dim = SpatialDim('time')
input_dim = FeatureDim('input', 10)

# Size-1 axis produced by 'conv_subsample_layer/split_dims'; it is the in_dim
# of the first convolution.
dummy_input_feature_dim = FeatureDim('dummy-input-feature-dim', 1)

# 'conv_subsample_layer/conv_layers.0': 3x3 filter axes and its out_dim.
filter_dim0_dim = SpatialDim('filter-dim0', 3)
filter_dim1_dim = SpatialDim('filter-dim1', 3)
intermediate_out_sub_sample_dim = FeatureDim('intermediate_out_sub_sample', 14)

# out_spatial_dims of 'conv_subsample_layer/pool'.
conv_subsample_layer_out_spatial_dim0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_dim = SpatialDim('conv_subsample_layer:out-spatial-dim1', 5)

# 'conv_subsample_layer/conv_layers.1': 3x3 filter axes and its out_dim.
# out_dim is also the feature axis used by the layers that follow.
filter_dim0_0_dim = SpatialDim('filter-dim0', 3)
filter_dim1_0_dim = SpatialDim('filter-dim1', 3)
out_dim = FeatureDim('out', 14)

# out_spatial_dims of 'conv_subsample_layer/pool_0'.
conv_subsample_layer_out_spatial_dim0_0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim1', 3)

# out_dim of 'conv_subsample_layer/merge_dims' (the two pooled spatial axes merged).
conv_subsample_layer_out_dim = SpatialDim('conv_subsample_layer:out_dim')

# Second axis of the 'linear_ff' weight matrices in the ffn1/ffn2 sub-networks.
ff_dim = FeatureDim('ff', 17)

# Head axis introduced by 'self_att/qkv_split_dims'.
num_heads_dim = SpatialDim('num_heads', 2)

# Replaces conv_subsample_layer_out_dim on keys/values in 'layers/0/self_att'
# (see 'k_new_dim' / 'v_new_dim').
layers_0_self_att_history_dim = SpatialDim('layers/0/self_att:history')

# Filter axis (size 32) of 'conv_block/depthwise_conv'.
filter_dim0_1_dim = SpatialDim('filter-dim0', 32)

# NOTE(review): presumably the layer-1 counterpart of
# layers_0_self_att_history_dim -- confirm where it is used.
layers_1_self_att_history_dim = SpatialDim('layers/1/self_att:history')
# Declares the single external input stream 'data': a float32 tensor with
# axes (batch_dim, time_dim, input_dim), flagged as available for inference.
extern_data = {
    'data': {
        'dim_tags': (batch_dim, time_dim, input_dim),
        'dtype': 'float32',
        'available_for_inference': True,
    },
}
network = { | |
'conv_subsample_layer': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'split_dims': { | |
'class': 'split_dims', | |
'from': 'base:data:data', | |
'axis': input_dim, | |
'dims': ( | |
input_dim, | |
dummy_input_feature_dim | |
), | |
'out_shape': {batch_dim, time_dim, input_dim, dummy_input_feature_dim} | |
}, | |
'conv_layers.0': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
filter_dim0_dim, | |
filter_dim1_dim, | |
dummy_input_feature_dim, | |
intermediate_out_sub_sample_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.21081851067789195, | |
'maxval': 0.21081851067789195, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'conv': { | |
'class': 'conv', | |
'from': 'base:split_dims', | |
'in_dim': dummy_input_feature_dim, | |
'in_spatial_dims': [ | |
time_dim, | |
input_dim | |
], | |
'out_dim': intermediate_out_sub_sample_dim, | |
'out_spatial_dims': [ | |
time_dim, | |
input_dim | |
], | |
'filter_size': [3, 3], | |
'padding': 'same', | |
'filter': 'filter', | |
'with_bias': True, | |
'bias': 'bias', | |
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'conv', | |
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
intermediate_out_sub_sample_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'filter': { | |
'class': 'variable', | |
'shape': [ | |
filter_dim0_dim, | |
filter_dim1_dim, | |
dummy_input_feature_dim, | |
intermediate_out_sub_sample_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
} | |
}, | |
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim}, | |
'name_scope': 'conv_layers/0' | |
}, | |
'relu': { | |
'class': 'activation', | |
'from': 'conv_layers.0/conv', | |
'activation': 'relu', | |
'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
}, | |
'pool': { | |
'class': 'pool', | |
'from': 'relu', | |
'mode': 'max', | |
'pool_size': (2, 2), | |
'padding': 'same', | |
'in_spatial_dims': [ | |
time_dim, | |
input_dim | |
], | |
'out_spatial_dims': [ | |
conv_subsample_layer_out_spatial_dim0_dim, | |
conv_subsample_layer_out_spatial_dim1_dim | |
], | |
'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'pool', | |
'dropout': 0.1, | |
'dropout_axis': intermediate_out_sub_sample_dim, | |
'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim} | |
}, | |
'conv_layers.1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
filter_dim0_0_dim, | |
filter_dim1_0_dim, | |
intermediate_out_sub_sample_dim, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.1543033499620919, | |
'maxval': 0.1543033499620919, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'conv': { | |
'class': 'conv', | |
'from': 'base:dropout_0', | |
'in_dim': intermediate_out_sub_sample_dim, | |
'in_spatial_dims': [ | |
conv_subsample_layer_out_spatial_dim0_dim, | |
conv_subsample_layer_out_spatial_dim1_dim | |
], | |
'out_dim': out_dim, | |
'out_spatial_dims': [ | |
conv_subsample_layer_out_spatial_dim0_dim, | |
conv_subsample_layer_out_spatial_dim1_dim | |
], | |
'filter_size': [3, 3], | |
'padding': 'same', | |
'filter': 'filter', | |
'with_bias': True, | |
'bias': 'bias', | |
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'conv', | |
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'filter': { | |
'class': 'variable', | |
'shape': [ | |
filter_dim0_0_dim, | |
filter_dim1_0_dim, | |
intermediate_out_sub_sample_dim, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim}, | |
'name_scope': 'conv_layers/1' | |
}, | |
'relu_0': { | |
'class': 'activation', | |
'from': 'conv_layers.1/conv', | |
'activation': 'relu', | |
'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
}, | |
'pool_0': { | |
'class': 'pool', | |
'from': 'relu_0', | |
'mode': 'max', | |
'pool_size': (2, 2), | |
'padding': 'same', | |
'in_spatial_dims': [ | |
conv_subsample_layer_out_spatial_dim0_dim, | |
conv_subsample_layer_out_spatial_dim1_dim | |
], | |
'out_spatial_dims': [ | |
conv_subsample_layer_out_spatial_dim0_0_dim, | |
conv_subsample_layer_out_spatial_dim1_0_dim | |
], | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim} | |
}, | |
'dropout_1': { | |
'class': 'dropout', | |
'from': 'pool_0', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim} | |
}, | |
'merge_dims': { | |
'class': 'merge_dims', | |
'from': 'dropout_1', | |
'axes': [ | |
conv_subsample_layer_out_spatial_dim0_0_dim, | |
conv_subsample_layer_out_spatial_dim1_0_dim | |
], | |
'out_dim': conv_subsample_layer_out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'merge_dims', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'linear': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim.copy(match_priority=1), | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.4629100498862757, | |
'maxval': 0.4629100498862757, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:conv_subsample_layer/merge_dims', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'dot', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim.copy(match_priority=1), | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'linear', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layers': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'0': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'layer_norm': { | |
'class': 'layer_norm', | |
'from': 'base:base:dropout_0', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'ffn1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'linear_ff': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
ff_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
ff_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'linear_ff', | |
'activation': 'swish', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'swish', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'linear_out': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
ff_dim, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:dropout_0', 'weight'], | |
'reduce': ff_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'linear_out', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'ffn1', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'constant': {'class': 'constant', 'value': 0.5}, | |
'mul': { | |
'class': 'combine', | |
'from': ['constant', 'dropout_0'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['mul', 'base:base:dropout_0'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_0': { | |
'class': 'layer_norm', | |
'from': 'add', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'self_att': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'qkv': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
3 * out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.32732683535398854, | |
'maxval': 0.32732683535398854, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_0', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
3 * out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
3 * out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'qkv_split_dims': { | |
'class': 'split_dims', | |
'from': 'qkv', | |
'axis': 3 * out_dim, | |
'dims': ( | |
num_heads_dim, | |
3 * out_dim.div_left(num_heads_dim) | |
), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
}, | |
'qkv_split': { | |
'class': 'split', | |
'from': 'qkv_split_dims', | |
'axis': 3 * out_dim.div_left(num_heads_dim), | |
'out_dims': ( | |
out_dim.div_left(num_heads_dim), | |
out_dim.div_left(num_heads_dim), | |
out_dim.div_left(num_heads_dim) | |
), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
}, | |
'k_new_dim': { | |
'class': 'reinterpret_data', | |
'set_dim_tags': { | |
conv_subsample_layer_out_dim: layers_0_self_att_history_dim | |
}, | |
'from': 'qkv_split/1', | |
'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'v_new_dim': { | |
'class': 'reinterpret_data', | |
'set_dim_tags': { | |
conv_subsample_layer_out_dim: layers_0_self_att_history_dim | |
}, | |
'from': 'qkv_split/2', | |
'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'dot_attention': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'constant': {'class': 'constant', 'value': 0.37796447300922725}, | |
'mul': { | |
'class': 'combine', | |
'from': ['base:qkv_split/0', 'constant'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'energy': { | |
'class': 'dot', | |
'from': ['mul', 'base:k_new_dim'], | |
'reduce': out_dim.div_left(num_heads_dim), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
}, | |
'att_weights': { | |
'class': 'softmax_over_spatial', | |
'from': 'energy', | |
'axis': layers_0_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
}, | |
'dropout': { | |
'class': 'dropout', | |
'from': 'att_weights', | |
'dropout': 0.1, | |
'dropout_axis': layers_0_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
}, | |
'att': { | |
'class': 'dot', | |
'from': ['dropout', 'base:v_new_dim'], | |
'reduce': layers_0_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'att', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'output_0': { | |
'class': 'merge_dims', | |
'from': 'dot_attention', | |
'axes': ( | |
num_heads_dim, | |
out_dim.div_left(num_heads_dim) | |
), | |
'out_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'output_0', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_0': { | |
'class': 'combine', | |
'from': ['self_att', 'add'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_1': { | |
'class': 'layer_norm', | |
'from': 'add_0', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'conv_block': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'positionwise_conv1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
2 * out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.3779644730092272, | |
'maxval': 0.3779644730092272, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_1', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
2 * out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
2 * out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'glu': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'split': { | |
'class': 'split', | |
'from': 'base:positionwise_conv1', | |
'axis': 2 * out_dim, | |
'out_dims': [ | |
out_dim, | |
out_dim | |
], | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'sigmoid': { | |
'class': 'activation', | |
'from': 'split/1', | |
'activation': 'sigmoid', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'mul': { | |
'class': 'combine', | |
'from': ['split/0', 'sigmoid'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'depthwise_conv': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
filter_dim0_1_dim, | |
out_dim // 14, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.11180339887498948, | |
'maxval': 0.11180339887498948, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'conv': { | |
'class': 'conv', | |
'from': 'base:glu', | |
'in_dim': out_dim, | |
'in_spatial_dims': [ | |
conv_subsample_layer_out_dim | |
], | |
'out_dim': out_dim, | |
'out_spatial_dims': [ | |
conv_subsample_layer_out_dim | |
], | |
'filter_size': [32], | |
'padding': 'same', | |
'groups': 14, | |
'filter': 'filter', | |
'with_bias': True, | |
'bias': 'bias', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'conv', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'filter': { | |
'class': 'variable', | |
'shape': [ | |
filter_dim0_1_dim, | |
out_dim // 14, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'norm': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'batch_norm': { | |
'class': 'batch_norm', | |
'from': 'base:depthwise_conv/conv', | |
'in_dim': out_dim, | |
'use_std': True, | |
'use_shift': True, | |
'param_version': 2, | |
'reuse_params': { | |
'map': { | |
'batch_norm/v2_mean': {'layer_output': 'running_mean'}, | |
'batch_norm/v2_variance': {'layer_output': 'running_variance'}, | |
'batch_norm/v2_gamma': {'layer_output': 'gamma'}, | |
'batch_norm/v2_beta': {'layer_output': 'beta'} | |
} | |
}, | |
'momentum': 0.1, | |
'epsilon': 0.001, | |
'masked_time': False, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'batch_norm', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'beta': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'gamma': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 1.0 | |
}, | |
'running_mean': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'trainable': False, | |
'init': 0.0 | |
}, | |
'running_variance': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'trainable': False, | |
'init': 1.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'norm', | |
'activation': 'swish', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'positionwise_conv2': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim.copy(match_priority=1), | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.4629100498862757, | |
'maxval': 0.4629100498862757, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:swish', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim.copy(match_priority=1), | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'positionwise_conv2', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_1': { | |
'class': 'dropout', | |
'from': 'conv_block', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_1': { | |
'class': 'combine', | |
'from': ['dropout_1', 'add_0'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_2': { | |
'class': 'layer_norm', | |
'from': 'add_1', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'ffn2': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'linear_ff': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
ff_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_2', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
ff_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'linear_ff', | |
'activation': 'swish', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'swish', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'linear_out': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
ff_dim, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:dropout_0', 'weight'], | |
'reduce': ff_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'linear_out', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_2': { | |
'class': 'dropout', | |
'from': 'ffn2', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'constant_0': {'class': 'constant', 'value': 0.5}, | |
'mul_0': { | |
'class': 'combine', | |
'from': ['constant_0', 'dropout_2'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_2': { | |
'class': 'combine', | |
'from': ['mul_0', 'add_1'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_3': { | |
'class': 'layer_norm', | |
'from': 'add_2', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'layer_norm_3', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'layer_norm': { | |
'class': 'layer_norm', | |
'from': 'base:0', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'ffn1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'linear_ff': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
ff_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
ff_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'linear_ff', | |
'activation': 'swish', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'swish', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'linear_out': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
ff_dim, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:dropout_0', 'weight'], | |
'reduce': ff_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'linear_out', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'ffn1', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'constant': {'class': 'constant', 'value': 0.5}, | |
'mul': { | |
'class': 'combine', | |
'from': ['constant', 'dropout_0'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['mul', 'base:0'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_0': { | |
'class': 'layer_norm', | |
'from': 'add', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'self_att': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'qkv': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
3 * out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.32732683535398854, | |
'maxval': 0.32732683535398854, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_0', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
3 * out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
3 * out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
}, | |
'qkv_split_dims': { | |
'class': 'split_dims', | |
'from': 'qkv', | |
'axis': 3 * out_dim, | |
'dims': ( | |
num_heads_dim, | |
3 * out_dim.div_left(num_heads_dim) | |
), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
}, | |
'qkv_split': { | |
'class': 'split', | |
'from': 'qkv_split_dims', | |
'axis': 3 * out_dim.div_left(num_heads_dim), | |
'out_dims': ( | |
out_dim.div_left(num_heads_dim), | |
out_dim.div_left(num_heads_dim), | |
out_dim.div_left(num_heads_dim) | |
), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
}, | |
'k_new_dim': { | |
'class': 'reinterpret_data', | |
'set_dim_tags': { | |
conv_subsample_layer_out_dim: layers_1_self_att_history_dim | |
}, | |
'from': 'qkv_split/1', | |
'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'v_new_dim': { | |
'class': 'reinterpret_data', | |
'set_dim_tags': { | |
conv_subsample_layer_out_dim: layers_1_self_att_history_dim | |
}, | |
'from': 'qkv_split/2', | |
'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'dot_attention': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'constant': {'class': 'constant', 'value': 0.37796447300922725}, | |
'mul': { | |
'class': 'combine', | |
'from': ['base:qkv_split/0', 'constant'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'energy': { | |
'class': 'dot', | |
'from': ['mul', 'base:k_new_dim'], | |
'reduce': out_dim.div_left(num_heads_dim), | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
}, | |
'att_weights': { | |
'class': 'softmax_over_spatial', | |
'from': 'energy', | |
'axis': layers_1_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
}, | |
'dropout': { | |
'class': 'dropout', | |
'from': 'att_weights', | |
'dropout': 0.1, | |
'dropout_axis': layers_1_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
}, | |
'att': { | |
'class': 'dot', | |
'from': ['dropout', 'base:v_new_dim'], | |
'reduce': layers_1_self_att_history_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'att', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
}, | |
'output_0': { | |
'class': 'merge_dims', | |
'from': 'dot_attention', | |
'axes': ( | |
num_heads_dim, | |
out_dim.div_left(num_heads_dim) | |
), | |
'out_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'output_0', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_0': { | |
'class': 'combine', | |
'from': ['self_att', 'add'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_1': { | |
'class': 'layer_norm', | |
'from': 'add_0', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'conv_block': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'positionwise_conv1': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
2 * out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.3779644730092272, | |
'maxval': 0.3779644730092272, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_1', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
2 * out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
2 * out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'glu': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'split': { | |
'class': 'split', | |
'from': 'base:positionwise_conv1', | |
'axis': 2 * out_dim, | |
'out_dims': [ | |
out_dim, | |
out_dim | |
], | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
}, | |
'sigmoid': { | |
'class': 'activation', | |
'from': 'split/1', | |
'activation': 'sigmoid', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'mul': { | |
'class': 'combine', | |
'from': ['split/0', 'sigmoid'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'depthwise_conv': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
filter_dim0_1_dim, | |
out_dim // 14, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.11180339887498948, | |
'maxval': 0.11180339887498948, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'conv': { | |
'class': 'conv', | |
'from': 'base:glu', | |
'in_dim': out_dim, | |
'in_spatial_dims': [ | |
conv_subsample_layer_out_dim | |
], | |
'out_dim': out_dim, | |
'out_spatial_dims': [ | |
conv_subsample_layer_out_dim | |
], | |
'filter_size': [32], | |
'padding': 'same', | |
'groups': 14, | |
'filter': 'filter', | |
'with_bias': True, | |
'bias': 'bias', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'conv', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'filter': { | |
'class': 'variable', | |
'shape': [ | |
filter_dim0_1_dim, | |
out_dim // 14, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'norm': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'batch_norm': { | |
'class': 'batch_norm', | |
'from': 'base:depthwise_conv/conv', | |
'in_dim': out_dim, | |
'use_std': True, | |
'use_shift': True, | |
'param_version': 2, | |
'reuse_params': { | |
'map': { | |
'batch_norm/v2_mean': {'layer_output': 'running_mean'}, | |
'batch_norm/v2_variance': {'layer_output': 'running_variance'}, | |
'batch_norm/v2_gamma': {'layer_output': 'gamma'}, | |
'batch_norm/v2_beta': {'layer_output': 'beta'} | |
} | |
}, | |
'momentum': 0.1, | |
'epsilon': 0.001, | |
'masked_time': False, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'batch_norm', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'beta': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
}, | |
'gamma': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 1.0 | |
}, | |
'running_mean': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'trainable': False, | |
'init': 0.0 | |
}, | |
'running_variance': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'trainable': False, | |
'init': 1.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'norm', | |
'activation': 'swish', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'positionwise_conv2': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim.copy(match_priority=1), | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.4629100498862757, | |
'maxval': 0.4629100498862757, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:swish', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim.copy(match_priority=1), | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'positionwise_conv2', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_1': { | |
'class': 'dropout', | |
'from': 'conv_block', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_1': { | |
'class': 'combine', | |
'from': ['dropout_1', 'add_0'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_2': { | |
'class': 'layer_norm', | |
'from': 'add_1', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'ffn2': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'linear_ff': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
out_dim, | |
ff_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:base:layer_norm_2', 'weight'], | |
'reduce': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
out_dim, | |
ff_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'swish': { | |
'class': 'activation', | |
'from': 'linear_ff', | |
'activation': 'swish', | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'dropout_0': { | |
'class': 'dropout', | |
'from': 'swish', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
}, | |
'linear_out': { | |
'class': 'subnetwork', | |
'from': [], | |
'subnetwork': { | |
'random': { | |
'class': 'random', | |
'shape': ( | |
ff_dim, | |
out_dim | |
), | |
'distribution': 'uniform', | |
'minval': -0.43994134506405985, | |
'maxval': 0.43994134506405985, | |
'dtype': 'float32', | |
'static': True | |
}, | |
'dot': { | |
'class': 'dot', | |
'from': ['base:dropout_0', 'weight'], | |
'reduce': ff_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add': { | |
'class': 'combine', | |
'from': ['dot', 'bias'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'weight': { | |
'class': 'variable', | |
'shape': [ | |
ff_dim, | |
out_dim | |
], | |
'param_name': 'param', | |
'init_by_layer': 'random' | |
}, | |
'bias': { | |
'class': 'variable', | |
'shape': [ | |
out_dim | |
], | |
'param_name': 'param', | |
'init': 0.0 | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'linear_out', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'dropout_2': { | |
'class': 'dropout', | |
'from': 'ffn2', | |
'dropout': 0.1, | |
'dropout_axis': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'constant_0': {'class': 'constant', 'value': 0.5}, | |
'mul_0': { | |
'class': 'combine', | |
'from': ['constant_0', 'dropout_2'], | |
'kind': 'mul', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'add_2': { | |
'class': 'combine', | |
'from': ['mul_0', 'add_1'], | |
'kind': 'add', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'layer_norm_3': { | |
'class': 'layer_norm', | |
'from': 'add_2', | |
'in_dim': out_dim, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'layer_norm_3', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': '1', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
}, | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
}, | |
'output': { | |
'class': 'copy', | |
'from': 'layers', | |
'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment