-
-
Save CookiePPP/ddbb8a5a9bf18c2e6f79ce3957bd4600 to your computer and use it in GitHub Desktop.
Stepwise Monotonic Attention
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Forked From:
    https://gist.github.com/mutiann/38a7638f75c21479582d7391490df37c
Implementation for https://arxiv.org/abs/1906.00672
Tips: You may use "hard" for hard inference, or "parallel" for training or
soft inference (in this fork the mode is selected with use_hard_attention).
You may have to tune score_bias_init, which, as in Raffel et al., 2017, is determined a priori to
suit the moving speed of the alignments, i.e. the speed of speech of your training corpus in TTS cases. So
score_bias_init=3.5 is a good value for our data, but not necessarily for yours, and our experiments find that the
results are sensitive to this bias: when the parameter deviates from the best value by even a small amount, say
0.5, the whole training process may fail. sigmoid_noise=2.0 is enough in our experiments, but if you find that the
resultant alignments are far from binary, adding more noise (or annealing the noise) might be useful. Other
hyperparameters in our experiments simply follow the original Tacotron2 settings, and they work.
'''
class StepwiseMonotonicAttention(nn.Module):
    """Stepwise monotonic attention (https://arxiv.org/abs/1906.00672).

    At every decoder step the attention mass on encoder position ``i`` either
    stays on ``i`` or moves forward to ``i + 1``; it can never skip a position
    or move backwards.

    Args:
        query_dim: Size of the last dimension of the decoder query.
        value_dim: Size of the last dimension of the encoder memory.
        attention_dim: Size of the hidden space in which query and memory
            are combined.
        sigmoid_noise: Standard deviation of the Gaussian noise added to the
            scores before the sigmoid. ``0`` disables the noise.
        score_bias_init: Initial value of the bias of the scoring layer.
        use_hard_attention: If True, use hard (binary) stay/move decisions;
            otherwise use the soft, differentiable recurrence.
    """

    def __init__(self, query_dim, value_dim, attention_dim,
                 sigmoid_noise=2.0, score_bias_init=3.5,
                 use_hard_attention=False):
        super(StepwiseMonotonicAttention, self).__init__()
        # No trailing commas here: a trailing comma would store a 1-tuple
        # instead of a registered, callable sub-module.
        self.query_layer = nn.Linear(query_dim, attention_dim, bias=False)
        # Not called inside this class; presumably callers use it to
        # precompute `processed_memory` once per utterance -- verify.
        self.memory_layer = nn.Linear(value_dim, attention_dim, bias=False)
        # NOTE(review): LinearNorm is defined elsewhere in the project; it is
        # assumed to expose a `.bias` parameter like nn.Linear -- confirm.
        self.v = LinearNorm(attention_dim, 1, bias=True)
        self.v.bias.data.fill_(score_bias_init)
        # Stored so get_alignment_energies() can pass it on.
        self.sigmoid_noise = sigmoid_noise
        self.use_hard_attention = use_hard_attention
        self.score_mask_value = 0.0

    def set_soft_attention(self):
        """Switch to the soft (differentiable) attention recurrence."""
        self.use_hard_attention = False

    def set_hard_attention(self):
        """Switch to hard (binary stay/move) attention."""
        self.use_hard_attention = True

    def monotonic_stepwise_attention(self, p_choose_i, previous_attention, hard_attention):
        """Advance the attention distribution by one decoder step.

        Args:
            p_choose_i: [B, memory_size] probability of staying on entry i.
            previous_attention: [B, memory_size] attention of the previous
                step (expected to be one-hot when ``hard_attention`` is True).
            hard_attention: Whether to take a hard stay/move decision.

        Returns:
            [B, memory_size] attention for the current step.
        """
        if hard_attention:
            # Given that previous_attention is one-hot: shift it right by one
            # position to obtain the "move to next entry" alternative.
            move_next_mask = F.pad(previous_attention[:, :-1], (1, 0))
            # Keep the reduced dim ([B, 1]) so the per-row decision broadcasts
            # over memory_size instead of being aligned with the wrong axis.
            stay_prob = torch.sum(p_choose_i * previous_attention, dim=1, keepdim=True)
            attention = torch.where(stay_prob > 0.5, previous_attention, move_next_mask)
        else:
            # Mass that stays on i, plus mass that arrives from i - 1.
            moved_in = F.pad(previous_attention[:, :-1] * (1.0 - p_choose_i[:, :-1]), (1, 0))
            attention = previous_attention * p_choose_i + moved_in
        return attention

    def _stepwise_monotonic_probability_fn(self, score, previous_alignments, sigmoid_noise, hard_attention):
        """Turn raw scores into alignments for the current step.

        Args:
            score: [B, enc_T] unnormalised stay scores.
            previous_alignments: [B, enc_T] alignments of the previous step.
            sigmoid_noise: Standard deviation of the pre-sigmoid noise.
            hard_attention: Whether to threshold instead of applying a sigmoid.

        Returns:
            [B, enc_T] alignments.
        """
        if sigmoid_noise > 0:
            noise = torch.randn(score.shape, device=score.device, dtype=score.dtype)
            # Out-of-place so the caller's tensor is left untouched.
            score = score + sigmoid_noise * noise
        if hard_attention:
            # When mode is hard, use a hard sigmoid.
            p_choose_i = (score > 0.).to(score.dtype)
        else:
            p_choose_i = score.sigmoid()
        return self.monotonic_stepwise_attention(p_choose_i, previous_alignments, hard_attention)

    def get_alignment_energies(self, query, processed_memory, previous_alignments):
        """Compute the alignments for one decoder step.

        Args:
            query: [B, query_dim] decoder query.
            processed_memory: [B, enc_T, attention_dim] projected memory.
            previous_alignments: [B, enc_T] alignments of the previous step.

        Returns:
            [B, enc_T] alignments.
        """
        # [B, 1, attention_dim]; broadcasting handles the enc_T axis. An
        # in-place add on an expand()-ed view is rejected by PyTorch.
        processed_query = self.query_layer(query.unsqueeze(1))
        processed = processed_query + processed_memory  # [B, enc_T, attention_dim]
        score = self.v(torch.tanh(processed)).squeeze(2)  # -> [B, enc_T]
        alignments = self._stepwise_monotonic_probability_fn(
            score, previous_alignments, self.sigmoid_noise, self.use_hard_attention)
        return alignments

    def forward(self, query, memory, processed_memory, previous_alignments,
                mask, attention_weights=None):
        """Compute the attention context for one decoder step.

        Args:
            query: [B, query_dim] decoder query.
            memory: [B, enc_T, enc_dim] encoder outputs.
            processed_memory: [B, enc_T, attention_dim] projected memory.
            previous_alignments: [B, enc_T] alignments of the previous step.
            mask: Optional [B, enc_T] boolean mask; True entries are filled
                with ``score_mask_value``.
            attention_weights: Optional precomputed [B, enc_T] weights. When
                given, they are used as-is and ``mask`` is not applied.

        Returns:
            Tuple ``(attention_context, attention_weights)`` with shapes
            [B, enc_dim] and [B, enc_T].
        """
        if attention_weights is None:
            alignment = self.get_alignment_energies(
                query, processed_memory, previous_alignments)
            if mask is not None:
                # Out-of-place and tracked by autograd, so masked positions
                # receive no gradient.
                alignment = alignment.masked_fill(mask, self.score_mask_value)  # [B, enc_T]
            attention_weights = alignment  # normalise?
        # [B, 1, enc_T] @ [B, enc_T, enc_dim] -> [B, 1, enc_dim]
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)  # -> [B, enc_dim]
        return attention_context, attention_weights  # [B, enc_dim], [B, enc_T]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment