CookiePPP · August 15, 2020 11:24
diff --git a/stepwise.py b/stepwise.py
 '''
 Forked From:
 https://gist.github.com/mutiann/38a7638f75c21479582d7391490df37c

 Implementation for https://arxiv.org/abs/1906.00672

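 Tips: Use hard attention (set_hard_attention(); "hard" in the original implementation) for hard inference, and
 soft attention (set_soft_attention(); "parallel" in the original implementation) for training or soft inference.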

 You will probably have to tune score_bias_init, which, as in Raffel et al., 2017, is chosen a priori to
 suit the moving speed of the alignments, i.e. the speaking rate of your training corpus in TTS cases.
 score_bias_init=3.5 works well for our data, but not necessarily for yours, and our experiments show that the
 results are sensitive to this bias: when the parameter deviates from the best value by even a small amount, say
 0.5, the whole training process may fail. sigmoid_noise=2.0 is enough in our experiments, but if you find that the
 resulting alignments are far from binary, adding more noise (or annealing the noise) might be useful. Other
 hyperparameters in our experiments simply follow the original Tacotron2 settings, and they work.
 '''

 class StepwiseMonotonicAttention(nn.Module):
     """Stepwise monotonic attention (https://arxiv.org/abs/1906.00672).

     At every decoder step the attention on each memory position either stays
     where it is or moves forward by exactly one position. In soft mode the
     stay/move decision is a sigmoid probability; in hard mode it is a
     thresholded 0/1 decision applied to a one-hot alignment.

     Args:
         query_dim: size of the last dimension of the decoder query.
         value_dim: size of the last dimension of the encoder memory.
         attention_dim: size of the hidden attention projection.
         sigmoid_noise: standard deviation of the Gaussian noise added to the
             scores before the (hard) sigmoid; 0 disables the noise. Stored
             as ``self.sigmoid_noise`` so it can be changed (e.g. annealed
             or set to 0 for inference) after construction.
         score_bias_init: initial value of the score bias.
         use_hard_attention: start in hard (True) or soft (False) mode.
     """

     def __init__(self, query_dim, value_dim, attention_dim,
                  sigmoid_noise=2.0, score_bias_init=3.5,
                  use_hard_attention=False):
         super(StepwiseMonotonicAttention, self).__init__()
         # These must be plain assignments: a trailing comma would store a
         # 1-tuple, which is neither callable nor registered as a submodule.
         self.query_layer = nn.Linear(query_dim, attention_dim, bias=False)
         # memory_layer is not called inside this class; presumably the caller
         # uses it to build `processed_memory` once per utterance.
         self.memory_layer = nn.Linear(value_dim, attention_dim, bias=False)
         self.v = LinearNorm(attention_dim, 1, bias=True)
         # NOTE(review): LinearNorm is defined outside this file. Some
         # implementations wrap the nn.Linear in a `linear_layer` attribute, so
         # look there first and fall back to `self.v.bias` -- confirm.
         bias_owner = getattr(self.v, 'linear_layer', self.v)
         bias_owner.bias.data.fill_(score_bias_init)
         self.sigmoid_noise = sigmoid_noise
         self.use_hard_attention = use_hard_attention
         self.score_mask_value = 0.0

     def set_soft_attention(self):
         """Switch to soft (probabilistic) attention."""
         self.use_hard_attention = False

     def set_hard_attention(self):
         """Switch to hard (one-hot) attention."""
         self.use_hard_attention = True

     def monotonic_stepwise_attention(self, p_choose_i, previous_attention, hard_attention):
         """Advance the alignment by one decoder step.

         Args:
             p_choose_i: [B, memory_size], probability of staying on entry i.
             previous_attention: [B, memory_size], alignment of the previous
                 step (expected to be one-hot when ``hard_attention`` is True).
             hard_attention: if True, each batch row either keeps its previous
                 alignment or shifts it forward by one position.

         Returns:
             The new alignment, [B, memory_size].
         """
         if hard_attention:
             # Previous alignment shifted one step forward; a zero enters at
             # position 0 and the last entry is dropped.
             move_next_mask = F.pad(previous_attention[:, :-1], (1, 0))
             # keepdim gives a [B, 1] condition, so torch.where broadcasts one
             # stay/move decision per batch row over the [B, memory_size] inputs.
             stay_prob = torch.sum(p_choose_i * previous_attention, dim=1, keepdim=True)
             attention = torch.where(stay_prob > 0.5, previous_attention, move_next_mask)
         else:
             # Mass that leaves entry i arrives at entry i + 1.
             moved = previous_attention[:, :-1] * (1.0 - p_choose_i[:, :-1])
             attention = previous_attention * p_choose_i + F.pad(moved, (1, 0))
         return attention

     def _stepwise_monotonic_probability_fn(self, score, previous_alignments, sigmoid_noise, hard_attention):
         """Turn raw scores into the next alignment.

         Args:
             score: [B, enc_T] unnormalised scores.
             previous_alignments: [B, enc_T] alignment of the previous step.
             sigmoid_noise: std of the Gaussian noise added to ``score``.
             hard_attention: use a hard threshold instead of a sigmoid.

         Returns:
             The new alignment, [B, enc_T].
         """
         if sigmoid_noise > 0:
             noise = torch.randn(score.shape, device=score.device, dtype=score.dtype)
             # Out-of-place so the caller's score tensor is left untouched.
             score = score + sigmoid_noise * noise
         if hard_attention:
             # When mode is hard, use a hard sigmoid
             p_choose_i = (score > 0.).to(score.dtype)
         else:
             p_choose_i = score.sigmoid()
         return self.monotonic_stepwise_attention(p_choose_i, previous_alignments, hard_attention)

     def get_alignment_energies(self, query, processed_memory, previous_alignments):
         """Compute the new alignment for one decoder step.

         Args:
             query: [B, query_dim] decoder state.
             processed_memory: [B, enc_T, attention_dim] projected memory.
             previous_alignments: [B, enc_T] alignment of the previous step.

         Returns:
             The new alignment, [B, enc_T].
         """
         processed_query = self.query_layer(query.unsqueeze(1))  # [B, 1, attention_dim]
         # Broadcasting add; an in-place add_ on an expand()-ed view is rejected
         # by PyTorch because the expanded elements share memory.
         processed = processed_query + processed_memory  # [B, enc_T, attention_dim]
         score = self.v(torch.tanh(processed)).squeeze(2)  # [B, enc_T, 1] -> [B, enc_T]

         # [B, enc_T], [B, enc_T] -> [B, enc_T]
         alignments = self._stepwise_monotonic_probability_fn(
             score, previous_alignments, self.sigmoid_noise, self.use_hard_attention)
         return alignments

     def forward(self, query, memory, processed_memory, previous_alignments,
                 mask, attention_weights=None):
         """Compute the attention context for one decoder step.

         Args:
             query: [B, query_dim] decoder state.
             memory: [B, enc_T, enc_dim] encoder outputs.
             processed_memory: [B, enc_T, attention_dim] projected memory.
             previous_alignments: [B, enc_T] alignment of the previous step.
             mask: optional boolean tensor broadcastable to [B, enc_T];
                 positions where it is True are set to ``score_mask_value``
                 (presumably padding -- confirm against the caller).
             attention_weights: optional precomputed [B, enc_T] weights; when
                 given they are used as-is and no alignment is computed.

         Returns:
             (attention_context [B, enc_dim], attention_weights [B, enc_T]).
         """
         if attention_weights is None:
             alignment = self.get_alignment_energies(
                 query, processed_memory, previous_alignments)

             if mask is not None:
                 # Out-of-place fill keeps the operation visible to autograd.
                 alignment = alignment.masked_fill(mask, self.score_mask_value)  # [B, enc_T]

             attention_weights = alignment  # normalise?
         # [B, 1, enc_T] @ [B, enc_T, enc_dim] -> [B, 1, enc_dim]
         attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
         attention_context = attention_context.squeeze(1)  # [B, 1, enc_dim] -> [B, enc_dim]

         return attention_context, attention_weights  # [B, enc_dim], [B, enc_T]
	'''
	Forked From:
	https://gist.github.com/mutiann/38a7638f75c21479582d7391490df37c

	Implementation for https://arxiv.org/abs/1906.00672

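	Tips: Use hard attention (set_hard_attention(); "hard" in the original implementation) for hard inference, and
	soft attention (set_soft_attention(); "parallel" in the original implementation) for training or soft inference.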

	You will probably have to tune score_bias_init, which, as in Raffel et al., 2017, is chosen a priori to
	suit the moving speed of the alignments, i.e. the speaking rate of your training corpus in TTS cases.
	score_bias_init=3.5 works well for our data, but not necessarily for yours, and our experiments show that the
	results are sensitive to this bias: when the parameter deviates from the best value by even a small amount, say
	0.5, the whole training process may fail. sigmoid_noise=2.0 is enough in our experiments, but if you find that the
	resulting alignments are far from binary, adding more noise (or annealing the noise) might be useful. Other
	hyperparameters in our experiments simply follow the original Tacotron2 settings, and they work.
	'''

	class StepwiseMonotonicAttention(nn.Module):
	    """Stepwise monotonic attention (https://arxiv.org/abs/1906.00672).

	    At every decoder step the attention on each memory position either stays
	    where it is or moves forward by exactly one position. In soft mode the
	    stay/move decision is a sigmoid probability; in hard mode it is a
	    thresholded 0/1 decision applied to a one-hot alignment.

	    Args:
	        query_dim: size of the last dimension of the decoder query.
	        value_dim: size of the last dimension of the encoder memory.
	        attention_dim: size of the hidden attention projection.
	        sigmoid_noise: standard deviation of the Gaussian noise added to the
	            scores before the (hard) sigmoid; 0 disables the noise. Stored
	            as ``self.sigmoid_noise`` so it can be changed (e.g. annealed
	            or set to 0 for inference) after construction.
	        score_bias_init: initial value of the score bias.
	        use_hard_attention: start in hard (True) or soft (False) mode.
	    """

	    def __init__(self, query_dim, value_dim, attention_dim,
	                 sigmoid_noise=2.0, score_bias_init=3.5,
	                 use_hard_attention=False):
	        super(StepwiseMonotonicAttention, self).__init__()
	        # These must be plain assignments: a trailing comma would store a
	        # 1-tuple, which is neither callable nor registered as a submodule.
	        self.query_layer = nn.Linear(query_dim, attention_dim, bias=False)
	        # memory_layer is not called inside this class; presumably the caller
	        # uses it to build `processed_memory` once per utterance.
	        self.memory_layer = nn.Linear(value_dim, attention_dim, bias=False)
	        self.v = LinearNorm(attention_dim, 1, bias=True)
	        # NOTE(review): LinearNorm is defined outside this file. Some
	        # implementations wrap the nn.Linear in a `linear_layer` attribute, so
	        # look there first and fall back to `self.v.bias` -- confirm.
	        bias_owner = getattr(self.v, 'linear_layer', self.v)
	        bias_owner.bias.data.fill_(score_bias_init)
	        self.sigmoid_noise = sigmoid_noise
	        self.use_hard_attention = use_hard_attention
	        self.score_mask_value = 0.0

	    def set_soft_attention(self):
	        """Switch to soft (probabilistic) attention."""
	        self.use_hard_attention = False

	    def set_hard_attention(self):
	        """Switch to hard (one-hot) attention."""
	        self.use_hard_attention = True

	    def monotonic_stepwise_attention(self, p_choose_i, previous_attention, hard_attention):
	        """Advance the alignment by one decoder step.

	        Args:
	            p_choose_i: [B, memory_size], probability of staying on entry i.
	            previous_attention: [B, memory_size], alignment of the previous
	                step (expected to be one-hot when ``hard_attention`` is True).
	            hard_attention: if True, each batch row either keeps its previous
	                alignment or shifts it forward by one position.

	        Returns:
	            The new alignment, [B, memory_size].
	        """
	        if hard_attention:
	            # Previous alignment shifted one step forward; a zero enters at
	            # position 0 and the last entry is dropped.
	            move_next_mask = F.pad(previous_attention[:, :-1], (1, 0))
	            # keepdim gives a [B, 1] condition, so torch.where broadcasts one
	            # stay/move decision per batch row over the [B, memory_size] inputs.
	            stay_prob = torch.sum(p_choose_i * previous_attention, dim=1, keepdim=True)
	            attention = torch.where(stay_prob > 0.5, previous_attention, move_next_mask)
	        else:
	            # Mass that leaves entry i arrives at entry i + 1.
	            moved = previous_attention[:, :-1] * (1.0 - p_choose_i[:, :-1])
	            attention = previous_attention * p_choose_i + F.pad(moved, (1, 0))
	        return attention

	    def _stepwise_monotonic_probability_fn(self, score, previous_alignments, sigmoid_noise, hard_attention):
	        """Turn raw scores into the next alignment.

	        Args:
	            score: [B, enc_T] unnormalised scores.
	            previous_alignments: [B, enc_T] alignment of the previous step.
	            sigmoid_noise: std of the Gaussian noise added to ``score``.
	            hard_attention: use a hard threshold instead of a sigmoid.

	        Returns:
	            The new alignment, [B, enc_T].
	        """
	        if sigmoid_noise > 0:
	            noise = torch.randn(score.shape, device=score.device, dtype=score.dtype)
	            # Out-of-place so the caller's score tensor is left untouched.
	            score = score + sigmoid_noise * noise
	        if hard_attention:
	            # When mode is hard, use a hard sigmoid
	            p_choose_i = (score > 0.).to(score.dtype)
	        else:
	            p_choose_i = score.sigmoid()
	        return self.monotonic_stepwise_attention(p_choose_i, previous_alignments, hard_attention)

	    def get_alignment_energies(self, query, processed_memory, previous_alignments):
	        """Compute the new alignment for one decoder step.

	        Args:
	            query: [B, query_dim] decoder state.
	            processed_memory: [B, enc_T, attention_dim] projected memory.
	            previous_alignments: [B, enc_T] alignment of the previous step.

	        Returns:
	            The new alignment, [B, enc_T].
	        """
	        processed_query = self.query_layer(query.unsqueeze(1))  # [B, 1, attention_dim]
	        # Broadcasting add; an in-place add_ on an expand()-ed view is rejected
	        # by PyTorch because the expanded elements share memory.
	        processed = processed_query + processed_memory  # [B, enc_T, attention_dim]
	        score = self.v(torch.tanh(processed)).squeeze(2)  # [B, enc_T, 1] -> [B, enc_T]

	        # [B, enc_T], [B, enc_T] -> [B, enc_T]
	        alignments = self._stepwise_monotonic_probability_fn(
	            score, previous_alignments, self.sigmoid_noise, self.use_hard_attention)
	        return alignments

	    def forward(self, query, memory, processed_memory, previous_alignments,
	                mask, attention_weights=None):
	        """Compute the attention context for one decoder step.

	        Args:
	            query: [B, query_dim] decoder state.
	            memory: [B, enc_T, enc_dim] encoder outputs.
	            processed_memory: [B, enc_T, attention_dim] projected memory.
	            previous_alignments: [B, enc_T] alignment of the previous step.
	            mask: optional boolean tensor broadcastable to [B, enc_T];
	                positions where it is True are set to ``score_mask_value``
	                (presumably padding -- confirm against the caller).
	            attention_weights: optional precomputed [B, enc_T] weights; when
	                given they are used as-is and no alignment is computed.

	        Returns:
	            (attention_context [B, enc_dim], attention_weights [B, enc_T]).
	        """
	        if attention_weights is None:
	            alignment = self.get_alignment_energies(
	                query, processed_memory, previous_alignments)

	            if mask is not None:
	                # Out-of-place fill keeps the operation visible to autograd.
	                alignment = alignment.masked_fill(mask, self.score_mask_value)  # [B, enc_T]

	            attention_weights = alignment  # normalise?
	        # [B, 1, enc_T] @ [B, enc_T, enc_dim] -> [B, 1, enc_dim]
	        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
	        attention_context = attention_context.squeeze(1)  # [B, 1, enc_dim] -> [B, enc_dim]

	        return attention_context, attention_weights  # [B, enc_dim], [B, enc_T]