Skip to content

Instantly share code, notes, and snippets.

@theeluwin
Created December 6, 2017 04:52
Show Gist options
  • Save theeluwin/54d9681cf21c0dd09f8e7a83650649ba to your computer and use it in GitHub Desktop.
Save theeluwin/54d9681cf21c0dd09f8e7a83650649ba to your computer and use it in GitHub Desktop.
ASR by CNN-Highway-RNN CTC
from model import *
class HighwaySpeech(DeepSpeech):
    """Speech model: conv front-end, gated mix of two parallel RNNs, classifier.

    Pipeline implemented by ``forward``:

    1. ``conv``  - two Conv2d + BatchNorm2d + Hardtanh(0, 20) stages.
    2. ``rnn0``  - first recurrent layer (created with ``batch_norm=False``).
    3. ``rnn1`` and ``rnn2`` - both consume the output of ``rnn0``.
    4. ``gate``  - linear layer on the ``rnn0`` output followed by a sigmoid;
       the two branches are blended as ``(1 - g) * rnn1 + g * rnn2``.
    5. ``rnn3``  - recurrent layer applied to the blended features.
    6. ``fc``    - BatchNorm1d + bias-free Linear to ``len(labels)`` classes.
    7. ``inference_softmax`` - ``InferenceBatchSoftmax`` from ``model``.

    ``DeepSpeech``, ``BatchRNN``, ``SequenceWise`` and
    ``InferenceBatchSoftmax`` come from ``model``; their exact behaviour is
    not visible here.
    """

    def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=1024, nb_layers=3, audio_conf=None, bidirectional=True):
        """Build the network.

        Args:
            rnn_type: Accepted for signature compatibility; overridden with
                ``nn.LSTM`` below.
            labels: Sequence of output symbols; its length is the number of
                output classes.
            rnn_hidden_size: Accepted for signature compatibility; overridden
                with ``1024`` below.
            nb_layers: Accepted for signature compatibility; overridden with
                ``3`` below and only stored in ``self._hidden_layers``.
            audio_conf: Optional dict; ``sample_rate`` (default 16000) and
                ``window_size`` (default 0.02) are read from it to size the
                first recurrent layer.
            bidirectional: Accepted for signature compatibility; every
                ``BatchRNN`` is created with ``bidirectional=True`` and
                ``self._bidirectional`` is always ``True``.
        """
        # NOTE(review): the parent constructor is called without arguments, so
        # it presumably builds its own default layers first; the attributes
        # assigned below replace the ones with the same names - confirm.
        super(HighwaySpeech, self).__init__()

        # fixed-values: this architecture pins these hyper-parameters, so the
        # corresponding constructor arguments are intentionally ignored.
        rnn_type = nn.LSTM
        nb_layers = 3
        rnn_hidden_size = 1024

        self._rnn_type = rnn_type
        self._labels = labels
        self._hidden_size = rnn_hidden_size
        self._hidden_layers = nb_layers
        self._audio_conf = audio_conf or {}
        self._bidirectional = True

        sample_rate = self._audio_conf.get('sample_rate', 16000)
        window_size = self._audio_conf.get('window_size', 0.02)
        num_classes = len(self._labels)

        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(0, 10)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
        )

        # Size of the first spatial axis of the input, then of that axis after
        # each convolution. Both convolutions use stride 2 and no padding on
        # this axis, so the output length is floor((n - kernel) / 2) + 1.
        # presumably this axis holds spectrogram frequency bins - TODO confirm
        rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
        rnn_input_size = (rnn_input_size - 41) // 2 + 1
        rnn_input_size = (rnn_input_size - 21) // 2 + 1
        # forward() flattens the 32 conv channels into the feature axis.
        rnn_input_size *= 32

        self.rnn0 = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, bidirectional=True, batch_norm=False)
        self.rnn1 = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, bidirectional=True)
        self.rnn2 = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, bidirectional=True)
        self.rnn3 = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, bidirectional=True)

        # Produces the pre-sigmoid mixing gate from the rnn0 output.
        self.gate = nn.Sequential(SequenceWise(nn.Sequential(
            nn.Linear(rnn_hidden_size, rnn_hidden_size),
        )))

        self.fc = nn.Sequential(SequenceWise(nn.Sequential(
            nn.BatchNorm1d(rnn_hidden_size),
            nn.Linear(rnn_hidden_size, num_classes, bias=False),
        )))

        self.inference_softmax = InferenceBatchSoftmax()

    def forward(self, x):
        """Run the model on a batch.

        Args:
            x: 4-D input for ``self.conv``, whose first layer expects one
                input channel.
                # assumes layout (batch, 1, freq, time) - TODO confirm

        Returns:
            The output of ``self.inference_softmax`` applied to the classifier
            output after its first two axes have been swapped.
        """
        out = self.conv(x)

        # Fold channels and the first spatial axis into one feature axis, then
        # reorder (N, F, T) -> (N, T, F) -> (T, N, F) for the recurrent layers.
        sizes = out.size()
        out = out.view(-1, sizes[1] * sizes[2], sizes[3])
        out = out.transpose(1, 2).transpose(0, 1).contiguous()

        out0 = self.rnn0(out)

        # Two parallel branches, both fed by the rnn0 output.
        out1 = self.rnn1(out0)
        out2 = self.rnn2(out0)

        # Element-wise gate in (0, 1). The tensor method is used because
        # F.sigmoid is deprecated.
        gate = self.gate(out0).sigmoid()
        out = (1 - gate) * out1 + gate * out2

        out = self.rnn3(out)
        out = self.fc(out)

        # Swap the first two axes back before the softmax.
        out = out.transpose(0, 1)
        out = self.inference_softmax(out)
        return out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment