
@musyoku
Created November 1, 2017 16:47
# coding: utf-8
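"""Real-time streaming speech recognition demo.

Captures microphone audio through ALSA, estimates a cepstral mean for
normalization, converts the signal into log-mel filterbank features with
delta and delta-delta channels, and greedily decodes the CTC output of a
Chainer model into Japanese kana. The `fft`, `config`, and `model` modules
are assumed to be local modules from the accompanying repository.
"""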
from __future__ import division
import time, chainer, argparse, sys
import fft, config
import numpy as np
from chainer import cuda
import alsaaudio, pyaudio
from multiprocessing import Process, Queue
try:
    from queue import Empty     # Python 3
except ImportError:
    from Queue import Empty     # Python 2
from model import load_model
def get_vocab():
    characters = [
        u"_",  # blank
        u"あ", u"い", u"う", u"え", u"お",
        u"か", u"き", u"く", u"け", u"こ",
        u"さ", u"し", u"す", u"せ", u"そ",
        u"た", u"ち", u"つ", u"て", u"と",
        u"な", u"に", u"ぬ", u"ね", u"の",
        u"は", u"ひ", u"ふ", u"へ", u"ほ",
        u"ま", u"み", u"む", u"め", u"も",
        u"や", u"ゆ", u"よ",
        u"ら", u"り", u"る", u"れ", u"ろ",
        u"わ", u"を", u"ん",
        u"が", u"ぎ", u"ぐ", u"げ", u"ご",
        u"ざ", u"じ", u"ず", u"ぜ", u"ぞ",
        u"だ", u"ぢ", u"づ", u"で", u"ど",
        u"ば", u"び", u"ぶ", u"べ", u"ぼ",
        u"ぱ", u"ぴ", u"ぷ", u"ぺ", u"ぽ",
        u"ぁ", u"ぃ", u"ぅ", u"ぇ", u"ぉ",
        u"ゃ", u"ゅ", u"ょ",
        u"っ",
        u"ー",
    ]
    vocab = {}
    for char in characters:
        vocab[char] = len(vocab)
    vocab_inv = {}
    for char, char_id in vocab.items():
        vocab_inv[char_id] = char
    id_blank = 0
    return vocab, vocab_inv, id_blank
def reading_audio_loop(queue, device_index):
    # NOTE: device_index is unused; capture always goes through the
    # PulseAudio ALSA plugin ("pulse").
    recorder = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK, device="pulse")
    recorder.setchannels(1)
    recorder.setrate(16000)
    recorder.setformat(alsaaudio.PCM_FORMAT_S16_LE)
    recorder.setperiodsize(1024)
    config = chainer.config
    fbank = fft.get_filterbanks(nfft=config.num_fft, nfilt=config.num_mel_filters, samplerate=config.sampling_rate)
    mean = np.load("mean.npy").astype(np.float32)
    std = np.load("std.npy").astype(np.float32)

    # Cepstral Mean Normalization: estimate the average log-spectrum
    # from the first `total_samples` chunks of microphone input.
    cepstral_mean = 0
    total_samples = 3000
    itr = 0
    signal = None
    while True:
        length, data = recorder.read()
        if length > 0:
            data = np.frombuffer(data, dtype=np.int16)
            signal = data if signal is None else np.concatenate((signal, data), axis=0)
            if len(signal) > 1024:
                specgram = fft.get_specgram(signal, config.sampling_rate, nfft=config.num_fft, winlen=config.frame_width, winstep=config.frame_shift, winfunc=config.window_func)
                specgram = np.log(np.abs(specgram))
                cepstral_mean += np.mean(specgram, axis=0) / len(data)
                sys.stdout.write("\rCollecting signals for CMN ... {}/{}".format(itr + 1, total_samples))
                sys.stdout.flush()
                signal = None
                itr += 1
                if itr >= total_samples:
                    break
    cepstral_mean /= total_samples
    print("Done.")

    # Main capture loop: maintain a sliding window of raw samples and
    # push normalized feature maps to the recognition process.
    signal = None
    signal_sequence = np.zeros((1024 * 2,), dtype=np.int16)
    while True:
        length, data = recorder.read()
        if length > 0:
            data = np.frombuffer(data, dtype=np.int16)
            # time.sleep(1024 / 16000)
            signal = data if signal is None else np.concatenate((signal, data), axis=0)
            if len(signal) > 1024:
                chunksize = len(signal)
                signal_sequence = np.roll(signal_sequence, -chunksize, axis=0)
                signal_sequence[-chunksize:] = signal
                specgram = fft.get_specgram(signal_sequence, config.sampling_rate, nfft=config.num_fft, winlen=config.frame_width, winstep=config.frame_shift, winfunc=config.window_func)
                # subtract the estimated cepstral mean in the log domain
                specgram = np.exp(np.log(np.abs(specgram)) - cepstral_mean)
                # specgram = fft.normalize_vocal_tract(specgram, ratio=1.5)
                logmel = fft.compute_logmel(specgram, config.sampling_rate, fbank=fbank, nfft=config.num_fft, nfilt=config.num_mel_filters)
                logmel, delta, delta_delta = fft.compute_deltas(logmel)
                logmel = logmel.T
                delta = delta.T
                delta_delta = delta_delta.T
                # stack log-mel, delta and delta-delta as three channels
                x_batch = np.zeros((3, logmel.shape[0], logmel.shape[1]), dtype=np.float32)
                x_batch[0] = logmel
                x_batch[1] = delta
                x_batch[2] = delta_delta
                # global mean/variance normalization with precomputed statistics
                x_batch = (x_batch - mean) / std
                queue.put(x_batch)
                signal = None
def main():
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu(args.gpu_device)
    vocab, vocab_inv, BLANK = get_vocab()

    # start the audio capture process; it feeds feature maps into `queue`
    queue = Queue()
    preloading_process = Process(target=reading_audio_loop, args=(queue, 0))
    preloading_process.start()

    config = chainer.config
    required_length = (5 - 1) * 5 + 1
    xp = model.xp
    # sliding window of input features: (batch, channel, mel bins, frames)
    feature_sequence = xp.zeros((1, 3, config.num_mel_filters, required_length * 4), dtype=np.float32)

    last_token = BLANK
    no_signal_count = 0
    timer_count = 0
    token_sequence = []
    prev_output_token = BLANK
    pred_sentence = ""
    while True:
        try:
            x_batch = queue.get(False)
            if args.gpu_device != -1:
                x_batch = cuda.to_gpu(x_batch.astype(np.float32))
            length = x_batch.shape[2]
            feature_sequence = xp.roll(feature_sequence, -length, axis=3)
            feature_sequence[0, ..., -length:] = x_batch
            y_batch = model(feature_sequence, split_into_variables=False)
            # greedy CTC decoding: argmax per frame, then collapse
            # repeated tokens and drop blanks
            y_batch = xp.argmax(y_batch.data, axis=2)[0, -length:]
            pred_sequence = [] if last_token == BLANK else [last_token]
            prev_token = last_token
            for token in y_batch:
                token = int(token)
                if token == BLANK:
                    prev_token = BLANK
                    continue
                if token == prev_token:
                    continue
                pred_sequence.append(token)
                prev_token = token
            last_token = prev_token
            if len(pred_sequence) > 0:
                if last_token != BLANK:
                    # hold back the last token; it may continue into the next window
                    last_token = pred_sequence[-1]
                    pred_sequence = pred_sequence[:-1]
                token_sequence += pred_sequence
        except Empty:
            # no new features yet; wait roughly one audio chunk (1024 samples @ 16 kHz)
            time.sleep(1024 / 16000)
            no_signal_count += 1
            continue
        if len(token_sequence) == 0:
            no_signal_count += 1
        else:
            no_signal_count = 0

        # periodically flush the decoded tokens as a partial hypothesis
        timer_count += 1
        if timer_count > 5:
            if len(token_sequence) > 0:
                pred_str = ""
                for token in token_sequence:
                    if token == prev_output_token:
                        continue
                    pred_str += vocab_inv[token]
                    prev_output_token = token
                if len(pred_str) > 0:
                    print(pred_str)
                    pred_sentence += pred_str
            timer_count = 0
            token_sequence = []

        # after a stretch of silence, emit the accumulated sentence
        if no_signal_count > 5:
            if len(pred_sentence) > 0:
                print("out:", pred_sentence)
                pred_sentence = ""
            no_signal_count = 0
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu-device", "-g", type=int, default=0)
    parser.add_argument("--model-dir", "-m", type=str, default="model")
    args = parser.parse_args()
    # To list the available audio input devices with PyAudio:
    # p = pyaudio.PyAudio()
    # for i in range(p.get_device_count()):
    #     dev = p.get_device_info_by_index(i)
    #     print(i, dev["name"])
    main()
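# Example invocation (hypothetical filename; assumes trained model parameters
# plus mean.npy and std.npy in the working directory):
#   python realtime_asr.py --gpu-device 0 --model-dir model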