# coding: utf-8
from __future__ import division
import time, chainer, argparse, sys
import fft, config
import numpy as np
from chainer import cuda
import alsaaudio, pyaudio
from multiprocessing import Process, Queue
from model import load_model
try:
    from queue import Empty  # Python 3
except ImportError:
    from Queue import Empty  # Python 2
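# Vocabulary: Japanese kana plus a blank symbol (id 0) reserved for CTC decoding.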
def get_vocab():
    characters = [
        u"_",  # blank
        u"あ",u"い",u"う",u"え",u"お",
        u"か",u"き",u"く",u"け",u"こ",
        u"さ",u"し",u"す",u"せ",u"そ",
        u"た",u"ち",u"つ",u"て",u"と",
        u"な",u"に",u"ぬ",u"ね",u"の",
        u"は",u"ひ",u"ふ",u"へ",u"ほ",
        u"ま",u"み",u"む",u"め",u"も",
        u"や",u"ゆ",u"よ",
        u"ら",u"り",u"る",u"れ",u"ろ",
        u"わ",u"を",u"ん",
        u"が",u"ぎ",u"ぐ",u"げ",u"ご",
        u"ざ",u"じ",u"ず",u"ぜ",u"ぞ",
        u"だ",u"ぢ",u"づ",u"で",u"ど",
        u"ば",u"び",u"ぶ",u"べ",u"ぼ",
        u"ぱ",u"ぴ",u"ぷ",u"ぺ",u"ぽ",
        u"ぁ",u"ぃ",u"ぅ",u"ぇ",u"ぉ",
        u"ゃ",u"ゅ",u"ょ",
        u"っ",
        u"ー",
    ]
    vocab = {}
    for char in characters:
        vocab[char] = len(vocab)
    vocab_inv = {}
    for char, char_id in vocab.items():
        vocab_inv[char_id] = char
    id_blank = 0
    return vocab, vocab_inv, id_blank
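# Producer process: captures 16 kHz mono audio via ALSA (through PulseAudio),
# estimates a cepstral mean over the first chunks for CMN, then converts each
# incoming chunk into normalized log-mel + delta features and pushes them
# onto `queue` for the recognizer process.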
def reading_audio_loop(queue, device_index):
    # NOTE: device_index is currently unused; capture goes through the
    # PulseAudio default device.
    recorder = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK, device="pulse")
    recorder.setchannels(1)
    recorder.setrate(16000)
    recorder.setformat(alsaaudio.PCM_FORMAT_S16_LE)
    recorder.setperiodsize(1024)
    config = chainer.config
    fbank = fft.get_filterbanks(nfft=config.num_fft, nfilt=config.num_mel_filters, samplerate=config.sampling_rate)
    # Normalization statistics computed on the training data
    mean = np.load("mean.npy").astype(np.float32)
    std = np.load("std.npy").astype(np.float32)
    # Cepstral Mean Normalization: accumulate the average log-spectrum of the
    # current environment over the first `total_samples` captured chunks.
    cepstral_mean = 0
    total_samples = 3000
    specgram_sequence = None
    signal_sequence = None
    itr = 0
    signal = None
    while True:
        length, data = recorder.read()
        if length > 0:
            data = np.frombuffer(data, dtype=np.int16)
            signal = data if signal is None else np.concatenate((signal, data), axis=0)
            if len(signal) > 1024:
                specgram = fft.get_specgram(signal, config.sampling_rate, nfft=config.num_fft, winlen=config.frame_width, winstep=config.frame_shift, winfunc=config.window_func)
                specgram = np.log(np.abs(specgram))
                cepstral_mean += np.mean(specgram, axis=0) / len(data)
                sys.stdout.write("\rCollecting signals for CMN ... {}/{}".format(itr + 1, total_samples))
                sys.stdout.flush()
                specgram_sequence = specgram if specgram_sequence is None else np.concatenate((specgram_sequence, specgram), axis=0)
                signal_sequence = signal if signal_sequence is None else np.concatenate((signal_sequence, signal), axis=0)
                signal = None
                itr += 1
                if itr >= total_samples:
                    break
    cepstral_mean /= total_samples
    print("Done.")
    signal = None
    # Rolling buffer holding the two most recent 1024-sample chunks of raw audio
    signal_sequence = np.zeros((1024 * 2,), dtype=np.int16)
    while True:
        length, data = recorder.read()
        if length > 0:
            data = np.frombuffer(data, dtype=np.int16)
            # time.sleep(1024 / 16000)
            signal = data if signal is None else np.concatenate((signal, data), axis=0)
            if len(signal) > 1024:
                # Shift the buffer left and append the new chunk
                chunksize = len(signal)
                signal_sequence = np.roll(signal_sequence, -chunksize, axis=0)
                signal_sequence[-chunksize:] = signal
                specgram = fft.get_specgram(signal_sequence, config.sampling_rate, nfft=config.num_fft, winlen=config.frame_width, winstep=config.frame_shift, winfunc=config.window_func)
                # Subtract the cepstral mean in the log domain
                if cepstral_mean is not None:
                    specgram = np.exp(np.log(np.abs(specgram)) - cepstral_mean)
                # specgram = fft.normalize_vocal_tract(specgram, ratio=1.5)
                logmel = fft.compute_logmel(specgram, config.sampling_rate, fbank=fbank, nfft=config.num_fft, nfilt=config.num_mel_filters)
                logmel, delta, delta_delta = fft.compute_deltas(logmel)
                # Stack log-mel, delta and delta-delta as three input channels
                logmel = logmel.T
                delta = delta.T
                delta_delta = delta_delta.T
                x_batch = np.zeros((3, logmel.shape[0], logmel.shape[1]), dtype=np.float32)
                x_batch[0] = logmel
                x_batch[1] = delta
                x_batch[2] = delta_delta
                x_batch = (x_batch - mean) / std
                queue.put(x_batch)
                signal = None
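# Consumer: slides incoming features into a fixed-length window, runs the
# model on it, and greedily decodes the CTC output (drop blanks, collapse
# repeated labels) for the newly added frames.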
def main():
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu(args.gpu_device)
    vocab, vocab_inv, BLANK = get_vocab()
    queue = Queue()
    preloading_process = Process(target=reading_audio_loop, args=[queue, 0])
    preloading_process.start()
    config = chainer.config
    # Width of the feature window fed to the model; (5 - 1) * 5 + 1 = 21
    # presumably matches the model's receptive field in output frames.
    required_length = (5 - 1) * 5 + 1
    xp = model.xp
    feature_sequence = xp.zeros((1, 3, config.num_mel_filters, required_length * 4), dtype=np.float32)
    last_token = BLANK
    no_signal_count = 0
    timer_count = 0
    token_sequence = []
    prev_output_token = BLANK
    pred_sentence = ""
    while True:
        try:
            x_batch = queue.get(False)
            if args.gpu_device != -1:
                x_batch = cuda.to_gpu(x_batch.astype(np.float32))
            length = x_batch.shape[2]
            # Slide the new frames into the feature window
            feature_sequence = xp.roll(feature_sequence, -length, axis=3)
            feature_sequence[0, ..., -length:] = x_batch
            y_batch = model(feature_sequence, split_into_variables=False)
            y_batch = xp.argmax(y_batch.data, axis=2)[0, -length:]
            # Greedy CTC decoding: drop blanks and collapse repeated labels.
            # `last_token` carries the collapse state across successive windows.
            pred_sequence = [] if last_token == BLANK else [last_token]
            prev_token = last_token
            for token in y_batch:
                token = int(token)
                if token == BLANK:
                    prev_token = BLANK
                    continue
                if token == prev_token:
                    continue
                pred_sequence.append(token)
                prev_token = token
            last_token = prev_token
            if len(pred_sequence) > 0:
                if last_token != BLANK:
                    # Hold back the trailing token; it may still continue
                    # into the next window.
                    last_token = pred_sequence[-1]
                    pred_sequence = pred_sequence[:-1]
                token_sequence += pred_sequence
        except Empty:
            # No new features yet; wait for one chunk's worth of audio
            time.sleep(1024 / 16000)
            no_signal_count += 1
            continue
        if len(token_sequence) == 0:
            no_signal_count += 1
        else:
            no_signal_count = 0
        timer_count += 1
        # Flush the decoded tokens every few iterations
        if timer_count > 5:
            if len(token_sequence) > 0:
                pred_str = ""
                for token in token_sequence:
                    if token == prev_output_token:
                        continue
                    pred_str += vocab_inv[token]
                    prev_output_token = token
                if len(pred_str) > 0:
                    print(pred_str)
                    pred_sentence += pred_str
            timer_count = 0
            token_sequence = []
        # After a stretch of silence, emit the accumulated sentence
        if no_signal_count > 5:
            if len(pred_sentence) > 0:
                print("out:", pred_sentence)
                pred_sentence = ""
            no_signal_count = 0
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu-device", "-g", type=int, default=0)
    parser.add_argument("--model-dir", "-m", type=str, default="model")
    args = parser.parse_args()
    # p = pyaudio.PyAudio()
    # count = p.get_device_count()
    # devices = []
    # for i in range(count):
    #     devices.append(p.get_device_info_by_index(i))
    # for i, dev in enumerate(devices):
    #     print(i, dev["name"])
    main()
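# Usage (hypothetical invocation; a trained model directory and the
# normalization statistics mean.npy / std.npy must exist beforehand):
#   python run.py --gpu-device 0 --model-dir model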