Working example of Mozilla TTS tacotron2+wavernn
#following instructions from https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb
#%load_ext autoreload
#%autoreload 2
import os
import sys
import io
import time
import numpy as np
#pip3 install --user numpy
from collections import OrderedDict
#from matplotlib import pylab as plt
import torch
#To install with CUDA 9.2. This worked for me:
#https://developer.nvidia.com/cuda-92-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exenetwork
#pip3 install --user --no-cache-dir torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
#For trying CUDA 10.0. This didn't work for me:
#https://developer.nvidia.com/cuda-10.0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal
#pip3 install --no-cache-dir --user torch==1.2.0 torchvision==0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html
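
# TTS_PATH should point at a local clone of the Mozilla TTS repo, with the
# WaveRNN repo checked out inside it as 'WaveRNN'; both directories are added
# to sys.path below so their modules can be imported. Adjust these Windows
# paths for your own machine.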
TTS_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\TTS')
WAVERNN_PATH = os.path.join(TTS_PATH, 'WaveRNN')

#%pylab inline
#rcParams["figure.figsize"] = (16,5)

# add libraries into environment
#import importlib
#importlib.import_module('TTS')
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally
sys.path.append(WAVERNN_PATH)  # set this if WaveRNN is not installed globally
import librosa
import librosa.display

from models.tacotron import Tacotron
from layers import *
from utils.data import *
from utils.audio import AudioProcessor
from utils.generic_utils import load_config, setup_model
from utils.text import text_to_sequence, cleaners
from utils.synthesis import synthesis
#from utils.visual import visualize
#import IPython
#from IPython.display import Audio
#pip3 install --user ipython
import re
#os.environ['CUDA_VISIBLE_DEVICES']='1'
#os.environ['OMP_NUM_THREADS']='1'

iscuda = torch.cuda.is_available()
print('torch.cuda.is_available()=' + str(iscuda))

runcounter = 0
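
# tts() synthesizes one sentence: Tacotron produces the post-net spectrogram,
# which is either inverted with Griffin-Lim (use_gl=True) or passed to the
# WaveRNN vocoder, and the resulting waveform is written to OUT_FOLDER.
# runcounter keeps successive output file names unique.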
def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):
    global runcounter
    t_1 = time.time()
    submatch = re.sub(r'\s+', ' ', text)
    file_namematch = re.search(r'([^\s]+\s?\d+)', submatch)
    if file_namematch:
        file_name = file_namematch.group(0) + '_' + str(runcounter) + '.wav'
    else:
        file_name = 'tempout_' + str(runcounter) + '.wav'
    runcounter += 1
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)
    print(" > Run-time: {}".format(time.time() - t_1))
    #if figures:
    #    visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
    #IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    os.makedirs(OUT_FOLDER, exist_ok=True)
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform

# Set constants
ROOT_PATH = TTS_PATH
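# MODEL_PATH / CONFIG_PATH point at the Tacotron2 TTS checkpoint and its config,
# VOCODER_MODEL_PATH / VOCODER_CONFIG_PATH at the WaveRNN vocoder checkpoint and
# its config (pretrained LJSpeech models here; adjust the paths to wherever you
# extracted your downloads).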
MODEL_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\wavernn_mold\ljspeech_8a47b46', 'checkpoint_261000.pth.tar')
CONFIG_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\wavernn_mold\ljspeech_8a47b46', 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'AudioSamples/benchmark_samples/')
CONFIG = load_config(CONFIG_PATH)
VOCODER_MODEL_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\wavernn_mold\wavernn_mold_8a1c152', 'checkpoint_433000.pth.tar')
VOCODER_CONFIG_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\wavernn_mold\wavernn_mold_8a1c152', 'config.json')
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
use_cuda = True

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
# CONFIG.stopnet = True

# Set the vocoder
use_gl = False          # use Griffin-Lim instead of WaveRNN if True
batched_wavernn = True  # use batched WaveRNN inference if True
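# With use_gl=True the spectrogram is inverted with Griffin-Lim (fast, lower
# quality) and the WaveRNN section below is skipped. Batched WaveRNN inference
# splits the mel spectrogram into overlapping segments (the target/overlap
# arguments passed to wavernn.generate() in tts() above) so generation runs
# much faster at a small cost in quality.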

# LOAD TTS MODEL
from utils.text.symbols import symbols, phonemes

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])
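# cp['step'] is the training step the checkpoint was saved at (261000 for the
# checkpoint_261000.pth.tar file used here).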

# LOAD WAVERNN
if not use_gl:
    from WaveRNN.models.wavernn import Model
    bits = 10
    wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode="mold",
        pad=2,
        upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
        feat_dims=VOCODER_CONFIG.audio["num_mels"],
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=ap.hop_length,
        sample_rate=ap.sample_rate,
    ).cuda()
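    # mode="mold" selects the mixture-of-logistics output head, matching the
    # "wavernn_mold" checkpoint loaded below.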
    check = torch.load(VOCODER_MODEL_PATH)
    wavernn.load_state_dict(check['model'])
    if use_cuda:
        wavernn.cuda()
    wavernn.eval()
    print(check['step'])

illegalchars_exclusive = re.compile(r'[^\w\d\.\,\;\!\?\s]')
repetition = re.compile(r'\s{2,}')

def custom_text_fix(sentence):
    global illegalchars_exclusive
    global repetition
    newsentence = illegalchars_exclusive.sub(' ', sentence)
    newsentence = repetition.sub(' ', newsentence)
    return newsentence
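
# Illustrative example (not from the original gist): characters outside
# [\w\d.,;!?\s] are replaced with spaces and runs of whitespace are collapsed,
# so the curly quotes in the sentence below are stripped before synthesis:
#   custom_text_fix('He said “yes!”')  ->  'He said yes! '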

model.eval()
model.decoder.max_decoder_steps = 2000
speaker_id = 0
sentences = ["Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go."]

for sentence in sentences:
    sentence = custom_text_fix(sentence)
    sentence = cleaners.english_cleaners(sentence)
    align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)
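
# To synthesize more text, add more entries to the `sentences` list above; each
# sentence is cleaned, synthesized, and written as a .wav file into OUT_FOLDER
# (AudioSamples/benchmark_samples/ under the TTS repo). Illustrative example:
#   sentences = [
#       "Bill got in the habit of asking himself ...",
#       "The quick brown fox jumps over the lazy dog.",
#   ]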