Dash Text to Speech
@xhluca · Created May 21, 2020 19:16
# -*- coding: utf-8 -*-
"""
Speech Synthesis with Dash. This will only work with an NVIDIA GPU.
Requirements (put them in requirements.txt):
apex
dash
numpy
scipy
torch
"""
import base64
import time
import io
from apex import amp
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import numpy as np
from scipy.io.wavfile import write
import torch
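# The synthesis pipeline has two stages: Tacotron 2 turns text into a mel
# spectrogram, and WaveGlow vocodes that spectrogram into a raw waveform.
# Both pretrained checkpoints are fetched from NVIDIA's torchhub repo below.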
# Load the two models and prepare them for GPU inference
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
tacotron2 = tacotron2.to('cuda')
tacotron2 = amp.initialize(tacotron2, opt_level="O1")
tacotron2.eval()
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow = amp.initialize(waveglow, opt_level="O1")
waveglow.eval()
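# Note: amp.initialize(..., opt_level="O1") patches the models for
# mixed-precision (fp16/fp32) inference, which is part of why a CUDA-capable
# NVIDIA GPU is required here.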
# Dash app starts here
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div(children=[
    html.H1(children='Dash Text-to-Speech'),
    dcc.Textarea(
        id='textarea-input',
        # Default text shown when the app loads
        value='Type some text here, then press Generate to synthesize speech.',
        style={'width': '100%', 'height': '45vh'}
    ),
    # dcc.Loading displays a spinner over its children while the callback runs
    dcc.Loading([
        html.Button("Generate", id='button'),
        html.Audio(id='audio-out', controls=True)
    ])
])
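# The callback fires whenever the button's n_clicks changes; the textarea is
# read as State so that typing alone does not trigger synthesis.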
@app.callback(Output("audio-out", "src"),
              [Input("button", "n_clicks")],
              [State("textarea-input", "value")])
def generate_audio(n_clicks, text):
    # Skip the callback Dash fires on page load, before any click
    if not n_clicks:
        raise PreventUpdate
    if not text:
        text = "Sorry, there's nothing in the text input. Please write something."

    t0 = time.time()

    # Preprocessing: convert the text to a batch of one character-ID sequence
    sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)

    # Run the models: text -> mel spectrogram -> waveform
    with torch.no_grad():
        _, mel, _, _ = tacotron2.infer(sequence)
        audio = waveglow.infer(mel)
    audio_numpy = audio[0].data.cpu().numpy()
    rate = 22050  # sample rate (Hz) of the LJSpeech data the models were trained on

    t1 = time.time()

    # Write the waveform to an in-memory WAV file and embed it in a base64
    # data URI so html.Audio can play it without serving a separate file
    buffer = io.BytesIO()
    write(buffer, rate, audio_numpy)
    b64 = base64.b64encode(buffer.getvalue())
    sound = "data:audio/x-wav;base64," + b64.decode("ascii")

    t2 = time.time()
    print(f"Completed in {t2 - t0:.3f}s. Generation took {t1 - t0:.3f}s, "
          f"file creation took {t2 - t1:.3f}s")
    return sound


if __name__ == "__main__":
    app.run_server(debug=False)
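# To run (typical usage, assumed rather than stated in the gist): save this
# file as app.py, run `python app.py`, and open http://127.0.0.1:8050
# (Dash's default port) in a browser.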