Dash Text to Speech
@xhluca · Created May 21, 2020 19:16
# -*- coding: utf-8 -*-
"""
Speech Synthesis with Dash. This will only work with an NVIDIA GPU.
Requirements (put them in requirements.txt):
apex
dash
numpy
scipy
torch
"""
import base64
import time
import io
from apex import amp
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import numpy as np
from scipy.io.wavfile import write
import torch
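# The synthesis pipeline has two stages: Tacotron 2 turns text into a mel
# spectrogram, and WaveGlow vocodes that spectrogram into a raw waveform.
# Both pretrained checkpoints are fetched from NVIDIA's torchhub repo below.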
# Load the two models and prepare them for GPU inference
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
tacotron2 = tacotron2.to('cuda')
tacotron2 = amp.initialize(tacotron2, opt_level="O1")
tacotron2.eval()
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow = amp.initialize(waveglow, opt_level="O1")
waveglow.eval()
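# Note: amp.initialize(..., opt_level="O1") patches the models for
# mixed-precision (fp16/fp32) inference, which is part of why a CUDA-capable
# NVIDIA GPU is required here.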
# Dash app starts here
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div(children=[
    html.H1(children='Dash Text-to-Speech'),
    dcc.Textarea(
        id='textarea-input',
        # Default text shown when the app loads
        value='Type some text here, then press Generate to synthesize speech.',
        style={'width': '100%', 'height': '45vh'}
    ),
    # dcc.Loading displays a spinner over its children while the callback runs
    dcc.Loading([
        html.Button("Generate", id='button'),
        html.Audio(id='audio-out', controls=True)
    ])
])
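# The callback fires whenever the button's n_clicks changes; the textarea is
# read as State so that typing alone does not trigger synthesis.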
@app.callback(Output("audio-out", "src"),
              [Input("button", "n_clicks")],
              [State("textarea-input", "value")])
def generate_audio(n_clicks, text):
    # Skip the callback Dash fires on page load, before any click
    if not n_clicks:
        raise PreventUpdate
    if not text:
        text = "Sorry, there's nothing in the text input. Please write something."

    t0 = time.time()

    # Preprocessing: convert the text to a batch of one character-ID sequence
    sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)

    # Run the models: text -> mel spectrogram -> waveform
    with torch.no_grad():
        _, mel, _, _ = tacotron2.infer(sequence)
        audio = waveglow.infer(mel)
    audio_numpy = audio[0].data.cpu().numpy()
    rate = 22050  # sample rate (Hz) of the LJSpeech data the models were trained on

    t1 = time.time()

    # Write the waveform to an in-memory WAV file and embed it in a base64
    # data URI so html.Audio can play it without serving a separate file
    buffer = io.BytesIO()
    write(buffer, rate, audio_numpy)
    b64 = base64.b64encode(buffer.getvalue())
    sound = "data:audio/x-wav;base64," + b64.decode("ascii")

    t2 = time.time()
    print(f"Completed in {t2 - t0:.3f}s. Generation took {t1 - t0:.3f}s, "
          f"file creation took {t2 - t1:.3f}s")
    return sound


if __name__ == "__main__":
    app.run_server(debug=False)
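# To run (typical usage, assumed rather than stated in the gist): save this
# file as app.py, run `python app.py`, and open http://127.0.0.1:8050
# (Dash's default port) in a browser.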