@bahorn
Created February 25, 2020 18:50
Playing around with the demo_cli script from https://github.com/CorentinJ/Real-Time-Voice-Cloning to implement other ideas.
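A rough usage sketch, assuming the pretrained models sit at the default paths used by the arguments below and that an outputs/ directory exists for the auto-generated filename (the script name and arguments shown are placeholders):

"""
Example invocation (all paths and the text are placeholders):

    python clone_voice.py reference_voice.wav "Some text to synthesize" \
        --length 30 --window 5 --output outputs/result.wav
"""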
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import random
import string
import re
import math
def smooth(x, window_len=11, window='hamming'):
    """Smooth a 1-D signal by convolving it with a window of the given type."""
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', "
                         "'bartlett', 'blackman'")
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        w = getattr(np, window)(window_len)
    # mode='valid' shortens the output by window_len - 1 samples.
    y = np.convolve(w / w.sum(), x, mode='valid')
    return y
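
# Illustrative usage (hypothetical values): with window_len=5 the smoothed
# signal is len(x) - window_len + 1 samples long because of mode='valid'.
#   x = np.linspace(0, 1, 100) + 0.05 * np.random.randn(100)
#   y = smooth(x, window_len=5)   # y.shape == (96,)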
def make_text_buckets(text, ideal=30):
    # Special-character stripping is disabled for now; uncomment to enable.
    stripped = text  # re.sub(r"[^a-zA-Z0-9]+", ' ', text)
    # Split on spaces so we can bucket each word.
    split = stripped.split(' ')
    bucket_size = math.ceil((ideal / len(split)) * ideal)
    items = [split[i:i + bucket_size] for i in range(0, len(split), bucket_size)]
    res = []
    for i in items:
        res.append(' '.join(i))
    return res
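
# For example (hypothetical input): with ideal=30, a 60-word text gives
# bucket_size = ceil((30 / 60) * 30) = 15, i.e. four 15-word chunks, while a
# text of 30 words or fewer stays in a single chunk.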
def random_string(N):
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=N))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true",
                        help="If True, the memory used by the synthesizer will be freed after "
                             "each use. Adds a large overhead but allows saving some GPU memory "
                             "on lower-end GPUs.")
    parser.add_argument("--window", type=int, default=5)
    parser.add_argument("--length", type=int, default=30)
    parser.add_argument("--output")
    parser.add_argument("voice")
    parser.add_argument("text")
    args = parser.parse_args()

    output_fname = '{}/{}.wav'.format('outputs', random_string(16))
    if args.output:
        output_fname = args.output
    ## Print some environment information (for debugging purposes)
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        sys.exit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGB total memory.\n" %
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))
    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)
    try:
        # Get the reference audio filepath (mp3, wav, m4a, flac, ...) of the
        # voice to be cloned.
        in_fpath = Path(args.voice.replace("\"", "").replace("\'", ""))

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.

        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")
        # Split the input text into optimally sized buckets.
        texts = make_text_buckets(args.text, ideal=args.length)
        embeds = [embed for _ in texts]
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        # Vocode each spectrogram and trim leading/trailing silence from each chunk.
        generated_wav = []
        for spec in specs:
            curr_wav = vocoder.infer_waveform(spec)
            yt, index = librosa.effects.trim(curr_wav, top_db=100)
            generated_wav.append(yt)
        combined = np.concatenate(generated_wav)

        # Re-split on silence and pad half a second of silence after each interval.
        intervals = librosa.effects.split(combined, top_db=100)
        res = []
        for start, end in intervals:
            res.append(
                np.pad(
                    combined[start:end],
                    (0, int(synthesizer.sample_rate / 2)),
                    mode='constant'
                )
            )
        output = np.concatenate(res).astype(np.float32)

        # Finally, smooth the output.
        output = smooth(output, window_len=args.window)
        librosa.output.write_wav(
            output_fname,
            output,
            synthesizer.sample_rate
        )
        print("\nSaved output as %s\n\n" % output_fname)
    except Exception as e:
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")