Created February 25, 2020 18:50
Playing around with the demo_cli script from https://github.com/CorentinJ/Real-Time-Voice-Cloning to implement other ideas.
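A hypothetical invocation (the script filename is an assumption; the flags and positional arguments are the ones the script defines below). When --output is omitted, the script writes to outputs/<random>.wav, so an outputs/ directory needs to exist:

    python clone_buckets.py reference.wav "The text to synthesize in the cloned voice" --length 30 --window 5 --output result.wav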
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import random
import string
import re
import math
def smooth(x, window_len=11, window='hamming'):
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        # look the window function up on the numpy module instead of using eval()
        w = getattr(np, window)(window_len)
    y = np.convolve(w / w.sum(), x, mode='valid')
    return y
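# Note: np.convolve with mode='valid' returns a signal that is window_len - 1
# samples shorter than the input, e.g.
#   smooth(np.zeros(100), window_len=11).shape == (90,)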
def make_text_buckets(text, ideal=30):
    # special-character stripping is currently disabled; keep the text as-is
    stripped = text  # re.sub(r"[^a-zA-Z0-9]+", ' ', text)
    # split on spaces so we can bucket each word
    split = stripped.split(' ')
    bucket_size = math.ceil((ideal / len(split)) * ideal)
    items = [split[i:i + bucket_size] for i in range(0, len(split), bucket_size)]
    res = []
    for words in items:
        res.append(' '.join(words))
    return res
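# Worked example of the bucketing (illustrative input, not from the original gist):
#   make_text_buckets("the quick brown fox jumps over", ideal=3)
# gives bucket_size = ceil((3 / 6) * 3) = 2, so the six words are grouped as
#   ['the quick', 'brown fox', 'jumps over']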
def random_string(N):
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=N))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true", help=\
        "If True, the memory used by the synthesizer will be freed after each use. Adds a large "
        "overhead but saves some GPU memory on lower-end GPUs.")
    parser.add_argument("--window", type=int, default=5,
                        help="Smoothing window length applied to the final waveform")
    parser.add_argument("--length", type=int, default=30,
                        help="Ideal number of words per synthesized text bucket")
    parser.add_argument("--output", help="Path of the output wav file")
    parser.add_argument("voice", help="Audio file of the voice to clone")
    parser.add_argument("text", help="Text to synthesize")
    args = parser.parse_args()

    output_fname = '{}/{}.wav'.format('outputs', random_string(16))
    if args.output:
        output_fname = args.output
    ## Print some environment information (for debugging purposes)
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))
    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)
    try:
        # Get the reference audio filepath. The path now comes from the command line,
        # so this prompt string from the original demo_cli is kept but unused.
        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                  "wav, m4a, flac, ...):\n"
        in_fpath = Path(args.voice.replace("\"", "").replace("\'", ""))

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.

        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # Split the input text into optimally sized buckets and reuse the same speaker
        # embedding for each bucket.
        texts = make_text_buckets(args.text, ideal=args.length)
        embeds = [embed for _ in texts]
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        # Vocode each spectrogram bucket and trim leading/trailing silence from it
        generated_wav = []
        for spec in specs:
            curr_wav = vocoder.infer_waveform(spec)
            yt, index = librosa.effects.trim(curr_wav, top_db=100)
            generated_wav.append(yt)
        combined = np.concatenate(generated_wav)

        # Find the non-silent intervals and pad each with half a second of silence
        intervals = librosa.effects.split(combined, top_db=100)
        res = []
        for start, end in intervals:
            res.append(
                np.pad(
                    combined[start:end],
                    (0, int(synthesizer.sample_rate / 2)),
                    mode='constant'
                )
            )
        output = np.concatenate(res).astype(np.float32)

        # finally smooth the output
        output = smooth(output, window_len=args.window)

        librosa.output.write_wav(
            output_fname,
            output,
            synthesizer.sample_rate
        )
        print("\nSaved output as %s\n\n" % output_fname)

    except Exception as e:
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")