Skip to content

Instantly share code, notes, and snippets.

@SolomidHero
Last active May 18, 2021 21:53
Show Gist options
  • Save SolomidHero/a8669f107c788dbbce5d7899e2b1cb80 to your computer and use it in GitHub Desktop.
Save SolomidHero/a8669f107c788dbbce5d7899e2b1cb80 to your computer and use it in GitHub Desktop.
VC speaker embeddings (by AdaIN-VC speaker encoder, by Resemblyzer)
# speaker embeddings by AdaIN-VC speaker encoder
# ref: https://github.com/cyhuang-tw/AdaIN-VC
#
# 1. clone repo above
# $ git clone https://github.com/cyhuang-tw/AdaIN-VC
# $ cd AdaIN-VC
# 2. download pretrained checkpoint (by repo authors) and unzip it
# $ gdown --id 1d_llv1qaCpPjioReh4AT8K_-qqG2zoIx
# $ unzip -qo /content/vctk_model.zip
# 3. change the paths in this script and run it (with `python3 to_adain_emb.py`)
#
# Afterwards $result_folder will contain files named '{file_prefix}{filename}.npy', each holding a 1-D array with the embedding
# Root folder of the dataset; it is walked recursively for '.wav' files.
dataset_path = '~/data/LibriTTS/dev-clean'
# Prefix put in front of every output file name.
file_prefix = 'LibriTTS-adain-'
# Folder the '.npy' embedding files are written to (created if missing).
result_folder = 'adain'
import torch
# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import yaml
import os
from model import AE
from preprocess import get_spectrogram
import json
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
# Model configuration (YAML); this is the object handed to AE(...).
with open('config.yaml') as model_cfg_file:
    # NOTE(review): FullLoader can build more than plain data types; that is
    # acceptable for the repository's own config file, but do not point this
    # at an untrusted YAML file.
    config = yaml.load(model_cfg_file, Loader=yaml.FullLoader)

# Audio/vocoder configuration (JSON); this is passed to get_spectrogram(...).
with open('vocoder/config.json') as vocoder_cfg_file:
    data_cfg = json.loads(vocoder_cfg_file.read())
# Load the autoencoder in eval mode and keep only its speaker encoder.
model = AE(config).eval()
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# saved from a GPU can still be loaded on a CPU-only machine instead of
# failing while it is being deserialized.
model.load_state_dict(torch.load('model.ckpt', map_location=device))
spk_encoder = model.speaker_encoder.to(device)
# Walk every speaker folder of the dataset and save one embedding per wav file.
os.makedirs(result_folder, exist_ok=True)
# expanduser must run first: abspath alone treats '~' as a literal directory
# name and would resolve '~/data/...' to '<cwd>/~/data/...'.
dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
for speaker_name in tqdm(os.listdir(dataset_path)):
    for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
        for fn in filenames:
            if not fn.endswith('.wav'):
                continue
            # get_spectrogram returns a (name, mel) pair; the output file is
            # named after the returned name, not after the loop variable.
            wav_name, mel = get_spectrogram(os.path.join(root, fn), data_cfg)
            with torch.no_grad():
                # Transpose the 2-D mel array and add a leading batch axis of
                # size 1 before feeding the encoder.
                # NOTE(review): presumably mel is (frames, n_mels) — confirm.
                mel_batch = torch.from_numpy(mel).T.unsqueeze(0).to(device)
                emb = spk_encoder(mel_batch).squeeze(0).cpu()
            res_fn = f'{file_prefix}{Path(wav_name).stem}.npy'
            # NOTE: if file stems are not unique across speakers, include the
            # speaker in the name instead:
            #   f'{file_prefix}{speaker_name}-{Path(wav_name).stem}.npy'
            np.save(os.path.join(result_folder, res_fn), emb.numpy())
# speaker embeddings by Resemblyzer
# ref: https://github.com/resemble-ai/Resemblyzer
#
# 1. change the paths in this script and run it (with `python3 to_resemblyzer_emb.py`)
#
# Root folder of the dataset; WavDataset walks it recursively for audio files.
dataset_path = '~/data/LibriTTS/dev-clean'
# Prefix put in front of every output file name.
file_prefix = 'LibriTTS-xvec-'
# Folder the '.npy' embedding files are written to (created if missing).
result_folder = 'xvec'
import torch
# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from resemblyzer import VoiceEncoder, preprocess_wav
import os
import numpy as np
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from tqdm.auto import tqdm
# load encoder model
# VoiceEncoder is constructed with its default arguments and moved to `device`.
# NOTE(review): presumably this loads Resemblyzer's bundled pretrained
# weights — confirm against the library.
encoder = VoiceEncoder().to(device)
# process datafolder for wavs
class WavDataset(Dataset):
    """Dataset of audio files found below a dataset folder.

    Every first-level entry of ``dataset_path`` is walked recursively and each
    file whose name ends with ``ext`` is recorded together with the path its
    embedding should be saved to.

    Args:
        dataset_path: Root folder of the dataset; a leading ``~`` is expanded.
        ext: File-name suffix to collect, of any length (e.g. ``'.wav'`` or
            ``'.flac'``).
        out_dir: Folder used to build the output paths. Defaults to the
            script-level ``result_folder``.
        prefix: Prefix of the output file names. Defaults to the script-level
            ``file_prefix``.

    Items are ``(wav, out_path)`` pairs, where ``wav`` is whatever
    ``preprocess_wav`` returns for the audio file.
    """

    def __init__(self, dataset_path, ext='.wav', out_dir=None, prefix=None):
        # Fall back to the script's global settings so that the original
        # call WavDataset(dataset_path) behaves as before.
        if out_dir is None:
            out_dir = result_folder
        if prefix is None:
            prefix = file_prefix

        # expanduser must run first: abspath alone keeps '~' as a literal name.
        dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
        self.filepathes = []
        for speaker_name in os.listdir(dataset_path):
            for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
                for fn in filenames:
                    # endswith handles suffixes of any length; comparing the
                    # last four characters only worked for 4-char extensions.
                    if not fn.endswith(ext):
                        continue
                    wav_path = Path(os.path.join(root, fn))
                    out_path = os.path.join(out_dir, f'{prefix}{Path(fn).stem}.npy')
                    self.filepathes.append((wav_path, out_path))

    def __len__(self):
        """Return the number of collected audio files."""
        return len(self.filepathes)

    def __getitem__(self, idx):
        """Load and preprocess the ``idx``-th file; return ``(wav, out_path)``."""
        wav_path, out_path = self.filepathes[idx]
        wav = preprocess_wav(Path(wav_path))
        return wav, out_path
def _take_single(batch):
    """Unwrap the one-element list the DataLoader builds for batch_size=1."""
    return batch[0]


# With the "spawn" start method, DataLoader workers re-import this script;
# the guard keeps them from re-running the extraction loop themselves.
if __name__ == '__main__':
    dataset = WavDataset(dataset_path)
    # collate_fn is a named module-level function rather than a lambda,
    # because a lambda cannot be pickled for spawned worker processes.
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=_take_single,
    )
    # Compute one embedding per utterance and save it to its output path.
    os.makedirs(result_folder, exist_ok=True)
    for wav, out_path in tqdm(loader):
        emb = encoder.embed_utterance(wav)
        np.save(out_path, emb)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment