Last active
May 18, 2021 21:53
-
-
Save SolomidHero/a8669f107c788dbbce5d7899e2b1cb80 to your computer and use it in GitHub Desktop.
VC speaker embeddings (by AdaIN-VC speaker encoder, by Resemblyzer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# speaker embeddings by AdaIN-VC speaker encoder | |
# ref: https://github.com/cyhuang-tw/AdaIN-VC | |
# | |
# 1. clone repo above | |
# $ git clone https://github.com/cyhuang-tw/AdaIN-VC | |
# $ cd AdaIN-VC | |
# 2. download pretrained checkpoint (by repo authors) and unzip it | |
# $ gdown --id 1d_llv1qaCpPjioReh4AT8K_-qqG2zoIx | |
# $ unzip -qo /content/vctk_model.zip | |
# 3. change the paths in this script and run it (with `python3 to_adain_emb.py`)
# | |
# After the run, $result_folder contains files named '{file_prefix}{filename}.npy', each holding a 1-D array with one embedding
import json
import os
from pathlib import Path

import numpy as np
import torch
import yaml
from tqdm.auto import tqdm

# Modules from the AdaIN-VC repository (the script is run from inside it).
from model import AE
from preprocess import get_spectrogram

# --- user settings ---------------------------------------------------------
# Root of the dataset; every entry directly below it is walked for .wav files.
dataset_path = '~/data/LibriTTS/dev-clean'
# Prefix put in front of every saved embedding file name.
file_prefix = 'LibriTTS-adain-'
# Folder the .npy embeddings are written to.
result_folder = 'adain'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Read the model hyper-parameters and the feature-extraction settings.
# NOTE(review): FullLoader is kept to preserve behaviour; yaml.safe_load would be
# preferable if config.yaml holds only plain mappings/lists/scalars -- confirm.
with open('config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
with open('vocoder/config.json') as f:
    data_cfg = json.load(f)

# load model
model = AE(config).eval()
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be restored on a CPU-only host.
model.load_state_dict(torch.load('model.ckpt', map_location=device))
spk_encoder = model.speaker_encoder.to(device)

# process datafolder
os.makedirs(result_folder, exist_ok=True)
# abspath() alone leaves '~' untouched (it would become '<cwd>/~/...'),
# so the home directory has to be expanded first.
dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
for speaker_name in tqdm(os.listdir(dataset_path)):
    for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
        for fn in filenames:
            if not fn.endswith('.wav'):
                continue
            # get_spectrogram returns a pair; its first element replaces `fn`
            # and is what the output file name is derived from.
            fn, mel = get_spectrogram(os.path.join(root, fn), data_cfg)
            with torch.no_grad():
                batch = torch.from_numpy(mel).T.unsqueeze(0).to(device)
                emb = spk_encoder(batch).squeeze(0).cpu()
            res_fn = f'{file_prefix}{Path(fn).stem}.npy'
            # Alternative naming that also includes the speaker folder:
            # res_fn = f'{file_prefix}{speaker_name}-{Path(fn).stem}.npy'
            np.save(os.path.join(result_folder, res_fn), emb.numpy())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# speaker embeddings by Resemblyzer | |
# ref: https://github.com/resemble-ai/Resemblyzer | |
# | |
# 1. change the paths in this script and run it (with `python3 to_resemblyzer_emb.py`)
# | |
import os
from pathlib import Path

import numpy as np
import torch
import torch.multiprocessing as mp
from resemblyzer import VoiceEncoder, preprocess_wav
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# --- user settings ---------------------------------------------------------
# Root of the dataset that is scanned for audio files.
dataset_path = '~/data/LibriTTS/dev-clean'
# Prefix put in front of every saved embedding file name.
file_prefix = 'LibriTTS-xvec-'
# Folder the .npy embeddings are written to.
result_folder = 'xvec'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load encoder model
encoder = VoiceEncoder().to(device)
# process datafolder for wavs | |
class WavDataset(Dataset):
    """Dataset of audio files found below a dataset folder.

    Each item is ``(wav, out_path)``: the result of resemblyzer's
    ``preprocess_wav`` for one file, and the path its embedding should be
    saved to.

    Args:
        dataset_path: Root folder to scan; ``~`` is expanded. Every entry
            directly below it is walked recursively.
        ext: File-name suffix to keep (any length, e.g. ``'.wav'``, ``'.flac'``).
        out_dir: Folder used to build the output paths. Defaults to the
            module-level ``result_folder``.
        prefix: Prefix of the output file names. Defaults to the module-level
            ``file_prefix``.
    """

    def __init__(self, dataset_path, ext='.wav', out_dir=None, prefix=None):
        # Fall back to the script-level settings when not given explicitly.
        if out_dir is None:
            out_dir = result_folder
        if prefix is None:
            prefix = file_prefix

        # abspath() alone does not expand '~', so expand the home dir first.
        dataset_path = os.path.abspath(os.path.expanduser(dataset_path))

        # List of (input audio path, output .npy path) pairs.
        self.filepathes = []
        for speaker_name in os.listdir(dataset_path):
            for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
                for fn in filenames:
                    # endswith() works for suffixes of any length, unlike a
                    # fixed 4-character slice.
                    if not fn.endswith(ext):
                        continue
                    wav_path = Path(os.path.join(root, fn))
                    out_path = os.path.join(out_dir, f'{prefix}{Path(fn).stem}.npy')
                    self.filepathes.append((wav_path, out_path))

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.filepathes)

    def __getitem__(self, idx):
        """Return ``(preprocessed wav, output path)`` for the file at ``idx``."""
        wav_path, out_path = self.filepathes[idx]
        wav = preprocess_wav(Path(wav_path))
        return wav, out_path
def collate_single(batch):
    """Unwrap the one-element batch produced by ``batch_size=1``.

    Defined at module level (instead of as a lambda) so that it can be pickled
    and sent to DataLoader worker processes under the "spawn" start method.
    """
    return batch[0]


# The guard keeps spawned DataLoader workers, which re-import this module,
# from re-running the extraction loop themselves.
if __name__ == '__main__':
    dataset = WavDataset(dataset_path)
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_single,
    )

    # process embedding in result
    os.makedirs(result_folder, exist_ok=True)
    for wav, out_path in tqdm(loader):
        emb = encoder.embed_utterance(wav)
        np.save(out_path, emb)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment