SolomidHero · May 18, 2021 21:53
diff --git a/to_adain_emb.py b/to_adain_emb.py
 # speaker embeddings by AdaIN-VC speaker encoder
 # ref: https://github.com/cyhuang-tw/AdaIN-VC
 #
 # 1. clone repo above
 #   $ git clone https://github.com/cyhuang-tw/AdaIN-VC
 #   $ cd AdaIN-VC
 # 2. download pretrained checkpoint (by repo authors) and unzip it
 #   $  gdown --id 1d_llv1qaCpPjioReh4AT8K_-qqG2zoIx
 #   $  unzip -qo /content/vctk_model.zip
 # 3. change the paths in this script and run it (with `python3 to_adain_emb.py`)
 #
 # Afterwards $result_folder contains files '{file_prefix}{filename}.npy', each holding a 1-D array with the embedding


 # --- user-editable settings ---
 # Root folder of the dataset; it is walked recursively for '.wav' files below.
 dataset_path = '~/data/LibriTTS/dev-clean'
 # Prepended to every output file name (note that it already ends with '-').
 file_prefix = 'LibriTTS-adain-'
 # Folder the '.npy' files are written to; created on demand further down.
 result_folder = 'adain'

 import torch
 # Use the GPU when one is available, otherwise fall back to the CPU.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

 import yaml
 import os
 from model import AE

 from preprocess import get_spectrogram
 import json

 from pathlib import Path
 import numpy as np
 from tqdm.auto import tqdm

 # Model configuration, read from the current working directory (the header
 # instructions expect the script to be run from inside the AdaIN-VC checkout).
 # FullLoader is acceptable for this local file; never use it on untrusted YAML.
 with open('config.yaml', encoding='utf-8') as f:
     config = yaml.load(f, Loader=yaml.FullLoader)

 # Settings that are later handed to get_spectrogram() for every audio file.
 with open('vocoder/config.json', encoding='utf-8') as f:
     data_cfg = json.load(f)

 # load model
 # map_location=device lets a checkpoint that was saved on a GPU be restored on
 # a CPU-only machine; without it torch.load() tries to re-create the tensors on
 # their original device and raises when CUDA is unavailable.
 model = AE(config).eval()
 model.load_state_dict(torch.load('model.ckpt', map_location=device))
 # Only the speaker encoder sub-module is needed for embedding extraction.
 spk_encoder = model.speaker_encoder.to(device)

 # process datafolder
 os.makedirs(result_folder, exist_ok=True)
 # expanduser() is required here: abspath() alone leaves '~' untouched, so the
 # default dataset_path would resolve to '<cwd>/~/data/...' and listdir() fails.
 dataset_path = os.path.abspath(os.path.expanduser(dataset_path))

 # One pass per top-level entry (speaker folder) so tqdm reports per-speaker progress.
 for speaker_name in tqdm(os.listdir(dataset_path)):
     for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
         for fn in filenames:
             if not fn.endswith('.wav'):
                 continue

             # get_spectrogram() returns a name together with the features; keep
             # it in its own variable instead of overwriting the loop variable.
             wav_name, mel = get_spectrogram(os.path.join(root, fn), data_cfg)
             with torch.no_grad():
                 # transpose, add a leading batch axis, encode, drop the batch axis
                 feats = torch.from_numpy(mel).T.unsqueeze(0).to(device)
                 emb = spk_encoder(feats).squeeze(0).cpu()
             res_fn = f'{file_prefix}{Path(wav_name).stem}.npy'
             # Alternative naming that also encodes the speaker folder:
             # res_fn = f'{file_prefix}{speaker_name}-{Path(wav_name).stem}.npy'
             np.save(os.path.join(result_folder, res_fn), emb)
diff --git a/to_resemblyzer_emb.py b/to_resemblyzer_emb.py
 # speaker embeddings by Resemblyzer
 # ref: https://github.com/resemble-ai/Resemblyzer
 #
 # 1. change the paths in this script and run it (with `python3 to_resemblyzer_emb.py`)
 #

 # --- user-editable settings ---
 # Root folder of the dataset; WavDataset walks it recursively for audio files.
 dataset_path = '~/data/LibriTTS/dev-clean'
 # Prepended to every output file name (note that it already ends with '-').
 file_prefix = 'LibriTTS-xvec-'
 # Folder the '.npy' files are written to; created before the main loop runs.
 result_folder = 'xvec'

 import torch
 # Use the GPU when one is available, otherwise fall back to the CPU.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

 from resemblyzer import VoiceEncoder, preprocess_wav

 import os
 import numpy as np
 import torch.multiprocessing as mp
 from torch.utils.data import Dataset, DataLoader
 from pathlib import Path
 from tqdm.auto import tqdm

 # load encoder model
 # VoiceEncoder (from the resemblyzer package) is built with its default
 # arguments and then moved to `device`; it is used below via embed_utterance().
 encoder = VoiceEncoder().to(device)

 # process datafolder for wavs
 class WavDataset(Dataset):
     """Audio files found below a dataset folder, paired with their output paths.

     Indexing returns ``(wav, out_path)``: ``wav`` is the result of
     ``preprocess_wav`` for the file, ``out_path`` is
     ``{result_folder}/{file_prefix}{stem}.npy``. ``result_folder`` and
     ``file_prefix`` are read from the module-level settings.
     """

     def __init__(self, dataset_path, ext='.wav'):
         """Collect every file whose name ends in ``ext``.

         Args:
             dataset_path: Root folder. A leading ``~`` is expanded. Each entry
                 of the root is walked recursively; files lying directly in the
                 root are not picked up.
             ext: File name suffix to keep (any length).
         """
         # abspath() does not expand '~', so expanduser() has to run first.
         dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
         self.filepathes = []

         for speaker_name in os.listdir(dataset_path):
             for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
                 for fn in filenames:
                     # endswith() works for any suffix length; the previous
                     # fn[-4:] comparison only matched 4-character extensions.
                     if not fn.endswith(ext):
                         continue
                     wav_path = Path(root) / fn
                     out_path = os.path.join(result_folder, f'{file_prefix}{Path(fn).stem}.npy')
                     self.filepathes.append((wav_path, out_path))

     def __len__(self):
         """Return the number of collected audio files."""
         return len(self.filepathes)

     def __getitem__(self, idx):
         """Load and preprocess the ``idx``-th file; return ``(wav, out_path)``."""
         wav_path, out_path = self.filepathes[idx]
         # wav_path is already a Path, no need to wrap it again
         wav = preprocess_wav(wav_path)
         return wav, out_path

 def _take_single(batch):
     """Collate function for batch_size=1: return the only sample unchanged."""
     return batch[0]


 # The guard keeps DataLoader worker processes from re-running the extraction
 # when the interpreter starts them with the 'spawn' method (Windows, macOS).
 if __name__ == '__main__':
     dataset = WavDataset(dataset_path)
     # collate_fn is a named module-level function rather than a lambda so that
     # it can be pickled and sent to the worker processes.
     loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=4, collate_fn=_take_single)

     # process embedding in result
     os.makedirs(result_folder, exist_ok=True)
     for wav, out_path in tqdm(loader):
         # workers only load/preprocess audio; the encoder runs in this process
         emb = encoder.embed_utterance(wav)
         np.save(out_path, emb)
	# speaker embeddings by AdaIN-VC speaker encoder
	# ref: https://github.com/cyhuang-tw/AdaIN-VC
	#
	# 1. clone repo above
	# $ git clone https://github.com/cyhuang-tw/AdaIN-VC
	# $ cd AdaIN-VC
	# 2. download pretrained checkpoint (by repo authors) and unzip it
	# $ gdown --id 1d_llv1qaCpPjioReh4AT8K_-qqG2zoIx
	# $ unzip -qo /content/vctk_model.zip
	# 3. change the paths in this script and run it (with `python3 to_adain_emb.py`)
	#
	# Afterwards $result_folder contains files '{file_prefix}{filename}.npy', each holding a 1-D array with the embedding


	# --- user-editable settings ---
	# Root folder of the dataset; it is walked for '.wav' files below.
	dataset_path = '~/data/LibriTTS/dev-clean'
	# Prepended to every output file name (note that it already ends with '-').
	file_prefix = 'LibriTTS-adain-'
	# Folder the '.npy' files are written to; created on demand further down.
	result_folder = 'adain'

	import torch
	# Use the GPU when one is available, otherwise fall back to the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	import yaml
	import os
	from model import AE

	from preprocess import get_spectrogram
	import json

	from pathlib import Path
	import numpy as np
	from tqdm.auto import tqdm

	# Model configuration, read from the current working directory (the header
	# instructions expect the script to be run from inside the AdaIN-VC checkout).
	# FullLoader is acceptable for this local file; never use it on untrusted YAML.
	with open('config.yaml', encoding='utf-8') as f:
	    config = yaml.load(f, Loader=yaml.FullLoader)

	# Settings that are later handed to get_spectrogram() for every audio file.
	with open('vocoder/config.json', encoding='utf-8') as f:
	    data_cfg = json.load(f)

	# load model
	# map_location=device lets a checkpoint that was saved on a GPU be restored on
	# a CPU-only machine; without it torch.load() tries to re-create the tensors on
	# their original device and raises when CUDA is unavailable.
	model = AE(config).eval()
	model.load_state_dict(torch.load('model.ckpt', map_location=device))
	# Only the speaker encoder sub-module is needed for embedding extraction.
	spk_encoder = model.speaker_encoder.to(device)

	# process datafolder
	os.makedirs(result_folder, exist_ok=True)
	# expanduser() is required here: abspath() alone leaves '~' untouched, so the
	# default dataset_path would resolve to '<cwd>/~/data/...' and listdir() fails.
	dataset_path = os.path.abspath(os.path.expanduser(dataset_path))

	# One pass per top-level entry (speaker folder) so tqdm reports per-speaker progress.
	for speaker_name in tqdm(os.listdir(dataset_path)):
	    for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
	        for fn in filenames:
	            if not fn.endswith('.wav'):
	                continue

	            # get_spectrogram() returns a name together with the features; keep
	            # it in its own variable instead of overwriting the loop variable.
	            wav_name, mel = get_spectrogram(os.path.join(root, fn), data_cfg)
	            with torch.no_grad():
	                # transpose, add a leading batch axis, encode, drop the batch axis
	                feats = torch.from_numpy(mel).T.unsqueeze(0).to(device)
	                emb = spk_encoder(feats).squeeze(0).cpu()
	            res_fn = f'{file_prefix}{Path(wav_name).stem}.npy'
	            # Alternative naming that also encodes the speaker folder:
	            # res_fn = f'{file_prefix}{speaker_name}-{Path(wav_name).stem}.npy'
	            np.save(os.path.join(result_folder, res_fn), emb)
	# speaker embeddings by Resemblyzer
	# ref: https://github.com/resemble-ai/Resemblyzer
	#
	# 1. change the paths in this script and run it (with `python3 to_resemblyzer_emb.py`)
	#

	# --- user-editable settings ---
	# Root folder of the dataset; WavDataset walks it for audio files.
	dataset_path = '~/data/LibriTTS/dev-clean'
	# Prepended to every output file name (note that it already ends with '-').
	file_prefix = 'LibriTTS-xvec-'
	# Folder the '.npy' files are written to; created before the main loop runs.
	result_folder = 'xvec'

	import torch
	# Use the GPU when one is available, otherwise fall back to the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	from resemblyzer import VoiceEncoder, preprocess_wav

	import os
	import numpy as np
	import torch.multiprocessing as mp
	from torch.utils.data import Dataset, DataLoader
	from pathlib import Path
	from tqdm.auto import tqdm

	# load encoder model
	# VoiceEncoder (from the resemblyzer package) is built with its default
	# arguments and then moved to `device`; it is used below via embed_utterance().
	encoder = VoiceEncoder().to(device)

	# process datafolder for wavs
	class WavDataset(Dataset):
	    """Audio files found below a dataset folder, paired with their output paths.

	    Indexing returns ``(wav, out_path)``: ``wav`` is the result of
	    ``preprocess_wav`` for the file, ``out_path`` is
	    ``{result_folder}/{file_prefix}{stem}.npy``. ``result_folder`` and
	    ``file_prefix`` are read from the module-level settings.
	    """

	    def __init__(self, dataset_path, ext='.wav'):
	        """Collect every file whose name ends in ``ext``.

	        Args:
	            dataset_path: Root folder. A leading ``~`` is expanded. Each entry
	                of the root is walked recursively; files lying directly in the
	                root are not picked up.
	            ext: File name suffix to keep (any length).
	        """
	        # abspath() does not expand '~', so expanduser() has to run first.
	        dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
	        self.filepathes = []

	        for speaker_name in os.listdir(dataset_path):
	            for root, _, filenames in os.walk(os.path.join(dataset_path, speaker_name)):
	                for fn in filenames:
	                    # endswith() works for any suffix length; the previous
	                    # fn[-4:] comparison only matched 4-character extensions.
	                    if not fn.endswith(ext):
	                        continue
	                    wav_path = Path(root) / fn
	                    out_path = os.path.join(result_folder, f'{file_prefix}{Path(fn).stem}.npy')
	                    self.filepathes.append((wav_path, out_path))

	    def __len__(self):
	        """Return the number of collected audio files."""
	        return len(self.filepathes)

	    def __getitem__(self, idx):
	        """Load and preprocess the ``idx``-th file; return ``(wav, out_path)``."""
	        wav_path, out_path = self.filepathes[idx]
	        # wav_path is already a Path, no need to wrap it again
	        wav = preprocess_wav(wav_path)
	        return wav, out_path

	def _take_single(batch):
	    """Collate function for batch_size=1: return the only sample unchanged."""
	    return batch[0]


	# The guard keeps DataLoader worker processes from re-running the extraction
	# when the interpreter starts them with the 'spawn' method (Windows, macOS).
	if __name__ == '__main__':
	    dataset = WavDataset(dataset_path)
	    # collate_fn is a named module-level function rather than a lambda so that
	    # it can be pickled and sent to the worker processes.
	    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=4, collate_fn=_take_single)

	    # process embedding in result
	    os.makedirs(result_folder, exist_ok=True)
	    for wav, out_path in tqdm(loader):
	        # workers only load/preprocess audio; the encoder runs in this process
	        emb = encoder.embed_utterance(wav)
	        np.save(out_path, emb)